diff options
-rw-r--r-- | benchmark.py | 99 | ||||
-rwxr-xr-x | camera.py | 34 | ||||
-rw-r--r-- | event.py | 25 | ||||
-rw-r--r-- | gpu.py | 84 |
4 files changed, 178 insertions, 64 deletions
diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000..0f72811 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,99 @@ +import numpy as np +from pycuda import gpuarray as ga +import time +from uncertainties import ufloat +import sys + +from chroma.gpu import GPU, to_float3 +from chroma.camera import get_rays +from chroma.event import Photons +from chroma.sample import uniform_sphere + +def progress(seq): + "Print progress while iterating over `seq`." + n = len(seq) + print '[' + ' '*21 + ']\r[', + sys.stdout.flush() + for i, item in enumerate(seq): + if i % (n//10) == 0: + print '.', + sys.stdout.flush() + yield item + print ']' + sys.stdout.flush() + +def ray_trace(gpu, number=1000): + """ + Return the number of mean and standard deviation of the number of ray + intersections per second as a ufloat for the geometry loaded onto `gpu`. + + .. note:: + The rays are thrown from a camera sitting *outside* of the geometry. + + Args: + - gpu, chroma.gpu.GPU + The GPU object with a geometry already loaded. + - number, int + The number of kernel calls to average. + """ + lb, ub = gpu.geometry.mesh.get_bounds() + scale = np.linalg.norm(ub-lb) + point = [0,scale,(lb[2]+ub[2])/2] + + size = (800,600) + width, height = size + + origins, directions = get_rays(point, size, 0.035, focal_length=0.018) + + origins_gpu = ga.to_gpu(to_float3(origins)) + directions_gpu = ga.to_gpu(to_float3(directions)) + pixels_gpu = ga.zeros(width*height, dtype=np.int32) + + run_times = [] + for i in progress(range(number)): + t0 = time.time() + gpu.kernels.ray_trace(np.int32(pixels_gpu.size), origins_gpu, directions_gpu, pixels_gpu, block=(gpu.nthreads_per_block,1,1), grid=(pixels_gpu.size//gpu.nthreads_per_block+1,1)) + gpu.context.synchronize() + elapsed = time.time() - t0 + run_times.append(elapsed) + + return pixels_gpu.size/ufloat((np.mean(run_times),np.std(run_times))) + +def propagate(gpu, number=10, nphotons=500000): + """ + Return the mean and standard deviation of the number of photons propagated + per second as a ufloat for the geometry loaded onto `gpu`. + + Args: + - gpu, chroma.gpu.GPU + The GPU object with a geometry already loaded. + - number, int + The number of kernel calls to average. + - nphotons, int + The number of photons to propagate per kernel call. + """ + gpu.setup_propagate() + + run_times = [] + for i in progress(range(number)): + photons = Photons(np.zeros((nphotons,3)), uniform_sphere(nphotons), np.random.uniform(400,800,size=nphotons)) + gpu.load_photons(photons) + t0 = time.time() + gpu.propagate() + gpu.context.synchronize() + elapsed = time.time() - t0 + run_times.append(elapsed) + + return nphotons/ufloat((np.mean(run_times),np.std(run_times))) + +if __name__ == '__main__': + from chroma.detectors import build_lbne_200kton, build_minilbne + + lbne = build_lbne_200kton() + lbne.build(bits=11) + + gpu = GPU() + gpu.load_geometry(lbne, print_usage=False) + + print '%s track steps/s' % ray_trace(gpu) + print '%s photons/s' % propagate(gpu) @@ -136,8 +136,6 @@ class Camera(Thread): self.gpu = GPU(self.device_id) self.gpu.load_geometry(geometry) - self.kernels = CUDAFuncs(self.gpu.module, ['ray_trace', 'ray_trace_alpha', 'rotate', 'rotate_around_point', 'translate', 'update_xyz_lookup', 'update_xyz_image', 'process_image', 'init_rng']) - self.width, self.height = size pygame.init() @@ -180,7 +178,7 @@ class Camera(Thread): @timeit def initialize_render(self): self.rng_states_gpu = cuda.mem_alloc(self.width*self.height*sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) - self.kernels.init_rng(np.int32(self.width*self.height), self.rng_states_gpu, np.int32(0), np.int32(0), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) + self.gpu.kernels.init_rng(np.int32(self.width*self.height), self.rng_states_gpu, np.int32(0), np.int32(0), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) self.xyz_lookup1_gpu = gpuarray.zeros(len(self.geometry.mesh.triangles), dtype=gpuarray.vec.float3) self.xyz_lookup2_gpu = gpuarray.zeros(len(self.geometry.mesh.triangles), dtype=gpuarray.vec.float3) self.image_gpu = gpuarray.zeros(self.width*self.height, dtype=gpuarray.vec.float3) @@ -200,13 +198,13 @@ class Camera(Thread): def update_xyz_lookup(self, source_position): for i in range(self.xyz_lookup1_gpu.size//(self.width*self.height)+1): - self.kernels.update_xyz_lookup(np.int32(self.width*self.height), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.width*self.height), gpuarray.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(685.0), gpuarray.vec.make_float3(1.0,0.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) + self.gpu.kernels.update_xyz_lookup(np.int32(self.width*self.height), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.width*self.height), gpuarray.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(685.0), gpuarray.vec.make_float3(1.0,0.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) for i in range(self.xyz_lookup1_gpu.size//(self.width*self.height)+1): - self.kernels.update_xyz_lookup(np.int32(self.width*self.height), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.width*self.height), gpuarray.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(545.0), gpuarray.vec.make_float3(0.0,1.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) + self.gpu.kernels.update_xyz_lookup(np.int32(self.width*self.height), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.width*self.height), gpuarray.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(545.0), gpuarray.vec.make_float3(0.0,1.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) for i in range(self.xyz_lookup1_gpu.size//(self.width*self.height)+1): - self.kernels.update_xyz_lookup(np.int32(self.width*self.height), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.width*self.height), gpuarray.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(445.0), gpuarray.vec.make_float3(0.0,0.0,1.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) + self.gpu.kernels.update_xyz_lookup(np.int32(self.width*self.height), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.width*self.height), gpuarray.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(445.0), gpuarray.vec.make_float3(0.0,0.0,1.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) self.nlookup_calls += 1 @@ -216,16 +214,16 @@ class Camera(Thread): self.nimages = 0 def update_image(self): - self.kernels.update_xyz_image(np.int32(self.width*self.height), self.rng_states_gpu, self.origins_gpu, self.directions_gpu, np.float32(685.0), gpuarray.vec.make_float3(1.0,0.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, self.image_gpu, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) + self.gpu.kernels.update_xyz_image(np.int32(self.width*self.height), self.rng_states_gpu, self.origins_gpu, self.directions_gpu, np.float32(685.0), gpuarray.vec.make_float3(1.0,0.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, self.image_gpu, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) - self.kernels.update_xyz_image(np.int32(self.width*self.height), self.rng_states_gpu, self.origins_gpu, self.directions_gpu, np.float32(545.0), gpuarray.vec.make_float3(0.0,1.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, self.image_gpu, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) + self.gpu.kernels.update_xyz_image(np.int32(self.width*self.height), self.rng_states_gpu, self.origins_gpu, self.directions_gpu, np.float32(545.0), gpuarray.vec.make_float3(0.0,1.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, self.image_gpu, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) - self.kernels.update_xyz_image(np.int32(self.width*self.height), self.rng_states_gpu, self.origins_gpu, self.directions_gpu, np.float32(445.0), gpuarray.vec.make_float3(0.0,0.0,1.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, self.image_gpu, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) + self.gpu.kernels.update_xyz_image(np.int32(self.width*self.height), self.rng_states_gpu, self.origins_gpu, self.directions_gpu, np.float32(445.0), gpuarray.vec.make_float3(0.0,0.0,1.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, self.image_gpu, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1)) self.nimages += 1 def process_image(self): - self.kernels.process_image(np.int32(self.width*self.height), self.image_gpu, self.pixels_gpu, np.int32(self.nimages), block=(self.nblocks,1,1), grid=((self.width*self.height)//self.nblocks+1,1)) + self.gpu.kernels.process_image(np.int32(self.width*self.height), self.image_gpu, self.pixels_gpu, np.int32(self.nimages), block=(self.nblocks,1,1), grid=((self.width*self.height)//self.nblocks+1,1)) def screenshot(self, dir='', start=0): root, ext = 'screenshot', 'png' @@ -240,8 +238,8 @@ class Camera(Thread): print 'image saved to %s' % filename def rotate(self, phi, n): - self.kernels.rotate(np.int32(self.pixels_gpu.size), self.origins_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1)) - self.kernels.rotate(np.int32(self.pixels_gpu.size), self.directions_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1)) + self.gpu.kernels.rotate(np.int32(self.pixels_gpu.size), self.origins_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1)) + self.gpu.kernels.rotate(np.int32(self.pixels_gpu.size), self.directions_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1)) self.point = rotate(self.point, phi, n) self.axis1 = rotate(self.axis1, phi, n) @@ -253,8 +251,8 @@ class Camera(Thread): self.update() def rotate_around_point(self, phi, n, point, redraw=True): - self.kernels.rotate_around_point(np.int32(self.origins_gpu.size), self.origins_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), gpuarray.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.origins_gpu.size//self.nblocks+1,1)) - self.kernels.rotate(np.int32(self.directions_gpu.size), self.directions_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.directions_gpu.size//self.nblocks+1,1)) + self.gpu.kernels.rotate_around_point(np.int32(self.origins_gpu.size), self.origins_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), gpuarray.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.origins_gpu.size//self.nblocks+1,1)) + self.gpu.kernels.rotate(np.int32(self.directions_gpu.size), self.directions_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.directions_gpu.size//self.nblocks+1,1)) self.axis1 = rotate(self.axis1, phi, n) self.axis2 = rotate(self.axis2, phi, n) @@ -266,7 +264,7 @@ class Camera(Thread): self.update() def translate(self, v, redraw=True): - self.kernels.translate(np.int32(self.pixels_gpu.size), self.origins_gpu, gpuarray.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks,1)) + self.gpu.kernels.translate(np.int32(self.pixels_gpu.size), self.origins_gpu, gpuarray.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks,1)) self.point += v @@ -284,9 +282,9 @@ class Camera(Thread): self.process_image() else: if self.alpha: - self.kernels.ray_trace_alpha(np.int32(self.pixels_gpu.size), self.origins_gpu, self.directions_gpu, self.pixels_gpu, block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1)) + self.gpu.kernels.ray_trace_alpha(np.int32(self.pixels_gpu.size), self.origins_gpu, self.directions_gpu, self.pixels_gpu, block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1)) else: - self.kernels.ray_trace(np.int32(self.pixels_gpu.size), self.origins_gpu, self.directions_gpu, self.pixels_gpu, block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1)) + self.gpu.kernels.ray_trace(np.int32(self.pixels_gpu.size), self.origins_gpu, self.directions_gpu, self.pixels_gpu, block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1)) pygame.surfarray.blit_array(self.screen, self.pixels_gpu.get().reshape(self.size)) if self.doom_mode: @@ -624,4 +622,6 @@ if __name__ == '__main__': camera = EventViewer(geometry, options.io_file, size=size) else: camera = Camera(geometry, size) + camera.start() + camera.join() @@ -1,13 +1,30 @@ import numpy as np +from chroma.sample import uniform_sphere + class Photons(object): - def __init__(self, positions, directions, polarizations, times, wavelengths, - last_hit_triangles=None, histories=None): + def __init__(self, positions, directions, wavelengths, polarizations=None, times=None, last_hit_triangles=None, histories=None): self.positions = positions + + nphotons = len(positions) + + assert len(directions) == nphotons self.directions = directions - self.polarizations = polarizations - self.times = times + + assert len(wavelengths) == nphotons self.wavelengths = wavelengths + + if polarizations is not None: + self.polarizations = polarizations + else: + # polarizations are isotropic in plane perpendicular + # to the direction vectors + polarizations = np.cross(directions, uniform_sphere(nphotons)) + # normalize polarization vectors + polarizations /= np.tile(np.apply_along_axis(np.linalg.norm,1,polarizations),[3,1]).transpose() + self.polarizations = np.asarray(polarizations, order='C') + + self.times = times self.last_hit_triangles = last_hit_triangles self.histories = histories @@ -18,6 +18,9 @@ import event cuda.init() +def to_float3(arr): + return arr.astype(np.float32).view(gpuarray.vec.float3) + def boolean_argsort(condition): '''Returns two arrays of indicies indicating which elements of the boolean array `condition` should be exchanged so that all of the @@ -84,28 +87,31 @@ class CUDAFuncs(object): setattr(self, name, cuda_module.get_function(name)) class GPU(object): - def __init__(self, device_id=None): + def __init__(self, device_id=None, nthreads_per_block=64, max_blocks=1024): '''Initialize a GPU context on the specified device. If device_id==None, the default device is used.''' if device_id is None: self.context = make_default_context() else: - print 'device_id = %s' % device_id device = cuda.Device(device_id) self.context = device.make_context() + print 'device %s' % self.context.get_device().name() + self.nthreads_per_block = nthreads_per_block + self.max_blocks = max_blocks + self.context.set_cache_config(cuda.func_cache.PREFER_L1) cuda_options = ['-I' + dirname(chroma.src.__file__), '--use_fast_math', '--ptxas-options=-v'] self.module = SourceModule(chroma.src.kernel, options=cuda_options, no_extern_c=True) + self.kernels = CUDAFuncs(self.module, ['ray_trace', 'ray_trace_alpha', 'rotate', 'rotate_around_point', 'translate', 'update_xyz_lookup', 'update_xyz_image', 'process_image', 'init_rng']) + self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids']) self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate', 'swap']) - self.nthread_per_block = 64 - self.max_blocks = 1024 self.daq_module = SourceModule(chroma.src.daq, options=cuda_options, no_extern_c=True) self.daq_funcs = CUDAFuncs(self.daq_module, ['reset_earliest_time_int', 'run_daq', @@ -121,7 +127,7 @@ class GPU(object): print format_array('node_map_end', self.node_map_end_gpu) print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes)) - def load_geometry(self, geometry): + def load_geometry(self, geometry, print_usage=True): if not hasattr(geometry, 'mesh'): print 'WARNING: building geometry with 8-bits' geometry.build(bits=8) @@ -159,7 +165,7 @@ class GPU(object): self.surfaces.append(surface) - self.vertices_gpu = gpuarray.to_gpu(geometry.mesh.vertices.astype(np.float32).view(gpuarray.vec.float3)) + self.vertices_gpu = gpuarray.to_gpu(to_float3(geometry.mesh.vertices)) triangles = \ np.empty(len(geometry.mesh.triangles), dtype=gpuarray.vec.uint4) @@ -169,17 +175,9 @@ class GPU(object): triangles['w'] = ((geometry.material1_index & 0xff) << 24) | ((geometry.material2_index & 0xff) << 16) | ((geometry.surface_index & 0xff) << 8) self.triangles_gpu = gpuarray.to_gpu(triangles) - lower_bounds_float3 = np.empty(geometry.lower_bounds.shape[0], dtype=gpuarray.vec.float3) - lower_bounds_float3['x'] = geometry.lower_bounds[:,0] - lower_bounds_float3['y'] = geometry.lower_bounds[:,1] - lower_bounds_float3['z'] = geometry.lower_bounds[:,2] - self.lower_bounds_gpu = gpuarray.to_gpu(lower_bounds_float3) + self.lower_bounds_gpu = gpuarray.to_gpu(to_float3(geometry.lower_bounds)) - upper_bounds_float3 = np.empty(geometry.upper_bounds.shape[0], dtype=gpuarray.vec.float3) - upper_bounds_float3['x'] = geometry.upper_bounds[:,0] - upper_bounds_float3['y'] = geometry.upper_bounds[:,1] - upper_bounds_float3['z'] = geometry.upper_bounds[:,2] - self.upper_bounds_gpu = gpuarray.to_gpu(upper_bounds_float3) + self.upper_bounds_gpu = gpuarray.to_gpu(to_float3(geometry.upper_bounds)) self.colors_gpu = gpuarray.to_gpu(geometry.colors.astype(np.uint32)) self.node_map_gpu = gpuarray.to_gpu(geometry.node_map.astype(np.uint32)) @@ -199,14 +197,11 @@ class GPU(object): self.geometry = geometry - print 'average of %f child nodes per node' % (np.mean(geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:])) - - print '%i nodes with one child' % (np.count_nonzero((geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]) == 1)) - - print '%i leaf nodes with one child' % (np.count_nonzero((geometry.node_map_end[:geometry.first_node] - geometry.node_map[:geometry.first_node]) == 1)) - - - self.print_device_usage() + if print_usage: + print 'average of %f child nodes per node' % (np.mean(geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:])) + print '%i nodes with one child' % (np.count_nonzero((geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]) == 1)) + print '%i leaf nodes with one child' % (np.count_nonzero((geometry.node_map_end[:geometry.first_node] - geometry.node_map[:geometry.first_node]) == 1)) + self.print_device_usage() def reset_colors(self): self.colors_gpu.set_async(self.geometry.colors.astype(np.uint32)) @@ -215,22 +210,22 @@ class GPU(object): solid_hit_gpu = gpuarray.to_gpu(np.array(solid_hit, dtype=np.bool)) solid_colors_gpu = gpuarray.to_gpu(np.array(colors, dtype=np.uint32)) - for first_triangle, triangles_this_round, blocks in chunk_iterator(self.triangles_gpu.size, self.nthread_per_block, self.max_blocks): + for first_triangle, triangles_this_round, blocks in chunk_iterator(self.triangles_gpu.size, self.nthreads_per_block, self.max_blocks): self.geo_funcs.color_solids(np.int32(first_triangle), np.int32(triangles_this_round), self.solid_id_map_gpu, solid_hit_gpu, solid_colors_gpu, - block=(self.nthread_per_block,1,1), + block=(self.nthreads_per_block,1,1), grid=(blocks,1)) self.context.synchronize() def setup_propagate(self, seed=1): - self.rng_states_gpu = cuda.mem_alloc(self.nthread_per_block * self.max_blocks + self.rng_states_gpu = cuda.mem_alloc(self.nthreads_per_block * self.max_blocks * sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) - self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthread_per_block), self.rng_states_gpu, + self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthreads_per_block), self.rng_states_gpu, np.uint64(seed), np.uint64(0), - block=(self.nthread_per_block,1,1), + block=(self.nthreads_per_block,1,1), grid=(self.max_blocks,1)) #self.context.synchronize() @@ -244,13 +239,16 @@ class GPU(object): assert len(photons.directions) == self.nphotons assert len(photons.polarizations) == self.nphotons assert len(photons.wavelengths) == self.nphotons - assert len(photons.times) == self.nphotons - self.positions_gpu = gpuarray.to_gpu(photons.positions.astype(np.float32).view(gpuarray.vec.float3)) - self.directions_gpu = gpuarray.to_gpu(photons.directions.astype(np.float32).view(gpuarray.vec.float3)) - self.polarizations_gpu = gpuarray.to_gpu(photons.polarizations.astype(np.float32).view(gpuarray.vec.float3)) + self.positions_gpu = gpuarray.to_gpu(to_float3(photons.positions))#.astype(np.float32).view(gpuarray.vec.float3)) + self.directions_gpu = gpuarray.to_gpu(to_float3(photons.directions))#.astype(np.float32).view(gpuarray.vec.float3)) + self.polarizations_gpu = gpuarray.to_gpu(to_float3(photons.polarizations))#.astype(np.float32).view(gpuarray.vec.float3)) self.wavelengths_gpu = gpuarray.to_gpu(photons.wavelengths.astype(np.float32)) - self.times_gpu = gpuarray.to_gpu(photons.times.astype(np.float32)) + + if photons.times is not None: + self.times_gpu = gpuarray.to_gpu(photons.times.astype(np.float32)) + else: + self.times_gpu = gpuarray.zeros(self.nphotons, dtype=np.float32) if photons.histories is not None: self.histories_gpu = gpuarray.to_gpu(photons.histories.astype(np.uint32)) @@ -282,12 +280,12 @@ class GPU(object): while step < max_steps: ## Just finish the rest of the steps if the # of photons is low - if nphotons < self.nthread_per_block * 16 * 8: + if nphotons < self.nthreads_per_block * 16 * 8: nsteps = max_steps - step else: nsteps = 1 - for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthread_per_block, self.max_blocks): + for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthreads_per_block, self.max_blocks): self.prop_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], @@ -298,7 +296,7 @@ class GPU(object): self.times_gpu, self.histories_gpu, self.last_hit_triangles_gpu, np.int32(nsteps), - block=(self.nthread_per_block,1,1), + block=(self.nthreads_per_block,1,1), grid=(blocks, 1)) step += nsteps @@ -341,13 +339,13 @@ class GPU(object): self.daq_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, - block=(self.nthread_per_block,1,1), - grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1)) + block=(self.nthreads_per_block,1,1), + grid=(len(self.earliest_time_int_gpu)//self.nthreads_per_block+1,1)) self.channel_q_gpu.fill(0) self.channel_history_gpu.fill(0) #self.context.synchronize() - for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks): + for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthreads_per_block, self.max_blocks): self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2), np.float32(self.daq_pmt_rms), np.int32(first_photon), @@ -359,13 +357,13 @@ class GPU(object): self.earliest_time_int_gpu, self.channel_q_gpu, self.channel_history_gpu, - block=(self.nthread_per_block,1,1), grid=(blocks,1)) + block=(self.nthreads_per_block,1,1), grid=(blocks,1)) #self.context.synchronize() self.daq_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, - block=(self.nthread_per_block,1,1), - grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1)) + block=(self.nthreads_per_block,1,1), + grid=(len(self.earliest_time_int_gpu)//self.nthreads_per_block+1,1)) if 'profile' in __builtins__: self.context.synchronize() |