diff options
Diffstat (limited to 'gpu.py')
-rw-r--r-- | gpu.py | 655 |
1 files changed, 338 insertions, 317 deletions
@@ -1,67 +1,215 @@ import numpy as np import numpy.ma as ma -from pycuda.tools import make_default_context -from pycuda.compiler import SourceModule -from pycuda.characterize import sizeof -import pycuda.driver as cuda -from pycuda import gpuarray from copy import copy from itertools import izip -from chroma.tools import timeit from os.path import dirname +import sys + +import pytools +import pycuda.tools +from pycuda.compiler import SourceModule +import pycuda.characterize +import pycuda.driver as cuda +from pycuda import gpuarray as ga import chroma.src +from chroma.tools import timeit from chroma.geometry import standard_wavelengths from chroma.color import map_to_color -import sys -import event +from chroma import event cuda.init() +#@pycuda.tools.context_dependent_memoize +def get_cu_module(name, options=None, include_source_directory=True): + """Returns a pycuda.compiler.SourceModule object from a CUDA source file + located in the chroma src directory at src/[name].cu.""" + if options is None: + options = [] + + srcdir = dirname(chroma.src.__file__) + + if include_source_directory: + options += ['-I' + srcdir] + + with open('%s/%s.cu' % (srcdir, name)) as f: + source = f.read() + + return pycuda.compiler.SourceModule(source, options=options, + no_extern_c=True) + +class GPUFuncs(object): + """Simple container class for GPU functions as attributes.""" + def __init__(self, module): + self.module = module + self.funcs = {} + + def __getattr__(self, name): + try: + return self.funcs[name] + except KeyError: + f = self.module.get_function(name) + self.funcs[name] = f + return f + +init_rng_src = """ +#include <curand_kernel.h> + +extern "C" +{ + +__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x; + + if (id >= nthreads) + return; + + curand_init(seed, id, offset, &s[id]); +} + +} // extern "C" +""" + +def get_rng_states(size, seed=1): + "Return `size` number of CUDA random number generator states." + rng_states = cuda.mem_alloc(size*pycuda.characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) + + module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True) + init_rng = module.get_function('init_rng') + + init_rng(np.int32(size), rng_states, np.uint64(seed), np.uint64(0), block=(64,1,1), grid=(size//64+1,1)) + + return rng_states + def to_float3(arr): - return arr.astype(np.float32).view(gpuarray.vec.float3)[:,0] - -def boolean_argsort(condition): - '''Returns two arrays of indicies indicating which elements of the - boolean array `condition` should be exchanged so that all of the - True entries occur before the false entries. - - Returns: tuple (a,b, ntrue) which give the indices for elements - to exchange in a and b, and the number of true entries in the array - in ntrue. This function in general requires fewer swaps than - numpy.argsort. - ''' - - true_indices = condition.nonzero()[0][::-1] # reverse - false_indices = (~condition).nonzero()[0] - - length = min(len(true_indices), len(false_indices)) - - cut_index = np.searchsorted((true_indices[:length] - false_indices[:length]) <= 0, True) - return (true_indices[:cut_index], false_indices[:cut_index], len(true_indices)) - - -def chunk_iterator(nelements, nthreads_per_block, max_blocks): - '''Iterator that yields tuples with the values requried to process + "Returns an pycuda.gpuarray.vec.float3 array from an (N,3) array." + if not arr.flags['C_CONTIGUOUS']: + arr = np.asarray(arr, order='c') + return arr.astype(np.float32).view(ga.vec.float3)[:,0] + +def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024): + """Iterator that yields tuples with the values requried to process a long array in multiple kernel passes on the GPU. - Each yielded value is (first_index, elements_this_iteration, nblocks_this_iteration) + Each yielded value is of the form, + (first_index, elements_this_iteration, nblocks_this_iteration) - >>> list(chunk_iterator(300, 32, 2)) - [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)] - ''' + Example: + >>> list(chunk_iterator(300, 32, 2)) + [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)] + """ first = 0 while first < nelements: elements_left = nelements - first blocks = int(elements_left // nthreads_per_block) if elements_left % nthreads_per_block != 0: - blocks += 1 #Round up only if needed + blocks += 1 # Round up only if needed blocks = min(max_blocks, blocks) elements_this_round = min(elements_left, blocks * nthreads_per_block) yield (first, elements_this_round, blocks) first += elements_this_round - + +class GPUPhotons(object): + def __init__(self, photons): + self.pos = ga.to_gpu(to_float3(photons.pos)) + self.dir = ga.to_gpu(to_float3(photons.dir)) + self.pol = ga.to_gpu(to_float3(photons.pol)) + self.wavelengths = ga.to_gpu(photons.wavelengths.astype(np.float32)) + self.t = ga.to_gpu(photons.t.astype(np.float32)) + self.last_hit_triangles = ga.to_gpu(photons.last_hit_triangles.astype(np.int32)) + self.flags = ga.to_gpu(photons.flags.astype(np.uint32)) + + def get(self): + pos = self.pos.get().view(np.float32).reshape((len(self.pos),3)) + dir = self.dir.get().view(np.float32).reshape((len(self.dir),3)) + pol = self.pol.get().view(np.float32).reshape((len(self.pol),3)) + wavelengths = self.wavelengths.get() + t = self.t.get() + last_hit_triangles = self.last_hit_triangles.get() + flags = self.flags.get() + return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags) + +class GPUChannels(object): + def __init__(self, t, q, flags): + self.t = t + self.q = q + self.flags = flags + + def get(self): + t = self.t.get() + q = self.q.get().astype(np.float32) + + # For now, assume all channels with small + # enough hit time were hit. + return event.Channels(t<1e8, t, q, self.flags.get()) + +def propagate(gpu, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, max_steps=10): + """Propagate photons on GPU to termination or max_steps, whichever + comes first. + + May be called repeatedly without reloading photon information if + single-stepping through photon history. + + ..warning:: + `rng_states` must have at least `nthreads_per_block`*`max_blocks` + number of curandStates. + """ + nphotons = gpuphotons.pos.size + step = 0 + input_queue = np.zeros(shape=nphotons+1, dtype=np.uint32) + input_queue[1:] = np.arange(nphotons, dtype=np.uint32) + input_queue_gpu = ga.to_gpu(input_queue) + output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32) + output_queue[0] = 1 + output_queue_gpu = ga.to_gpu(output_queue) + + propagate = gpu.module.get_function('propagate') + + while step < max_steps: + # Just finish the rest of the steps if the # of photons is low + if nphotons < nthreads_per_block * 16 * 8: + nsteps = max_steps - step + else: + nsteps = 1 + + for first_photon, photons_this_round, blocks in \ + chunk_iterator(nphotons, nthreads_per_block, max_blocks): + propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, gpuphotons.pos, gpuphotons.dir, gpuphotons.wavelengths, gpuphotons.pol, gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, np.int32(nsteps), block=(nthreads_per_block,1,1), grid=(blocks, 1)) + + step += nsteps + + if step < max_steps: + temp = input_queue_gpu + input_queue_gpu = output_queue_gpu + output_queue_gpu = temp + output_queue_gpu[:1].set(np.uint32(1)) + nphotons = input_queue_gpu[:1].get()[0] - 1 + + if ga.max(gpuphotons.flags).get() & (1 << 31): + print >>sys.stderr, "WARNING: ABORTED PHOTONS" + +class GPURays(object): + def __init__(self, pos, dir, nblocks=64): + self.pos = ga.to_gpu(to_float3(pos)) + self.dir = ga.to_gpu(to_float3(dir)) + + self.nblocks = nblocks + + self.module = get_cu_module('transform') + self.gpu_funcs = GPUFuncs(self.module) + + def rotate(self, phi, n): + self.gpu_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) + + def rotate_around_point(self, phi, n, point): + self.gpu_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) + + def translate(self, v): + self.gpu_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) def format_size(size): if size < 1e3: @@ -77,317 +225,165 @@ def format_array(name, array): return '%-15s %6s %6s' % \ (name, format_size(len(array)), format_size(array.nbytes)) -class CUDAFuncs(object): - '''simple container class for GPU functions as attributes''' - def __init__(self, cuda_module, func_names): - '''Extract __global__ functions listed in func_names from - the PyCUDA module object. The functions are assigned - to attributes of this object with the same name.''' - for name in func_names: - setattr(self, name, cuda_module.get_function(name)) - -class GPU(object): - def __init__(self, device_id=None, nthreads_per_block=64, max_blocks=1024): - '''Initialize a GPU context on the specified device. If device_id==None, - the default device is used.''' - - if device_id is None: - self.context = make_default_context() - else: - device = cuda.Device(device_id) - self.context = device.make_context() - - print 'device %s' % self.context.get_device().name() - - self.nthreads_per_block = nthreads_per_block - self.max_blocks = max_blocks - - self.context.set_cache_config(cuda.func_cache.PREFER_L1) - - cuda_options = ['-I' + dirname(chroma.src.__file__), '--use_fast_math', '--ptxas-options=-v'] - - self.module = SourceModule(chroma.src.kernel, options=cuda_options, no_extern_c=True) - self.kernels = CUDAFuncs(self.module, ['ray_trace', 'ray_trace_alpha', 'distance_to_mesh', 'rotate', 'rotate_around_point', 'translate', 'update_xyz_lookup', 'update_xyz_image', 'process_image', 'init_rng']) - - self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids']) - - self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate', 'swap']) - self.daq_module = SourceModule(chroma.src.daq, options=cuda_options, no_extern_c=True) - self.daq_funcs = CUDAFuncs(self.daq_module, - ['reset_earliest_time_int', 'run_daq', - 'convert_sortable_int_to_float', 'bin_hits', - 'accumulate_pdf_eval']) +class GPUGeometry(object): + def __init__(self, gpu, geometry, load=True, activate=True, print_usage=True): + self.geometry = geometry - def print_device_usage(self): - print 'device usage:' - print format_array('vertices', self.vertices_gpu) - print format_array('triangles', self.triangles_gpu) - print format_array('lower_bounds', self.lower_bounds_gpu) - print format_array('upper_bounds', self.upper_bounds_gpu) - print format_array('node_map', self.node_map_gpu) - print format_array('node_map_end', self.node_map_end_gpu) - print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes)) + self.module = gpu.module + self.gpu_funcs = GPUFuncs(gpu.module) - def load_geometry(self, geometry, print_usage=True): - if not hasattr(geometry, 'mesh'): - print 'WARNING: building geometry with 8-bits' - geometry.build(bits=8) + if load: + self.load(activate, print_usage) - self.geo_funcs.set_wavelength_range(np.float32(standard_wavelengths[0]), np.float32(standard_wavelengths[-1]), np.float32(standard_wavelengths[1]-standard_wavelengths[0]), np.uint32(standard_wavelengths.size), block=(1,1,1), grid=(1,1)) + def load(self, activate=True, print_usage=True): + self.gpu_funcs.set_wavelength_range(np.float32(standard_wavelengths[0]), np.float32(standard_wavelengths[-1]), np.float32(standard_wavelengths[1]-standard_wavelengths[0]), np.uint32(standard_wavelengths.size), block=(1,1,1), grid=(1,1)) self.materials = [] - for i in range(len(geometry.unique_materials)): - material = copy(geometry.unique_materials[i]) + for i in range(len(self.geometry.unique_materials)): + material = copy(self.geometry.unique_materials[i]) if material is None: raise Exception('one or more triangles is missing a material.') - material.refractive_index_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, material.refractive_index[:,0], material.refractive_index[:,1]).astype(np.float32)) - material.absorption_length_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, material.absorption_length[:,0], material.absorption_length[:,1]).astype(np.float32)) - material.scattering_length_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, material.scattering_length[:,0], material.scattering_length[:,1]).astype(np.float32)) + material.refractive_index_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.refractive_index[:,0], material.refractive_index[:,1]).astype(np.float32)) + material.absorption_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.absorption_length[:,0], material.absorption_length[:,1]).astype(np.float32)) + material.scattering_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.scattering_length[:,0], material.scattering_length[:,1]).astype(np.float32)) - self.geo_funcs.set_material(np.int32(i), material.refractive_index_gpu, material.absorption_length_gpu, material.scattering_length_gpu, block=(1,1,1), grid=(1,1)) + self.gpu_funcs.set_material(np.int32(i), material.refractive_index_gpu, material.absorption_length_gpu, material.scattering_length_gpu, block=(1,1,1), grid=(1,1)) self.materials.append(material) self.surfaces = [] - for i in range(len(geometry.unique_surfaces)): - surface = copy(geometry.unique_surfaces[i]) + for i in range(len(self.geometry.unique_surfaces)): + surface = copy(self.geometry.unique_surfaces[i]) if surface is None: continue - surface.detect_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.detect[:,0], surface.detect[:,1]).astype(np.float32)) - surface.absorb_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.absorb[:,0], surface.absorb[:,1]).astype(np.float32)) - surface.reflect_diffuse_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.reflect_diffuse[:,0], surface.reflect_diffuse[:,1]).astype(np.float32)) - surface.reflect_specular_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.reflect_specular[:,0], surface.reflect_specular[:,1]).astype(np.float32)) + surface.detect_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.detect[:,0], surface.detect[:,1]).astype(np.float32)) + surface.absorb_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.absorb[:,0], surface.absorb[:,1]).astype(np.float32)) + surface.reflect_diffuse_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_diffuse[:,0], surface.reflect_diffuse[:,1]).astype(np.float32)) + surface.reflect_specular_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_specular[:,0], surface.reflect_specular[:,1]).astype(np.float32)) - self.geo_funcs.set_surface(np.int32(i), surface.detect_gpu, surface.absorb_gpu, surface.reflect_diffuse_gpu, surface.reflect_specular_gpu, block=(1,1,1), grid=(1,1)) + self.gpu_funcs.set_surface(np.int32(i), surface.detect_gpu, surface.absorb_gpu, surface.reflect_diffuse_gpu, surface.reflect_specular_gpu, block=(1,1,1), grid=(1,1)) self.surfaces.append(surface) - self.vertices_gpu = gpuarray.to_gpu(to_float3(geometry.mesh.vertices)) + self.vertices_gpu = ga.to_gpu(to_float3(self.geometry.mesh.vertices)) triangles = \ - np.empty(len(geometry.mesh.triangles), dtype=gpuarray.vec.uint4) - triangles['x'] = geometry.mesh.triangles[:,0] - triangles['y'] = geometry.mesh.triangles[:,1] - triangles['z'] = geometry.mesh.triangles[:,2] - triangles['w'] = ((geometry.material1_index & 0xff) << 24) | ((geometry.material2_index & 0xff) << 16) | ((geometry.surface_index & 0xff) << 8) - self.triangles_gpu = gpuarray.to_gpu(triangles) + np.empty(len(self.geometry.mesh.triangles), dtype=ga.vec.uint4) + triangles['x'] = self.geometry.mesh.triangles[:,0] + triangles['y'] = self.geometry.mesh.triangles[:,1] + triangles['z'] = self.geometry.mesh.triangles[:,2] + triangles['w'] = ((self.geometry.material1_index & 0xff) << 24) | ((self.geometry.material2_index & 0xff) << 16) | ((self.geometry.surface_index & 0xff) << 8) + self.triangles_gpu = ga.to_gpu(triangles) - self.lower_bounds_gpu = gpuarray.to_gpu(to_float3(geometry.lower_bounds)) + self.lower_bounds_gpu = ga.to_gpu(to_float3(self.geometry.lower_bounds)) - self.upper_bounds_gpu = gpuarray.to_gpu(to_float3(geometry.upper_bounds)) + self.upper_bounds_gpu = ga.to_gpu(to_float3(self.geometry.upper_bounds)) - self.colors_gpu = gpuarray.to_gpu(geometry.colors.astype(np.uint32)) - self.node_map_gpu = gpuarray.to_gpu(geometry.node_map.astype(np.uint32)) - self.node_map_end_gpu = gpuarray.to_gpu(geometry.node_map_end.astype(np.uint32)) - self.solid_id_map_gpu = gpuarray.to_gpu(geometry.solid_id.astype(np.uint32)) - - self.geo_funcs.set_global_mesh_variables(self.triangles_gpu, self.vertices_gpu, self.colors_gpu, np.uint32(geometry.node_map.size-1), np.uint32(geometry.first_node), self.lower_bounds_gpu, self.upper_bounds_gpu, block=(1,1,1), grid=(1,1)) + self.colors_gpu = ga.to_gpu(self.geometry.colors.astype(np.uint32)) + self.node_map_gpu = ga.to_gpu(self.geometry.node_map.astype(np.uint32)) + self.node_map_end_gpu = ga.to_gpu(self.geometry.node_map_end.astype(np.uint32)) + self.solid_id_map_gpu = ga.to_gpu(self.geometry.solid_id.astype(np.uint32)) self.node_map_tex = self.module.get_texref('node_map') self.node_map_end_tex = self.module.get_texref('node_map_end') + if print_usage: + self.print_device_usage() + + if activate: + self.activate() + + def activate(self): + self.gpu_funcs.set_global_mesh_variables(self.triangles_gpu, self.vertices_gpu, self.colors_gpu, np.uint32(self.geometry.node_map.size-1), np.uint32(self.geometry.first_node), self.lower_bounds_gpu, self.upper_bounds_gpu, block=(1,1,1), grid=(1,1)) + self.node_map_tex.set_address(self.node_map_gpu.gpudata, self.node_map_gpu.nbytes) self.node_map_end_tex.set_address(self.node_map_end_gpu.gpudata, self.node_map_end_gpu.nbytes) self.node_map_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1) self.node_map_end_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1) - self.geometry = geometry - - if print_usage: - print 'average of %f child nodes per node' % (np.mean(geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:])) - print '%i nodes with one child' % (np.count_nonzero((geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]) == 1)) - print '%i leaf nodes with one child' % (np.count_nonzero((geometry.node_map_end[:geometry.first_node] - geometry.node_map[:geometry.first_node]) == 1)) - self.print_device_usage() + def print_device_usage(self): + print 'device usage:' + print '-'*10 + print format_array('vertices', self.vertices_gpu) + print format_array('triangles', self.triangles_gpu) + print format_array('lower_bounds', self.lower_bounds_gpu) + print format_array('upper_bounds', self.upper_bounds_gpu) + print format_array('node_map', self.node_map_gpu) + print format_array('node_map_end', self.node_map_end_gpu) + print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes)) + print '-'*10 + free, total = cuda.mem_get_info() + print '%-15s %6s %6s' % ('device total', '', format_size(total)) + print '%-15s %6s %6s' % ('device used', '', format_size(total-free)) + print '%-15s %6s %6s' % ('device free', '', format_size(free)) + print def reset_colors(self): self.colors_gpu.set_async(self.geometry.colors.astype(np.uint32)) def color_solids(self, solid_hit, colors): - solid_hit_gpu = gpuarray.to_gpu(np.array(solid_hit, dtype=np.bool)) - solid_colors_gpu = gpuarray.to_gpu(np.array(colors, dtype=np.uint32)) + solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool)) + solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32)) - for first_triangle, triangles_this_round, blocks in chunk_iterator(self.triangles_gpu.size, self.nthreads_per_block, self.max_blocks): - self.geo_funcs.color_solids(np.int32(first_triangle), + for first_triangle, triangles_this_round, blocks in \ + chunk_iterator(self.triangles_gpu.size): + self.gpu_funcs.color_solids(np.int32(first_triangle), np.int32(triangles_this_round), self.solid_id_map_gpu, solid_hit_gpu, solid_colors_gpu, - block=(self.nthreads_per_block,1,1), + block=(64,1,1), grid=(blocks,1)) - self.context.synchronize() - - def setup_propagate(self, seed=1): - self.rng_states_gpu = cuda.mem_alloc(self.nthreads_per_block * self.max_blocks - * sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) - self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthreads_per_block), self.rng_states_gpu, - np.uint64(seed), np.uint64(0), - block=(self.nthreads_per_block,1,1), - grid=(self.max_blocks,1)) - #self.context.synchronize() - - def load_photons(self, photons): - '''Load photons onto the GPU from an event.Photons object. - - If photon.histories or photon.last_hit_triangles are set to none, - they will be initialized to 0 and -1 on the GPU, respectively. - ''' - self.nphotons = len(photons.positions) - assert len(photons.directions) == self.nphotons - assert len(photons.polarizations) == self.nphotons - assert len(photons.wavelengths) == self.nphotons - - self.positions_gpu = gpuarray.to_gpu(to_float3(photons.positions)) - self.directions_gpu = gpuarray.to_gpu(to_float3(photons.directions)) - self.polarizations_gpu = gpuarray.to_gpu(to_float3(photons.polarizations)) - self.wavelengths_gpu = gpuarray.to_gpu(photons.wavelengths.astype(np.float32)) - - if photons.times is not None: - self.times_gpu = gpuarray.to_gpu(photons.times.astype(np.float32)) - else: - self.times_gpu = gpuarray.zeros(self.nphotons, dtype=np.float32) - if photons.histories is not None: - self.histories_gpu = gpuarray.to_gpu(photons.histories.astype(np.uint32)) - else: - self.histories_gpu = gpuarray.zeros(self.nphotons, dtype=np.uint32) - - if photons.last_hit_triangles is not None: - self.last_hit_triangles_gpu = gpuarray.to_gpu(photons.last_hit_triangles.astype(np.int32)) - else: - self.last_hit_triangles_gpu = gpuarray.empty(self.nphotons, dtype=np.int32) - self.last_hit_triangles_gpu.fill(-1) - - - def propagate(self, max_steps=10): - '''Propagate photons on GPU to termination or max_steps, whichever comes first. - - May be called repeatedly without reloading photon information if single-stepping - through photon history. - ''' - nphotons = self.nphotons - step = 0 - input_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32) - input_queue[1:] = np.arange(self.nphotons, dtype=np.uint32) - input_queue_gpu = gpuarray.to_gpu(input_queue) - output_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32) - output_queue[0] = 1 - output_queue_gpu = gpuarray.to_gpu(output_queue) - - - while step < max_steps: - ## Just finish the rest of the steps if the # of photons is low - if nphotons < self.nthreads_per_block * 16 * 8: - nsteps = max_steps - step - else: - nsteps = 1 - - for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthreads_per_block, self.max_blocks): - self.prop_funcs.propagate(np.int32(first_photon), - np.int32(photons_this_round), - input_queue_gpu[1:], - output_queue_gpu, - self.rng_states_gpu, self.positions_gpu, - self.directions_gpu, - self.wavelengths_gpu, self.polarizations_gpu, - self.times_gpu, self.histories_gpu, - self.last_hit_triangles_gpu, - np.int32(nsteps), - block=(self.nthreads_per_block,1,1), - grid=(blocks, 1)) - - step += nsteps - - if step < max_steps: - temp = input_queue_gpu - input_queue_gpu = output_queue_gpu - output_queue_gpu = temp - output_queue_gpu[:1].set(np.uint32(1)) - nphotons = input_queue_gpu[:1].get()[0] - 1 - - if gpuarray.max(self.histories_gpu).get() & (1 << 31): - print >>sys.stderr, "WARNING: ABORTED PHOTONS IN THIS EVENT" - - if 'profile' in __builtins__: - self.context.synchronize() - - def get_photons(self): - '''Returns a dictionary of current photon state information. - - Contents of dictionary have the same names as the parameters to load_photons(). - ''' - return event.Photons(positions=self.positions_gpu.get().view(np.float32).reshape(self.positions_gpu.size, 3), - directions=self.directions_gpu.get().view(np.float32).reshape(self.directions_gpu.size, 3), - polarizations=self.polarizations_gpu.get().view(np.float32).reshape(self.polarizations_gpu.size, 3), - wavelengths=self.wavelengths_gpu.get(), - times=self.times_gpu.get(), - histories=self.histories_gpu.get(), - last_hit_triangles=self.last_hit_triangles_gpu.get()) - - def setup_daq(self, max_pmt_id, pmt_rms=1.2e-9): - self.earliest_time_gpu = gpuarray.GPUArray(shape=(max_pmt_id+1,), dtype=np.float32) - self.earliest_time_int_gpu = gpuarray.GPUArray(shape=self.earliest_time_gpu.shape, - dtype=np.uint32) - self.channel_history_gpu = gpuarray.zeros_like(self.earliest_time_int_gpu) - self.channel_q_gpu = gpuarray.zeros_like(self.earliest_time_int_gpu) +class GPUDaq(object): + def __init__(self, gpu_geometry, max_pmt_id, pmt_rms=1.2e-9): + self.earliest_time_gpu = ga.empty(max_pmt_id+1, dtype=np.float32) + self.earliest_time_int_gpu = ga.empty(max_pmt_id+1, dtype=np.uint32) + self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu) + self.channel_q_gpu = ga.zeros_like(self.earliest_time_int_gpu) self.daq_pmt_rms = pmt_rms + self.solid_id_map_gpu = gpu_geometry.solid_id_map_gpu + + self.module = get_cu_module('daq', include_source_directory=False) + self.gpu_funcs = GPUFuncs(self.module) - def run_daq(self): - self.daq_funcs.reset_earliest_time_int(np.float32(1e9), - np.int32(len(self.earliest_time_int_gpu)), - self.earliest_time_int_gpu, - block=(self.nthreads_per_block,1,1), - grid=(len(self.earliest_time_int_gpu)//self.nthreads_per_block+1,1)) + def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024): + self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) self.channel_q_gpu.fill(0) self.channel_history_gpu.fill(0) - #self.context.synchronize() - for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthreads_per_block, self.max_blocks): - self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2), - np.float32(self.daq_pmt_rms), - np.int32(first_photon), - np.int32(photons_this_round), - self.times_gpu, self.histories_gpu, - self.last_hit_triangles_gpu, - self.solid_id_map_gpu, - np.int32(len(self.earliest_time_int_gpu)), - self.earliest_time_int_gpu, - self.channel_q_gpu, - self.channel_history_gpu, - block=(self.nthreads_per_block,1,1), grid=(blocks,1)) - #self.context.synchronize() - self.daq_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), - self.earliest_time_int_gpu, - self.earliest_time_gpu, - block=(self.nthreads_per_block,1,1), - grid=(len(self.earliest_time_int_gpu)//self.nthreads_per_block+1,1)) - if 'profile' in __builtins__: - self.context.synchronize() - + n = len(gpuphotons.pos) - def get_hits(self): - t = self.earliest_time_gpu.get() - # For now, assume all channels with small enough hit time were hit - return event.Channels(hit=t<1e8, t=t, - q=self.channel_q_gpu.get().astype(np.float32), - histories=self.channel_history_gpu.get()) + for first_photon, photons_this_round, blocks in \ + chunk_iterator(n, nthreads_per_block, max_blocks): + self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), np.float32(self.daq_pmt_rms), np.int32(first_photon), np.int32(photons_this_round), gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, self.solid_id_map_gpu, np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.channel_q_gpu, self.channel_history_gpu, block=(nthreads_per_block,1,1), grid=(blocks,1)) + + self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) + + return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu) + +class GPUPDF(object): + def __init__(self): + self.module = get_cu_module('daq') + self.gpu_funcs = GPUFuncs(self.module) def setup_pdf(self, max_pmt_id, tbins, trange, qbins, qrange): - '''Setup GPU arrays to hold PDF information. + """Setup GPU arrays to hold PDF information. max_pmt_id: int, largest PMT id # tbins: number of time bins trange: tuple of (min, max) time in PDF qbins: number of charge bins qrange: tuple of (min, max) charge in PDF - ''' + """ self.events_in_histogram = 0 - self.hitcount_gpu = gpuarray.zeros(max_pmt_id+1, dtype=np.uint32) - self.pdf_gpu = gpuarray.zeros(shape=(max_pmt_id+1, tbins, qbins), + self.hitcount_gpu = ga.zeros(max_pmt_id+1, dtype=np.uint32) + self.pdf_gpu = ga.zeros(shape=(max_pmt_id+1, tbins, qbins), dtype=np.uint32) self.tbins = tbins self.trange = trange @@ -395,16 +391,14 @@ class GPU(object): self.qrange = qrange def clear_pdf(self): - '''Rezero the PDF counters.''' + """Rezero the PDF counters.""" self.hitcount_gpu.fill(0) self.pdf_gpu.fill(0) - def add_hits_to_pdf(self): - '''Put the most recent results of run_daq() into the PDF.''' - - self.daq_funcs.bin_hits(np.int32(len(self.hitcount_gpu)), - self.channel_q_gpu, - self.earliest_time_gpu, + def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64): + self.gpu_funcs.bin_hits(np.int32(len(self.hitcount_gpu)), + gpuchannels.q, + gpuchannels.t, self.hitcount_gpu, np.int32(self.tbins), np.float32(self.trange[0]), @@ -413,20 +407,21 @@ class GPU(object): np.float32(self.qrange[0]), np.float32(self.qrange[1]), self.pdf_gpu, - block=(self.nthreads_per_block,1,1), - grid=(len(self.earliest_time_gpu)//self.nthreads_per_block+1,1)) + block=(nthreads_per_block,1,1), + grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) self.events_in_histogram += 1 def get_pdfs(self): - '''Returns the 1D hitcount array and the 3D [channel, time, charge] histogram''' + """Returns the 1D hitcount array and the 3D [channel, time, charge] + histogram.""" return self.hitcount_gpu.get(), self.pdf_gpu.get() - def setup_pdf_eval(self, event_hit, event_time, event_charge, - min_twidth, trange, min_qwidth, qrange, min_bin_content=10, + def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth, + trange, min_qwidth, qrange, min_bin_content=10, time_only=True): - '''Setup GPU arrays to compute PDF values for the given event. + """Setup GPU arrays to compute PDF values for the given event. The pdf_eval calculation allows the PDF to be evaluated at a single point for each channel as the Monte Carlo is run. The effective bin size will be as small as (`min_twidth`, @@ -455,14 +450,14 @@ class GPU(object): The bin will be expanded to include at least this many events time_only: bool If True, only the time observable will be used in the PDF. - ''' - self.event_hit_gpu = gpuarray.to_gpu(event_hit.astype(np.uint32)) - self.event_time_gpu = gpuarray.to_gpu(event_time.astype(np.float32)) - self.event_charge_gpu = gpuarray.to_gpu(event_charge.astype(np.float32)) - - self.eval_hitcount_gpu = gpuarray.zeros(len(event_hit), dtype=np.uint32) - self.eval_bincount_gpu = gpuarray.zeros(len(event_hit), dtype=np.uint32) - self.nearest_mc_gpu = gpuarray.empty(shape=len(event_hit) * min_bin_content, + """ + self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32)) + self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32)) + self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32)) + + self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) + self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) + self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content, dtype=np.float32) self.nearest_mc_gpu.fill(1e9) @@ -474,21 +469,20 @@ class GPU(object): self.time_only = time_only def clear_pdf_eval(self): - '''Reset PDF evaluation counters to start accumulating new Monte Carlo.''' + "Reset PDF evaluation counters to start accumulating new Monte Carlo." self.eval_hitcount_gpu.fill(0) self.eval_bincount_gpu.fill(0) self.nearest_mc_gpu.fill(1e9) - def accumulate_pdf_eval(self): - '''Add the most recent results of run_daq() to the PDF evaluation.''' - - self.daq_funcs.accumulate_pdf_eval(np.int32(self.time_only), + def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64): + "Add the most recent results of run_daq() to the PDF evaluation." + self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only), np.int32(len(self.event_hit_gpu)), self.event_hit_gpu, self.event_time_gpu, self.event_charge_gpu, - self.earliest_time_gpu, - self.channel_q_gpu, + gpuchannels.t, + gpuchannels.q, self.eval_hitcount_gpu, self.eval_bincount_gpu, np.float32(self.min_twidth), @@ -499,8 +493,8 @@ class GPU(object): np.float32(self.qrange[1]), self.nearest_mc_gpu, np.int32(self.min_bin_content), - block=(self.nthreads_per_block,1,1), - grid=(len(self.earliest_time_gpu)//self.nthreads_per_block+1,1)) + block=(nthreads_per_block,1,1), + grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) def get_pdf_eval(self): evhit = self.event_hit_gpu.get().astype(bool) @@ -525,8 +519,9 @@ class GPU(object): nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content)) - # Deal with the case where we did not even get min_bin_content events in the PDF - # but also clamp the lower range to ensure we don't index by a negative number in 2 lines + # Deal with the case where we did not even get min_bin_content events + # in the PDF but also clamp the lower range to ensure we don't index + # by a negative number in 2 lines last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1) distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry] @@ -544,5 +539,31 @@ class GPU(object): return hitcount, pdf_value, pdf_value * pdf_frac_uncert +class GPU(object): + def __init__(self, device_id=None): + """Initialize a GPU context on the specified device. + If device_id is None, the default device is used.""" + + if device_id is None: + self.context = pycuda.tools.make_default_context() + else: + device = cuda.Device(device_id) + self.context = device.make_context() + + print 'device %s' % self.context.get_device().name() + + self.print_mem_info() + + self.context.set_cache_config(cuda.func_cache.PREFER_L1) + + cuda_options = ['--use_fast_math']#, '--ptxas-options=-v'] + + self.module = get_cu_module('kernel', options=cuda_options) + + def print_mem_info(self): + free, total = cuda.mem_get_info() + + print '%.1f %% of device memory is free.' % ((free/float(total))*100) + def __del__(self): self.context.pop() |