Diffstat (limited to 'gpu.py')
-rw-r--r-- | gpu.py | 799 |
1 file changed, 0 insertions, 799 deletions
@@ -1,799 +0,0 @@
import numpy as np
import numpy.ma as ma
from copy import copy
from itertools import izip
import os
import sys
import gc

import pytools
import pycuda.tools
from pycuda.compiler import SourceModule
from pycuda import characterize
import pycuda.driver as cuda
from pycuda import gpuarray as ga

import chroma.src
from chroma.tools import timeit, profile_if_possible
from chroma.geometry import standard_wavelengths
from chroma import event

cuda.init()

# standard nvcc options
cuda_options = ('--use_fast_math',)  # add '--ptxas-options=-v' for register usage reports

@pycuda.tools.context_dependent_memoize
def get_cu_module(name, options=None, include_source_directory=True):
    """Returns a pycuda.compiler.SourceModule object from a CUDA source file
    located in the chroma src directory at src/[name]."""
    if options is None:
        options = []
    elif isinstance(options, tuple):
        options = list(options)
    else:
        raise TypeError('`options` must be a tuple.')

    srcdir = os.path.dirname(os.path.abspath(chroma.src.__file__))

    if include_source_directory:
        options += ['-I' + srcdir]

    with open('%s/%s' % (srcdir, name)) as f:
        source = f.read()

    return pycuda.compiler.SourceModule(source, options=options,
                                        no_extern_c=True)

@pycuda.tools.memoize
def get_cu_source(name):
    """Get the source code for a CUDA source file located in the chroma src
    directory at src/[name]."""
    srcdir = os.path.dirname(os.path.abspath(chroma.src.__file__))
    with open('%s/%s' % (srcdir, name)) as f:
        source = f.read()
    return source

class GPUFuncs(object):
    """Simple container class for GPU functions as attributes."""
    def __init__(self, module):
        self.module = module
        self.funcs = {}

    def __getattr__(self, name):
        try:
            return self.funcs[name]
        except KeyError:
            f = self.module.get_function(name)
            self.funcs[name] = f
            return f

init_rng_src = """
#include <curand_kernel.h>

extern "C"
{

__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
{
    int id = blockIdx.x*blockDim.x + threadIdx.x;

    if (id >= nthreads)
        return;

    curand_init(seed, id, offset, &s[id]);
}

} // extern "C"
"""

def get_rng_states(size, seed=1):
    "Return `size` number of CUDA random number generator states."
    rng_states = cuda.mem_alloc(
        size*characterize.sizeof('curandStateXORWOW',
                                 '#include <curand_kernel.h>'))

    module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True)
    init_rng = module.get_function('init_rng')

    init_rng(np.int32(size), rng_states, np.uint64(seed), np.uint64(0),
             block=(64,1,1), grid=(size//64+1,1))

    return rng_states
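# Editor's note: a minimal usage sketch (not part of the original file) for
# the compilation and RNG helpers above.  It assumes a CUDA device and an
# active context; 'propagate.cu' is a real chroma kernel file.
def _example_module_and_rng_setup(nthreads_per_block=64, max_blocks=1024):
    module = get_cu_module('propagate.cu', options=cuda_options)
    funcs = GPUFuncs(module)  # funcs.propagate is resolved lazily on first access
    # propagate() below requires at least nthreads_per_block*max_blocks states
    rng_states = get_rng_states(nthreads_per_block*max_blocks, seed=1)
    return funcs, rng_states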
def to_float3(arr):
    "Returns a pycuda.gpuarray.vec.float3 array from an (N,3) array."
    if not arr.flags['C_CONTIGUOUS']:
        arr = np.asarray(arr, order='c')
    return arr.astype(np.float32).view(ga.vec.float3)[:,0]

def to_uint3(arr):
    "Returns a pycuda.gpuarray.vec.uint3 array from an (N,3) array."
    if not arr.flags['C_CONTIGUOUS']:
        arr = np.asarray(arr, order='c')
    return arr.astype(np.uint32).view(ga.vec.uint3)[:,0]

def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024):
    """Iterator that yields tuples with the values required to process
    a long array in multiple kernel passes on the GPU.

    Each yielded value is of the form,
        (first_index, elements_this_iteration, nblocks_this_iteration)

    Example:
        >>> list(chunk_iterator(300, 32, 2))
        [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 44, 2)]
    """
    first = 0
    while first < nelements:
        elements_left = nelements - first
        blocks = int(elements_left // nthreads_per_block)
        if elements_left % nthreads_per_block != 0:
            blocks += 1  # round up only if needed
        blocks = min(max_blocks, blocks)
        elements_this_round = min(elements_left, blocks * nthreads_per_block)

        yield (first, elements_this_round, blocks)
        first += elements_this_round
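# Editor's note: chunk_iterator exists to drive kernel-launch loops like the
# hypothetical sketch below, which caps each launch at max_blocks while still
# covering all n elements.  `some_kernel` and `args` are placeholders.
def _example_chunked_launch(some_kernel, args, n,
                            nthreads_per_block=64, max_blocks=1024):
    for first, elements_this_round, blocks in \
            chunk_iterator(n, nthreads_per_block, max_blocks):
        some_kernel(np.int32(first), np.int32(elements_this_round), *args,
                    block=(nthreads_per_block,1,1), grid=(blocks,1))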
class GPUPhotons(object):
    def __init__(self, photons, ncopies=1):
        """Load ``photons`` onto the GPU, replicating as requested.

        Args:
            - photons: chroma.event.Photons
                Photon state information to load onto GPU
            - ncopies: int, *optional*
                Number of times to replicate the photons
                on the GPU.  This is used if you want
                to propagate the same event many times,
                for example in a likelihood calculation.

                The amount of GPU storage will be proportionally
                larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        self.pos = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
        self.t = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=nphotons*ncopies, dtype=np.int32)
        self.flags = ga.empty(shape=nphotons*ncopies, dtype=np.uint32)

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                                np.int32(photons_this_round),
                                                self.pos, self.dir,
                                                self.wavelengths, self.pol,
                                                self.t, self.flags,
                                                self.last_hit_triangles,
                                                np.int32(ncopies-1),
                                                np.int32(nphotons),
                                                block=(nthreads_per_block,1,1),
                                                grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies

    def get(self):
        pos = self.pos.get().view(np.float32).reshape((len(self.pos),3))
        dir = self.dir.get().view(np.float32).reshape((len(self.dir),3))
        pol = self.pol.get().view(np.float32).reshape((len(self.pol),3))
        wavelengths = self.wavelengths.get()
        t = self.t.get()
        last_hit_triangles = self.last_hit_triangles.get()
        flags = self.flags.get()
        return event.Photons(pos, dir, pol, wavelengths, t,
                             last_hit_triangles, flags)

    def iterate_copies(self):
        '''Returns an iterator that yields GPUPhotonsSlice objects
        corresponding to the event copies stored in ``self``.'''
        for i in xrange(self.ncopies):
            window = slice(self.true_nphotons*i, self.true_nphotons*(i+1))
            yield GPUPhotonsSlice(pos=self.pos[window],
                                  dir=self.dir[window],
                                  pol=self.pol[window],
                                  wavelengths=self.wavelengths[window],
                                  t=self.t[window],
                                  last_hit_triangles=self.last_hit_triangles[window],
                                  flags=self.flags[window])

    @profile_if_possible
    def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
                  max_blocks=1024, max_steps=10):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        .. warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.
        """
        nphotons = self.pos.size
        step = 0
        input_queue = np.empty(shape=nphotons+1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to each other
        for copy in xrange(self.ncopies):
            input_queue[1+copy::self.ncopies] = \
                np.arange(self.true_nphotons, dtype=np.uint32) \
                + copy * self.true_nphotons
        input_queue_gpu = ga.to_gpu(input_queue)
        output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
        output_queue[0] = 1
        output_queue_gpu = ga.to_gpu(output_queue)

        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low
            if nphotons < nthreads_per_block * 16 * 8:
                nsteps = max_steps - step
            else:
                nsteps = 1

            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.propagate(np.int32(first_photon),
                                         np.int32(photons_this_round),
                                         input_queue_gpu[1:],
                                         output_queue_gpu, rng_states,
                                         self.pos, self.dir,
                                         self.wavelengths, self.pol,
                                         self.t, self.flags,
                                         self.last_hit_triangles,
                                         np.int32(nsteps),
                                         gpu_geometry.gpudata,
                                         block=(nthreads_per_block,1,1),
                                         grid=(blocks, 1))

            step += nsteps

            if step < max_steps:
                temp = input_queue_gpu
                input_queue_gpu = output_queue_gpu
                output_queue_gpu = temp
                output_queue_gpu[:1].set(np.uint32(1))
                nphotons = input_queue_gpu[:1].get()[0] - 1

        if ga.max(self.flags).get() & (1 << 31):
            print >>sys.stderr, "WARNING: ABORTED PHOTONS"

    def __del__(self):
        del self.pos
        del self.dir
        del self.pol
        del self.wavelengths
        del self.t
        del self.flags
        del self.last_hit_triangles
        # Free up GPU memory quickly if now available
        gc.collect()
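# Editor's note: a hedged end-to-end sketch of the class above (not part of
# the original file).  `my_photons` is assumed to be a chroma.event.Photons
# instance and `my_geometry` a CPU-side chroma geometry.
def _example_propagate(my_photons, my_geometry):
    gpu_geometry = GPUGeometry(my_geometry)
    rng_states = get_rng_states(64*1024)  # >= nthreads_per_block*max_blocks
    gpu_photons = GPUPhotons(my_photons)
    gpu_photons.propagate(gpu_geometry, rng_states, max_steps=10)
    return gpu_photons.get()  # copy the propagated photons back to the CPU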
class GPUPhotonsSlice(GPUPhotons):
    '''A `slice`-like view of a subrange of another GPU photons array.
    Works exactly like an instance of GPUPhotons, but the GPU storage
    is taken from another GPUPhotons instance.

    Returned by the GPUPhotons.iterate_copies() iterator.'''
    def __init__(self, pos, dir, pol, wavelengths, t, last_hit_triangles,
                 flags):
        '''Create new object using slices of GPUArrays from an instance
        of GPUPhotons.  NOTE THESE ARE NOT CPU ARRAYS!'''
        self.pos = pos
        self.dir = dir
        self.pol = pol
        self.wavelengths = wavelengths
        self.t = t
        self.last_hit_triangles = last_hit_triangles
        self.flags = flags

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        self.true_nphotons = len(pos)
        self.ncopies = 1

    def __del__(self):
        pass  # Do nothing, because we don't own any of our GPU memory

class GPUChannels(object):
    def __init__(self, t, q, flags):
        self.t = t
        self.q = q
        self.flags = flags

    def get(self):
        t = self.t.get()
        q = self.q.get().astype(np.float32)

        # For now, assume all channels with small
        # enough hit time were hit.
        return event.Channels(t<1e8, t, q, self.flags.get())

class GPURays(object):
    """The GPURays class holds arrays of ray positions and directions
    on the GPU that are used to render a geometry."""
    def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
        self.pos = ga.to_gpu(to_float3(pos))
        self.dir = ga.to_gpu(to_float3(dir))

        self.max_alpha_depth = max_alpha_depth

        self.nblocks = nblocks

        transform_module = get_cu_module('transform.cu', options=cuda_options)
        self.transform_funcs = GPUFuncs(transform_module)

        render_module = get_cu_module('render.cu', options=cuda_options)
        self.render_funcs = GPUFuncs(render_module)

        self.dx = ga.empty(max_alpha_depth*self.pos.size, dtype=np.float32)
        self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
        self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32)

    def rotate(self, phi, n):
        "Rotate by an angle phi around the axis `n`."
        self.transform_funcs.rotate(np.int32(self.pos.size), self.pos,
                                    np.float32(phi), ga.vec.make_float3(*n),
                                    block=(self.nblocks,1,1),
                                    grid=(self.pos.size//self.nblocks+1,1))
        self.transform_funcs.rotate(np.int32(self.dir.size), self.dir,
                                    np.float32(phi), ga.vec.make_float3(*n),
                                    block=(self.nblocks,1,1),
                                    grid=(self.dir.size//self.nblocks+1,1))

    def rotate_around_point(self, phi, n, point):
        """Rotate by an angle phi around the axis `n` passing through
        the point `point`."""
        self.transform_funcs.rotate_around_point(np.int32(self.pos.size),
                                                 self.pos, np.float32(phi),
                                                 ga.vec.make_float3(*n),
                                                 ga.vec.make_float3(*point),
                                                 block=(self.nblocks,1,1),
                                                 grid=(self.pos.size//self.nblocks+1,1))
        self.transform_funcs.rotate(np.int32(self.dir.size), self.dir,
                                    np.float32(phi), ga.vec.make_float3(*n),
                                    block=(self.nblocks,1,1),
                                    grid=(self.dir.size//self.nblocks+1,1))

    def translate(self, v):
        "Translate the ray positions by the vector `v`."
        self.transform_funcs.translate(np.int32(self.pos.size), self.pos,
                                       ga.vec.make_float3(*v),
                                       block=(self.nblocks,1,1),
                                       grid=(self.pos.size//self.nblocks+1,1))

    def render(self, gpu_geometry, pixels, alpha_depth=10,
               keep_last_render=False):
        """Render `gpu_geometry` and fill the GPU array `pixels` with pixel
        colors."""
        if not keep_last_render:
            self.dxlen.fill(0)

        if alpha_depth > self.max_alpha_depth:
            raise Exception('alpha_depth > max_alpha_depth')

        if not isinstance(pixels, ga.GPUArray):
            raise TypeError('`pixels` must be a %s instance.' % ga.GPUArray)

        if pixels.size != self.pos.size:
            raise ValueError('`pixels`.size != number of rays')

        self.render_funcs.render(np.int32(self.pos.size), self.pos, self.dir,
                                 gpu_geometry.gpudata,
                                 np.uint32(alpha_depth), pixels,
                                 self.dx, self.dxlen, self.color,
                                 block=(self.nblocks,1,1),
                                 grid=(self.pos.size//self.nblocks+1,1))

    def snapshot(self, gpu_geometry, alpha_depth=10):
        "Render `gpu_geometry` and return a numpy array of pixel colors."
        pixels = ga.empty(self.pos.size, dtype=np.uint32)
        self.render(gpu_geometry, pixels, alpha_depth)
        return pixels.get()
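# Editor's note: illustrative only.  Renders one frame from a bundle of
# parallel rays pointing down the z axis; the image size and camera placement
# are arbitrary assumptions, not chroma defaults.
def _example_snapshot(gpu_geometry, width=512, height=512):
    x, y = np.meshgrid(np.linspace(-1, 1, width), np.linspace(-1, 1, height))
    pos = np.column_stack([x.ravel(), y.ravel(), -10.0*np.ones(x.size)])
    dir = np.tile([0.0, 0.0, 1.0], (x.size, 1))
    rays = GPURays(pos, dir)
    return rays.snapshot(gpu_geometry).reshape((height, width))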
def make_gpu_struct(size, members):
    struct = cuda.mem_alloc(size)

    i = 0
    for member in members:
        if isinstance(member, ga.GPUArray):
            member = member.gpudata

        if isinstance(member, cuda.DeviceAllocation):
            if i % 8:
                raise Exception('cannot align 64-bit pointer. '
                                'arrange struct member variables in order of '
                                'decreasing size.')

            cuda.memcpy_htod(int(struct)+i, np.intp(int(member)))
            i += 8
        elif np.isscalar(member):
            cuda.memcpy_htod(int(struct)+i, member)
            i += member.nbytes
        else:
            raise TypeError('expected a GPU device pointer or scalar type.')

    return struct

def format_size(size):
    if size < 1e3:
        return '%.1f%s' % (size, ' ')
    elif size < 1e6:
        return '%.1f%s' % (size/1e3, 'K')
    elif size < 1e9:
        return '%.1f%s' % (size/1e6, 'M')
    else:
        return '%.1f%s' % (size/1e9, 'G')

def format_array(name, array):
    return '%-15s %6s %6s' % \
        (name, format_size(len(array)), format_size(array.nbytes))
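# Editor's note: a sketch of the layout rule make_gpu_struct enforces.
# Device pointers are written at 8-byte offsets, so all pointer members must
# come before any scalars; the member values below are made up, mirroring the
# Material struct built later in this file.
def _example_struct_layout(a_gpu, b_gpu):
    # two 8-byte device pointers followed by a uint32 and two float32 scalars
    size = 8 + 8 + 4 + 4 + 4
    return make_gpu_struct(size, [a_gpu, b_gpu,
                                  np.uint32(100),
                                  np.float32(0.5),
                                  np.float32(200.0)])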
class GPUGeometry(object):
    def __init__(self, geometry, wavelengths=None, print_usage=False):
        if wavelengths is None:
            wavelengths = standard_wavelengths

        try:
            wavelength_step = np.unique(np.diff(wavelengths)).item()
        except ValueError:
            raise ValueError('wavelengths must be equally spaced apart.')

        geometry_source = get_cu_source('geometry.h')
        material_struct_size = characterize.sizeof('Material', geometry_source)
        surface_struct_size = characterize.sizeof('Surface', geometry_source)
        geometry_struct_size = characterize.sizeof('Geometry', geometry_source)

        self.material_data = []
        self.material_ptrs = []

        def interp_material_property(wavelengths, property):
            # note that it is essential that the material properties be
            # interpolated linearly. this fact is used in the propagation
            # code to guarantee that probabilities still sum to one.
            # (a small CPU-side example follows this class.)
            return np.interp(wavelengths, property[:,0],
                             property[:,1]).astype(np.float32)

        for i in range(len(geometry.unique_materials)):
            material = geometry.unique_materials[i]

            if material is None:
                raise Exception('one or more triangles is missing a material.')

            refractive_index = interp_material_property(wavelengths, material.refractive_index)
            refractive_index_gpu = ga.to_gpu(refractive_index)
            absorption_length = interp_material_property(wavelengths, material.absorption_length)
            absorption_length_gpu = ga.to_gpu(absorption_length)
            scattering_length = interp_material_property(wavelengths, material.scattering_length)
            scattering_length_gpu = ga.to_gpu(scattering_length)

            self.material_data.append(refractive_index_gpu)
            self.material_data.append(absorption_length_gpu)
            self.material_data.append(scattering_length_gpu)

            material_gpu = \
                make_gpu_struct(material_struct_size,
                                [refractive_index_gpu, absorption_length_gpu,
                                 scattering_length_gpu,
                                 np.uint32(len(wavelengths)),
                                 np.float32(wavelength_step),
                                 np.float32(wavelengths[0])])

            self.material_ptrs.append(material_gpu)

        self.material_pointer_array = \
            make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs)

        self.surface_data = []
        self.surface_ptrs = []

        for i in range(len(geometry.unique_surfaces)):
            surface = geometry.unique_surfaces[i]

            if surface is None:
                # need something to copy to the surface array struct
                # that is the same size as a 64-bit pointer.
                # this pointer will never be used by the simulation.
                self.surface_ptrs.append(np.uint64(0))
                continue

            detect = interp_material_property(wavelengths, surface.detect)
            detect_gpu = ga.to_gpu(detect)
            absorb = interp_material_property(wavelengths, surface.absorb)
            absorb_gpu = ga.to_gpu(absorb)
            reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse)
            reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse)
            reflect_specular = interp_material_property(wavelengths, surface.reflect_specular)
            reflect_specular_gpu = ga.to_gpu(reflect_specular)

            self.surface_data.append(detect_gpu)
            self.surface_data.append(absorb_gpu)
            self.surface_data.append(reflect_diffuse_gpu)
            self.surface_data.append(reflect_specular_gpu)

            surface_gpu = \
                make_gpu_struct(surface_struct_size,
                                [detect_gpu, absorb_gpu,
                                 reflect_diffuse_gpu,
                                 reflect_specular_gpu,
                                 np.uint32(len(wavelengths)),
                                 np.float32(wavelength_step),
                                 np.float32(wavelengths[0])])

            self.surface_ptrs.append(surface_gpu)

        self.surface_pointer_array = \
            make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)

        self.vertices = ga.to_gpu(to_float3(geometry.mesh.vertices))
        self.triangles = ga.to_gpu(to_uint3(geometry.mesh.triangles))

        material_codes = (((geometry.material1_index & 0xff) << 24) |
                          ((geometry.material2_index & 0xff) << 16) |
                          ((geometry.surface_index & 0xff) << 8)).astype(np.uint32)

        self.material_codes = ga.to_gpu(material_codes)

        self.lower_bounds = ga.to_gpu(to_float3(geometry.lower_bounds))
        self.upper_bounds = ga.to_gpu(to_float3(geometry.upper_bounds))
        self.colors = ga.to_gpu(geometry.colors.astype(np.uint32))
        self.node_map = ga.to_gpu(geometry.node_map.astype(np.uint32))
        self.node_map_end = ga.to_gpu(geometry.node_map_end.astype(np.uint32))
        self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))

        self.gpudata = make_gpu_struct(geometry_struct_size,
                                       [self.vertices, self.triangles,
                                        self.material_codes,
                                        self.colors, self.lower_bounds,
                                        self.upper_bounds, self.node_map,
                                        self.node_map_end,
                                        self.material_pointer_array,
                                        self.surface_pointer_array,
                                        np.uint32(geometry.start_node),
                                        np.uint32(geometry.first_node)])

        self.geometry = geometry

        if print_usage:
            self.print_device_usage()

    def print_device_usage(self):
        print 'device usage:'
        print '-'*10
        print format_array('vertices', self.vertices)
        print format_array('triangles', self.triangles)
        print format_array('lower_bounds', self.lower_bounds)
        print format_array('upper_bounds', self.upper_bounds)
        print format_array('node_map', self.node_map)
        print format_array('node_map_end', self.node_map_end)
        print '%-15s %6s %6s' % \
            ('total', '',
             format_size(self.vertices.nbytes + self.triangles.nbytes +
                         self.lower_bounds.nbytes + self.upper_bounds.nbytes +
                         self.node_map.nbytes + self.node_map_end.nbytes))
        print '-'*10
        free, total = cuda.mem_get_info()
        print '%-15s %6s %6s' % ('device total', '', format_size(total))
        print '%-15s %6s %6s' % ('device used', '', format_size(total-free))
        print '%-15s %6s %6s' % ('device free', '', format_size(free))
        print

    def reset_colors(self):
        self.colors.set_async(self.geometry.colors.astype(np.uint32))

    def color_solids(self, solid_hit, colors, nblocks_per_thread=64,
                     max_blocks=1024):
        solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool))
        solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32))

        module = get_cu_module('mesh.h', options=cuda_options)
        color_solids = module.get_function('color_solids')

        for first_triangle, triangles_this_round, blocks in \
                chunk_iterator(self.triangles.size, nblocks_per_thread,
                               max_blocks):
            color_solids(np.int32(first_triangle),
                         np.int32(triangles_this_round), self.solid_id_map,
                         solid_hit_gpu, solid_colors_gpu, self.gpudata,
                         block=(nblocks_per_thread,1,1),
                         grid=(blocks,1))
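# Editor's note: a small CPU-side illustration (not in the original file) of
# the linear interpolation used for material and surface properties above.
# A property table is an (N,2) array of (wavelength, value) rows sampled onto
# the common wavelength grid; the numbers here are made up.
def _example_interp_property():
    refractive_index = np.array([[200.0, 1.36], [800.0, 1.32]])
    wavelengths = np.arange(200.0, 810.0, 20.0)
    return np.interp(wavelengths, refractive_index[:,0],
                     refractive_index[:,1]).astype(np.float32)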
class GPUDaq(object):
    def __init__(self, gpu_geometry, max_pmt_id, pmt_rms=1.2e-9):
        self.earliest_time_gpu = ga.empty(max_pmt_id+1, dtype=np.float32)
        self.earliest_time_int_gpu = ga.empty(max_pmt_id+1, dtype=np.uint32)
        self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
        self.channel_q_gpu = ga.zeros_like(self.earliest_time_int_gpu)
        self.daq_pmt_rms = pmt_rms
        self.solid_id_map_gpu = gpu_geometry.solid_id_map

        self.module = get_cu_module('daq.cu', options=cuda_options,
                                    include_source_directory=False)
        self.gpu_funcs = GPUFuncs(self.module)

    def acquire(self, gpuphotons, rng_states, nthreads_per_block=64,
                max_blocks=1024):
        self.gpu_funcs.reset_earliest_time_int(
            np.float32(1e9),
            np.int32(len(self.earliest_time_int_gpu)),
            self.earliest_time_int_gpu,
            block=(nthreads_per_block,1,1),
            grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
        self.channel_q_gpu.fill(0)
        self.channel_history_gpu.fill(0)

        n = len(gpuphotons.pos)

        for first_photon, photons_this_round, blocks in \
                chunk_iterator(n, nthreads_per_block, max_blocks):
            self.gpu_funcs.run_daq(
                rng_states, np.uint32(0x1 << 2),
                np.float32(self.daq_pmt_rms),
                np.int32(first_photon), np.int32(photons_this_round),
                gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles,
                self.solid_id_map_gpu,
                np.int32(len(self.earliest_time_int_gpu)),
                self.earliest_time_int_gpu, self.channel_q_gpu,
                self.channel_history_gpu,
                block=(nthreads_per_block,1,1), grid=(blocks,1))

        self.gpu_funcs.convert_sortable_int_to_float(
            np.int32(len(self.earliest_time_int_gpu)),
            self.earliest_time_int_gpu, self.earliest_time_gpu,
            block=(nthreads_per_block,1,1),
            grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))

        return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu,
                           self.channel_history_gpu)
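# Editor's note: hedged sketch of one simulate-then-digitize pass using the
# class above; gpu_geometry, gpu_photons and rng_states are assumed to come
# from the earlier examples, max_pmt_id from the detector description.
def _example_daq(gpu_geometry, gpu_photons, rng_states, max_pmt_id):
    daq = GPUDaq(gpu_geometry, max_pmt_id)
    gpu_channels = daq.acquire(gpu_photons, rng_states)
    return gpu_channels.get()  # event.Channels: hit mask, t, q, flags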
class GPUPDF(object):
    def __init__(self):
        self.module = get_cu_module('daq.cu', options=cuda_options,
                                    include_source_directory=False)
        self.gpu_funcs = GPUFuncs(self.module)

    def setup_pdf(self, max_pmt_id, tbins, trange, qbins, qrange):
        """Setup GPU arrays to hold PDF information.

        max_pmt_id: int, largest PMT id #
        tbins: number of time bins
        trange: tuple of (min, max) time in PDF
        qbins: number of charge bins
        qrange: tuple of (min, max) charge in PDF
        """
        self.events_in_histogram = 0
        self.hitcount_gpu = ga.zeros(max_pmt_id+1, dtype=np.uint32)
        self.pdf_gpu = ga.zeros(shape=(max_pmt_id+1, tbins, qbins),
                                dtype=np.uint32)
        self.tbins = tbins
        self.trange = trange
        self.qbins = qbins
        self.qrange = qrange

    def clear_pdf(self):
        """Rezero the PDF counters."""
        self.hitcount_gpu.fill(0)
        self.pdf_gpu.fill(0)

    def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64):
        self.gpu_funcs.bin_hits(np.int32(len(self.hitcount_gpu)),
                                gpuchannels.q,
                                gpuchannels.t,
                                self.hitcount_gpu,
                                np.int32(self.tbins),
                                np.float32(self.trange[0]),
                                np.float32(self.trange[1]),
                                np.int32(self.qbins),
                                np.float32(self.qrange[0]),
                                np.float32(self.qrange[1]),
                                self.pdf_gpu,
                                block=(nthreads_per_block,1,1),
                                grid=(len(gpuchannels.t)//nthreads_per_block+1,1))

        self.events_in_histogram += 1

    def get_pdfs(self):
        """Returns the 1D hitcount array and the 3D [channel, time, charge]
        histogram."""
        return self.hitcount_gpu.get(), self.pdf_gpu.get()

    def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth,
                       trange, min_qwidth, qrange, min_bin_content=10,
                       time_only=True):
        """Setup GPU arrays to compute PDF values for the given event.
        The pdf_eval calculation allows the PDF to be evaluated at a
        single point for each channel as the Monte Carlo is run.  The
        effective bin size will be as small as (`min_twidth`,
        `min_qwidth`) around the point of interest, but will be large
        enough to ensure that `min_bin_content` Monte Carlo events
        fall into the bin.

        event_hit: ndarray
            Hit or not-hit status for each channel in the detector.
        event_time: ndarray
            Hit time for each channel in the detector.  If channel
            not hit, the time will be ignored.
        event_charge: ndarray
            Integrated charge for each channel in the detector.
            If channel not hit, the charge will be ignored.

        min_twidth: float
            Minimum bin size in the time dimension
        trange: (float, float)
            Range of time dimension in PDF
        min_qwidth: float
            Minimum bin size in charge dimension
        qrange: (float, float)
            Range of charge dimension in PDF
        min_bin_content: int
            The bin will be expanded to include at least this many events
        time_only: bool
            If True, only the time observable will be used in the PDF.
        """
        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))

        self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content,
                                       dtype=np.float32)
        self.nearest_mc_gpu.fill(1e9)

        self.min_twidth = min_twidth
        self.trange = trange
        self.min_qwidth = min_qwidth
        self.qrange = qrange
        self.min_bin_content = min_bin_content
        self.time_only = time_only

    def clear_pdf_eval(self):
        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
        self.eval_hitcount_gpu.fill(0)
        self.eval_bincount_gpu.fill(0)
        self.nearest_mc_gpu.fill(1e9)
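    # Editor's note: an illustrative helper method (not in the original file)
    # showing how the evaluation entry points below are assumed to fit
    # together after setup_pdf_eval(); `daq` and `generate_photons` are
    # hypothetical, as in the earlier sketches.
    def _example_eval_loop(self, gpu_geometry, rng_states, daq,
                           generate_photons, nevents=100):
        for i in xrange(nevents):
            gpu_photons = GPUPhotons(generate_photons())
            gpu_photons.propagate(gpu_geometry, rng_states)
            gpu_channels = daq.acquire(gpu_photons, rng_states)
            self.accumulate_pdf_eval(gpu_channels)
        return self.get_pdf_eval()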
    def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64):
        "Add the most recent results of run_daq() to the PDF evaluation."
        self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only),
                                           np.int32(len(self.event_hit_gpu)),
                                           self.event_hit_gpu,
                                           self.event_time_gpu,
                                           self.event_charge_gpu,
                                           gpuchannels.t,
                                           gpuchannels.q,
                                           self.eval_hitcount_gpu,
                                           self.eval_bincount_gpu,
                                           np.float32(self.min_twidth),
                                           np.float32(self.trange[0]),
                                           np.float32(self.trange[1]),
                                           np.float32(self.min_qwidth),
                                           np.float32(self.qrange[0]),
                                           np.float32(self.qrange[1]),
                                           self.nearest_mc_gpu,
                                           np.int32(self.min_bin_content),
                                           block=(nthreads_per_block,1,1),
                                           grid=(len(gpuchannels.t)//nthreads_per_block+1,1))

    def get_pdf_eval(self):
        evhit = self.event_hit_gpu.get().astype(bool)
        hitcount = self.eval_hitcount_gpu.get()
        bincount = self.eval_bincount_gpu.get()

        pdf_value = np.zeros(len(hitcount), dtype=float)
        pdf_frac_uncert = np.zeros_like(pdf_value)

        # PDF value for high stats bins
        high_stats = bincount >= self.min_bin_content
        if high_stats.any():
            if self.time_only:
                pdf_value[high_stats] = bincount[high_stats].astype(float) / hitcount[high_stats] / self.min_twidth
            else:
                raise Exception('Unimplemented 2D (time,charge) mode!')

            pdf_frac_uncert[high_stats] = 1.0/np.sqrt(bincount[high_stats])

        # PDF value for low stats bins
        low_stats = ~high_stats & (hitcount > 0) & evhit

        nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content))

        # Deal with the case where we did not even get min_bin_content events
        # in the PDF but also clamp the lower range to ensure we don't index
        # by a negative number in 2 lines
        last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1)
        distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry]

        if low_stats.any():
            if self.time_only:
                # nearest-neighbor density estimate: k events / (hits * window of +/- distance)
                pdf_value[low_stats] = (last_valid_entry[low_stats] + 1).astype(float) / hitcount[low_stats] / distance[low_stats] / 2.0
            else:
                raise Exception('Unimplemented 2D (time,charge) mode!')

            pdf_frac_uncert[low_stats] = 1.0/np.sqrt(last_valid_entry[low_stats] + 1)

        # PDFs with no stats got zero by default during array creation

        print 'high_stats:', high_stats.sum(), 'low_stats', low_stats.sum()

        return hitcount, pdf_value, pdf_value * pdf_frac_uncert

def create_cuda_context(device_id=None):
    """Initialize and return a CUDA context on the specified device.
    If device_id is None, the default device is used."""
    if device_id is None:
        try:
            context = pycuda.tools.make_default_context()
        except cuda.LogicError:
            # initialize cuda
            cuda.init()
            context = pycuda.tools.make_default_context()
    else:
        try:
            device = cuda.Device(device_id)
        except cuda.LogicError:
            # initialize cuda
            cuda.init()
            device = cuda.Device(device_id)
        context = device.make_context()

    context.set_cache_config(cuda.func_cache.PREFER_L1)

    return context
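# Editor's note: a minimal context-handling sketch for the helper above.
# PyCUDA pushes the new context on creation, so the caller is responsible
# for popping it when finished.
def _example_context():
    context = create_cuda_context()  # or create_cuda_context(0) for device 0
    try:
        pass  # build GPUGeometry / GPUPhotons objects and run here
    finally:
        context.pop()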