-rw-r--r--   chroma/gpu.py          | 799 -
-rw-r--r--   chroma/gpu/__init__.py |   5 +
-rw-r--r--   chroma/gpu/daq.py      |  49 +
-rw-r--r--   chroma/gpu/geometry.py | 174 +
-rw-r--r--   chroma/gpu/pdf.py      | 177 +
-rw-r--r--   chroma/gpu/photon.py   | 175 +
-rw-r--r--   chroma/gpu/render.py   |  66 +
-rw-r--r--   chroma/gpu/tools.py    | 180 +
8 files changed, 826 insertions(+), 799 deletions(-)
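Before the diff proper, a quick sketch of the resulting import surface (not part of the commit, inferred from the new chroma/gpu/__init__.py below): the package re-exports tools, geometry, photon, daq and pdf via star imports, so call sites written against the old monolithic chroma/gpu.py keep working, while render is the one submodule that is not re-exported and therefore needs an explicit import.

    # Post-refactor import surface (class and function names taken from the diff below).
    from chroma.gpu import GPUGeometry, GPUPhotons, GPUDaq, GPUPDF   # re-exported by __init__.py
    from chroma.gpu import get_rng_states, create_cuda_context       # come from chroma.gpu.tools
    from chroma.gpu.render import GPURays    # render.py is not star-imported by __init__.py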
diff --git a/chroma/gpu.py b/chroma/gpu.py deleted file mode 100644 index af83c75..0000000 --- a/chroma/gpu.py +++ /dev/null @@ -1,799 +0,0 @@ -import numpy as np -import numpy.ma as ma -from copy import copy -from itertools import izip -import os -import sys -import gc - -import pytools -import pycuda.tools -from pycuda.compiler import SourceModule -from pycuda import characterize -import pycuda.driver as cuda -from pycuda import gpuarray as ga - -import chroma.cuda -from chroma.tools import timeit, profile_if_possible -from chroma.geometry import standard_wavelengths -from chroma import event - -cuda.init() - -# standard nvcc options -cuda_options = ('--use_fast_math',)#, '--ptxas-options=-v'] - -@pycuda.tools.context_dependent_memoize -def get_cu_module(name, options=None, include_source_directory=True): - """Returns a pycuda.compiler.SourceModule object from a CUDA source file - located in the chroma cuda directory at cuda/[name].""" - if options is None: - options = [] - elif isinstance(options, tuple): - options = list(options) - else: - raise TypeError('`options` must be a tuple.') - - from chroma.cuda import srcdir - - if include_source_directory: - options += ['-I' + srcdir] - - with open('%s/%s' % (srcdir, name)) as f: - source = f.read() - - return pycuda.compiler.SourceModule(source, options=options, - no_extern_c=True) - -@pycuda.tools.memoize -def get_cu_source(name): - """Get the source code for a CUDA source file located in the chroma cuda - directory at src/[name].""" - from chroma.cuda import srcdir - with open('%s/%s' % (srcdir, name)) as f: - source = f.read() - return source - -class GPUFuncs(object): - """Simple container class for GPU functions as attributes.""" - def __init__(self, module): - self.module = module - self.funcs = {} - - def __getattr__(self, name): - try: - return self.funcs[name] - except KeyError: - f = self.module.get_function(name) - self.funcs[name] = f - return f - -init_rng_src = """ -#include <curand_kernel.h> - -extern "C" -{ - -__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset) -{ - int id = blockIdx.x*blockDim.x + threadIdx.x; - - if (id >= nthreads) - return; - - curand_init(seed, id, offset, &s[id]); -} - -} // extern "C" -""" - -def get_rng_states(size, seed=1): - "Return `size` number of CUDA random number generator states." - rng_states = cuda.mem_alloc(size*characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) - - module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True) - init_rng = module.get_function('init_rng') - - init_rng(np.int32(size), rng_states, np.uint64(seed), np.uint64(0), block=(64,1,1), grid=(size//64+1,1)) - - return rng_states - -def to_float3(arr): - "Returns an pycuda.gpuarray.vec.float3 array from an (N,3) array." - if not arr.flags['C_CONTIGUOUS']: - arr = np.asarray(arr, order='c') - return arr.astype(np.float32).view(ga.vec.float3)[:,0] - -def to_uint3(arr): - "Returns a pycuda.gpuarray.vec.uint3 array from an (N,3) array." - if not arr.flags['C_CONTIGUOUS']: - arr = np.asarray(arr, order='c') - return arr.astype(np.uint32).view(ga.vec.uint3)[:,0] - -def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024): - """Iterator that yields tuples with the values requried to process - a long array in multiple kernel passes on the GPU. 
- - Each yielded value is of the form, - (first_index, elements_this_iteration, nblocks_this_iteration) - - Example: - >>> list(chunk_iterator(300, 32, 2)) - [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)] - """ - first = 0 - while first < nelements: - elements_left = nelements - first - blocks = int(elements_left // nthreads_per_block) - if elements_left % nthreads_per_block != 0: - blocks += 1 # Round up only if needed - blocks = min(max_blocks, blocks) - elements_this_round = min(elements_left, blocks * nthreads_per_block) - - yield (first, elements_this_round, blocks) - first += elements_this_round - -class GPUPhotons(object): - def __init__(self, photons, ncopies=1): - """Load ``photons`` onto the GPU, replicating as requested. - - Args: - - photons: chroma.Event.Photons - Photon state information to load onto GPU - - ncopies: int, *optional* - Number of times to replicate the photons - on the GPU. This is used if you want - to propagate the same event many times, - for example in a likelihood calculation. - - The amount of GPU storage will be proportionally - larger if ncopies > 1, so be careful. - """ - nphotons = len(photons) - self.pos = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3) - self.dir = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3) - self.pol = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3) - self.wavelengths = ga.empty(shape=nphotons*ncopies, dtype=np.float32) - self.t = ga.empty(shape=nphotons*ncopies, dtype=np.float32) - self.last_hit_triangles = ga.empty(shape=nphotons*ncopies, dtype=np.int32) - self.flags = ga.empty(shape=nphotons*ncopies, dtype=np.uint32) - - # Assign the provided photons to the beginning (possibly - # the entire array if ncopies is 1 - self.pos[:nphotons].set(to_float3(photons.pos)) - self.dir[:nphotons].set(to_float3(photons.dir)) - self.pol[:nphotons].set(to_float3(photons.pol)) - self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32)) - self.t[:nphotons].set(photons.t.astype(np.float32)) - self.last_hit_triangles[:nphotons].set(photons.last_hit_triangles.astype(np.int32)) - self.flags[:nphotons].set(photons.flags.astype(np.uint32)) - - module = get_cu_module('propagate.cu', options=cuda_options) - self.gpu_funcs = GPUFuncs(module) - - # Replicate the photons to the rest of the slots if needed - if ncopies > 1: - max_blocks = 1024 - nthreads_per_block = 64 - for first_photon, photons_this_round, blocks in \ - chunk_iterator(nphotons, nthreads_per_block, max_blocks): - self.gpu_funcs.photon_duplicate(np.int32(first_photon), np.int32(photons_this_round), - self.pos, self.dir, self.wavelengths, self.pol, self.t, - self.flags, self.last_hit_triangles, - np.int32(ncopies-1), - np.int32(nphotons), - block=(nthreads_per_block,1,1), grid=(blocks, 1)) - - - # Save the duplication information for the iterate_copies() method - self.true_nphotons = nphotons - self.ncopies = ncopies - - def get(self): - pos = self.pos.get().view(np.float32).reshape((len(self.pos),3)) - dir = self.dir.get().view(np.float32).reshape((len(self.dir),3)) - pol = self.pol.get().view(np.float32).reshape((len(self.pol),3)) - wavelengths = self.wavelengths.get() - t = self.t.get() - last_hit_triangles = self.last_hit_triangles.get() - flags = self.flags.get() - return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags) - - def iterate_copies(self): - '''Returns an iterator that yields GPUPhotonsSlice objects - corresponding to the event copies stored in ``self``.''' - for i in xrange(self.ncopies): - window 
= slice(self.true_nphotons*i, self.true_nphotons*(i+1)) - yield GPUPhotonsSlice(pos=self.pos[window], - dir=self.dir[window], - pol=self.pol[window], - wavelengths=self.wavelengths[window], - t=self.t[window], - last_hit_triangles=self.last_hit_triangles[window], - flags=self.flags[window]) - - @profile_if_possible - def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64, - max_blocks=1024, max_steps=10): - """Propagate photons on GPU to termination or max_steps, whichever - comes first. - - May be called repeatedly without reloading photon information if - single-stepping through photon history. - - ..warning:: - `rng_states` must have at least `nthreads_per_block`*`max_blocks` - number of curandStates. - """ - nphotons = self.pos.size - step = 0 - input_queue = np.empty(shape=nphotons+1, dtype=np.uint32) - input_queue[0] = 0 - # Order photons initially in the queue to put the clones next to each other - for copy in xrange(self.ncopies): - input_queue[1+copy::self.ncopies] = np.arange(self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons - input_queue_gpu = ga.to_gpu(input_queue) - output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32) - output_queue[0] = 1 - output_queue_gpu = ga.to_gpu(output_queue) - - while step < max_steps: - # Just finish the rest of the steps if the # of photons is low - if nphotons < nthreads_per_block * 16 * 8: - nsteps = max_steps - step - else: - nsteps = 1 - - for first_photon, photons_this_round, blocks in \ - chunk_iterator(nphotons, nthreads_per_block, max_blocks): - self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, np.int32(nsteps), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1)) - - step += nsteps - - if step < max_steps: - temp = input_queue_gpu - input_queue_gpu = output_queue_gpu - output_queue_gpu = temp - output_queue_gpu[:1].set(np.uint32(1)) - nphotons = input_queue_gpu[:1].get()[0] - 1 - - if ga.max(self.flags).get() & (1 << 31): - print >>sys.stderr, "WARNING: ABORTED PHOTONS" - - def __del__(self): - del self.pos - del self.dir - del self.pol - del self.wavelengths - del self.t - del self.flags - del self.last_hit_triangles - # Free up GPU memory quickly if now available - gc.collect() - -class GPUPhotonsSlice(GPUPhotons): - '''A `slice`-like view of a subrange of another GPU photons array. - Works exactly like an instance of GPUPhotons, but the GPU storage - is taken from another GPUPhotons instance. - - Returned by the GPUPhotons.iterate_copies() iterator.''' - def __init__(self, pos, dir, pol, wavelengths, t, last_hit_triangles, - flags): - '''Create new object using slices of GPUArrays from an instance - of GPUPhotons. NOTE THESE ARE NOT CPU ARRAYS!''' - self.pos = pos - self.dir = dir - self.pol = pol - self.wavelengths = wavelengths - self.t = t - self.last_hit_triangles = last_hit_triangles - self.flags = flags - - module = get_cu_module('propagate.cu', options=cuda_options) - self.gpu_funcs = GPUFuncs(module) - - self.true_nphotons = len(pos) - self.ncopies = 1 - - def __del__(self): - pass # Do nothing, because we don't own any of our GPU memory - -class GPUChannels(object): - def __init__(self, t, q, flags): - self.t = t - self.q = q - self.flags = flags - - def get(self): - t = self.t.get() - q = self.q.get().astype(np.float32) - - # For now, assume all channels with small - # enough hit time were hit. 
- return event.Channels(t<1e8, t, q, self.flags.get()) - -class GPURays(object): - """The GPURays class holds arrays of ray positions and directions - on the GPU that are used to render a geometry.""" - def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64): - self.pos = ga.to_gpu(to_float3(pos)) - self.dir = ga.to_gpu(to_float3(dir)) - - self.max_alpha_depth = max_alpha_depth - - self.nblocks = nblocks - - transform_module = get_cu_module('transform.cu', options=cuda_options) - self.transform_funcs = GPUFuncs(transform_module) - - render_module = get_cu_module('render.cu', options=cuda_options) - self.render_funcs = GPUFuncs(render_module) - - self.dx = ga.empty(max_alpha_depth*self.pos.size, dtype=np.float32) - self.color = ga.empty(self.dx.size, dtype=ga.vec.float4) - self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32) - - def rotate(self, phi, n): - "Rotate by an angle phi around the axis `n`." - self.transform_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) - self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) - - def rotate_around_point(self, phi, n, point): - """"Rotate by an angle phi around the axis `n` passing through - the point `point`.""" - self.transform_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) - self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) - - def translate(self, v): - "Translate the ray positions by the vector `v`." - self.transform_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) - - def render(self, gpu_geometry, pixels, alpha_depth=10, - keep_last_render=False): - """Render `gpu_geometry` and fill the GPU array `pixels` with pixel - colors.""" - if not keep_last_render: - self.dxlen.fill(0) - - if alpha_depth > self.max_alpha_depth: - raise Exception('alpha_depth > max_alpha_depth') - - if not isinstance(pixels, ga.GPUArray): - raise TypeError('`pixels` must be a %s instance.' % ga.GPUArray) - - if pixels.size != self.pos.size: - raise ValueError('`pixels`.size != number of rays') - - self.render_funcs.render(np.int32(self.pos.size), self.pos, self.dir, gpu_geometry.gpudata, np.uint32(alpha_depth), pixels, self.dx, self.dxlen, self.color, block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) - - def snapshot(self, gpu_geometry, alpha_depth=10): - "Render `gpu_geometry` and return a numpy array of pixel colors." - pixels = ga.empty(self.pos.size, dtype=np.uint32) - self.render(gpu_geometry, pixels, alpha_depth) - return pixels.get() - -def make_gpu_struct(size, members): - struct = cuda.mem_alloc(size) - - i = 0 - for member in members: - if isinstance(member, ga.GPUArray): - member = member.gpudata - - if isinstance(member, cuda.DeviceAllocation): - if i % 8: - raise Exception('cannot align 64-bit pointer. 
' - 'arrange struct member variables in order of ' - 'decreasing size.') - - cuda.memcpy_htod(int(struct)+i, np.intp(int(member))) - i += 8 - elif np.isscalar(member): - cuda.memcpy_htod(int(struct)+i, member) - i += member.nbytes - else: - raise TypeError('expected a GPU device pointer or scalar type.') - - return struct - -def format_size(size): - if size < 1e3: - return '%.1f%s' % (size, ' ') - elif size < 1e6: - return '%.1f%s' % (size/1e3, 'K') - elif size < 1e9: - return '%.1f%s' % (size/1e6, 'M') - else: - return '%.1f%s' % (size/1e9, 'G') - -def format_array(name, array): - return '%-15s %6s %6s' % \ - (name, format_size(len(array)), format_size(array.nbytes)) - -class GPUGeometry(object): - def __init__(self, geometry, wavelengths=None, print_usage=False): - if wavelengths is None: - wavelengths = standard_wavelengths - - try: - wavelength_step = np.unique(np.diff(wavelengths)).item() - except ValueError: - raise ValueError('wavelengths must be equally spaced apart.') - - geometry_source = get_cu_source('geometry.h') - material_struct_size = characterize.sizeof('Material', geometry_source) - surface_struct_size = characterize.sizeof('Surface', geometry_source) - geometry_struct_size = characterize.sizeof('Geometry', geometry_source) - - self.material_data = [] - self.material_ptrs = [] - - def interp_material_property(wavelengths, property): - # note that it is essential that the material properties be - # interpolated linearly. this fact is used in the propagation - # code to guarantee that probabilities still sum to one. - return np.interp(wavelengths, property[:,0], property[:,1]).astype(np.float32) - - for i in range(len(geometry.unique_materials)): - material = geometry.unique_materials[i] - - if material is None: - raise Exception('one or more triangles is missing a material.') - - refractive_index = interp_material_property(wavelengths, material.refractive_index) - refractive_index_gpu = ga.to_gpu(refractive_index) - absorption_length = interp_material_property(wavelengths, material.absorption_length) - absorption_length_gpu = ga.to_gpu(absorption_length) - scattering_length = interp_material_property(wavelengths, material.scattering_length) - scattering_length_gpu = ga.to_gpu(scattering_length) - - self.material_data.append(refractive_index_gpu) - self.material_data.append(absorption_length_gpu) - self.material_data.append(scattering_length_gpu) - - material_gpu = \ - make_gpu_struct(material_struct_size, - [refractive_index_gpu, absorption_length_gpu, - scattering_length_gpu, - np.uint32(len(wavelengths)), - np.float32(wavelength_step), - np.float32(wavelengths[0])]) - - self.material_ptrs.append(material_gpu) - - self.material_pointer_array = \ - make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs) - - self.surface_data = [] - self.surface_ptrs = [] - - for i in range(len(geometry.unique_surfaces)): - surface = geometry.unique_surfaces[i] - - if surface is None: - # need something to copy to the surface array struct - # that is the same size as a 64-bit pointer. - # this pointer will never be used by the simulation. 
- self.surface_ptrs.append(np.uint64(0)) - continue - - detect = interp_material_property(wavelengths, surface.detect) - detect_gpu = ga.to_gpu(detect) - absorb = interp_material_property(wavelengths, surface.absorb) - absorb_gpu = ga.to_gpu(absorb) - reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse) - reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse) - reflect_specular = interp_material_property(wavelengths, surface.reflect_specular) - reflect_specular_gpu = ga.to_gpu(reflect_specular) - - self.surface_data.append(detect_gpu) - self.surface_data.append(absorb_gpu) - self.surface_data.append(reflect_diffuse_gpu) - self.surface_data.append(reflect_specular_gpu) - - surface_gpu = \ - make_gpu_struct(surface_struct_size, - [detect_gpu, absorb_gpu, - reflect_diffuse_gpu, - reflect_specular_gpu, - np.uint32(len(wavelengths)), - np.float32(wavelength_step), - np.float32(wavelengths[0])]) - - self.surface_ptrs.append(surface_gpu) - - self.surface_pointer_array = \ - make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs) - - self.vertices = ga.to_gpu(to_float3(geometry.mesh.vertices)) - self.triangles = ga.to_gpu(to_uint3(geometry.mesh.triangles)) - - material_codes = (((geometry.material1_index & 0xff) << 24) | - ((geometry.material2_index & 0xff) << 16) | - ((geometry.surface_index & 0xff) << 8)).astype(np.uint32) - - self.material_codes = ga.to_gpu(material_codes) - - self.lower_bounds = ga.to_gpu(to_float3(geometry.lower_bounds)) - self.upper_bounds = ga.to_gpu(to_float3(geometry.upper_bounds)) - self.colors = ga.to_gpu(geometry.colors.astype(np.uint32)) - self.node_map = ga.to_gpu(geometry.node_map.astype(np.uint32)) - self.node_map_end = ga.to_gpu(geometry.node_map_end.astype(np.uint32)) - self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32)) - - self.gpudata = make_gpu_struct(geometry_struct_size, - [self.vertices, self.triangles, - self.material_codes, - self.colors, self.lower_bounds, - self.upper_bounds, self.node_map, - self.node_map_end, - self.material_pointer_array, - self.surface_pointer_array, - np.uint32(geometry.start_node), - np.uint32(geometry.first_node)]) - - self.geometry = geometry - - if print_usage: - self.print_device_usage() - - def print_device_usage(self): - print 'device usage:' - print '-'*10 - print format_array('vertices', self.vertices) - print format_array('triangles', self.triangles) - print format_array('lower_bounds', self.lower_bounds) - print format_array('upper_bounds', self.upper_bounds) - print format_array('node_map', self.node_map) - print format_array('node_map_end', self.node_map_end) - print '%-15s %6s %6s' % ('total', '', format_size(self.vertices.nbytes + self.triangles.nbytes + self.lower_bounds.nbytes + self.upper_bounds.nbytes + self.node_map.nbytes + self.node_map_end.nbytes)) - print '-'*10 - free, total = cuda.mem_get_info() - print '%-15s %6s %6s' % ('device total', '', format_size(total)) - print '%-15s %6s %6s' % ('device used', '', format_size(total-free)) - print '%-15s %6s %6s' % ('device free', '', format_size(free)) - print - - def reset_colors(self): - self.colors.set_async(self.geometry.colors.astype(np.uint32)) - - def color_solids(self, solid_hit, colors, nblocks_per_thread=64, - max_blocks=1024): - solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool)) - solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32)) - - module = get_cu_module('mesh.h', options=cuda_options) - color_solids = module.get_function('color_solids') - - for first_triangle, triangles_this_round, 
blocks in \ - chunk_iterator(self.triangles.size, nblocks_per_thread, - max_blocks): - color_solids(np.int32(first_triangle), - np.int32(triangles_this_round), self.solid_id_map, - solid_hit_gpu, solid_colors_gpu, self.gpudata, - block=(nblocks_per_thread,1,1), - grid=(blocks,1)) - -class GPUDaq(object): - def __init__(self, gpu_geometry, max_pmt_id, pmt_rms=1.2e-9): - self.earliest_time_gpu = ga.empty(max_pmt_id+1, dtype=np.float32) - self.earliest_time_int_gpu = ga.empty(max_pmt_id+1, dtype=np.uint32) - self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu) - self.channel_q_gpu = ga.zeros_like(self.earliest_time_int_gpu) - self.daq_pmt_rms = pmt_rms - self.solid_id_map_gpu = gpu_geometry.solid_id_map - - self.module = get_cu_module('daq.cu', options=cuda_options, - include_source_directory=False) - self.gpu_funcs = GPUFuncs(self.module) - - def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024): - self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) - self.channel_q_gpu.fill(0) - self.channel_history_gpu.fill(0) - - n = len(gpuphotons.pos) - - for first_photon, photons_this_round, blocks in \ - chunk_iterator(n, nthreads_per_block, max_blocks): - self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), np.float32(self.daq_pmt_rms), np.int32(first_photon), np.int32(photons_this_round), gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, self.solid_id_map_gpu, np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.channel_q_gpu, self.channel_history_gpu, block=(nthreads_per_block,1,1), grid=(blocks,1)) - - self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) - - return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu) - -class GPUPDF(object): - def __init__(self): - self.module = get_cu_module('daq.cu', options=cuda_options, - include_source_directory=False) - self.gpu_funcs = GPUFuncs(self.module) - - def setup_pdf(self, max_pmt_id, tbins, trange, qbins, qrange): - """Setup GPU arrays to hold PDF information. 
- - max_pmt_id: int, largest PMT id # - tbins: number of time bins - trange: tuple of (min, max) time in PDF - qbins: number of charge bins - qrange: tuple of (min, max) charge in PDF - """ - self.events_in_histogram = 0 - self.hitcount_gpu = ga.zeros(max_pmt_id+1, dtype=np.uint32) - self.pdf_gpu = ga.zeros(shape=(max_pmt_id+1, tbins, qbins), - dtype=np.uint32) - self.tbins = tbins - self.trange = trange - self.qbins = qbins - self.qrange = qrange - - def clear_pdf(self): - """Rezero the PDF counters.""" - self.hitcount_gpu.fill(0) - self.pdf_gpu.fill(0) - - def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64): - self.gpu_funcs.bin_hits(np.int32(len(self.hitcount_gpu)), - gpuchannels.q, - gpuchannels.t, - self.hitcount_gpu, - np.int32(self.tbins), - np.float32(self.trange[0]), - np.float32(self.trange[1]), - np.int32(self.qbins), - np.float32(self.qrange[0]), - np.float32(self.qrange[1]), - self.pdf_gpu, - block=(nthreads_per_block,1,1), - grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) - - - self.events_in_histogram += 1 - - def get_pdfs(self): - """Returns the 1D hitcount array and the 3D [channel, time, charge] - histogram.""" - return self.hitcount_gpu.get(), self.pdf_gpu.get() - - def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth, - trange, min_qwidth, qrange, min_bin_content=10, - time_only=True): - """Setup GPU arrays to compute PDF values for the given event. - The pdf_eval calculation allows the PDF to be evaluated at a - single point for each channel as the Monte Carlo is run. The - effective bin size will be as small as (`min_twidth`, - `min_qwidth`) around the point of interest, but will be large - enough to ensure that `min_bin_content` Monte Carlo events - fall into the bin. - - event_hit: ndarray - Hit or not-hit status for each channel in the detector. - event_time: ndarray - Hit time for each channel in the detector. If channel - not hit, the time will be ignored. - event_charge: ndarray - Integrated charge for each channel in the detector. - If channel not hit, the charge will be ignored. - - min_twidth: float - Minimum bin size in the time dimension - trange: (float, float) - Range of time dimension in PDF - min_qwidth: float - Minimum bin size in charge dimension - qrange: (float, float) - Range of charge dimension in PDF - min_bin_content: int - The bin will be expanded to include at least this many events - time_only: bool - If True, only the time observable will be used in the PDF. - """ - self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32)) - self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32)) - self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32)) - - self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) - self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) - self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content, - dtype=np.float32) - self.nearest_mc_gpu.fill(1e9) - - self.min_twidth = min_twidth - self.trange = trange - self.min_qwidth = min_qwidth - self.qrange = qrange - self.min_bin_content = min_bin_content - self.time_only = time_only - - def clear_pdf_eval(self): - "Reset PDF evaluation counters to start accumulating new Monte Carlo." - self.eval_hitcount_gpu.fill(0) - self.eval_bincount_gpu.fill(0) - self.nearest_mc_gpu.fill(1e9) - - def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64): - "Add the most recent results of run_daq() to the PDF evaluation." 
- self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only), - np.int32(len(self.event_hit_gpu)), - self.event_hit_gpu, - self.event_time_gpu, - self.event_charge_gpu, - gpuchannels.t, - gpuchannels.q, - self.eval_hitcount_gpu, - self.eval_bincount_gpu, - np.float32(self.min_twidth), - np.float32(self.trange[0]), - np.float32(self.trange[1]), - np.float32(self.min_qwidth), - np.float32(self.qrange[0]), - np.float32(self.qrange[1]), - self.nearest_mc_gpu, - np.int32(self.min_bin_content), - block=(nthreads_per_block,1,1), - grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) - - def get_pdf_eval(self): - evhit = self.event_hit_gpu.get().astype(bool) - hitcount = self.eval_hitcount_gpu.get() - bincount = self.eval_bincount_gpu.get() - - pdf_value = np.zeros(len(hitcount), dtype=float) - pdf_frac_uncert = np.zeros_like(pdf_value) - - # PDF value for high stats bins - high_stats = bincount >= self.min_bin_content - if high_stats.any(): - if self.time_only: - pdf_value[high_stats] = bincount[high_stats].astype(float) / hitcount[high_stats] / self.min_twidth - else: - assert Exception('Unimplemented 2D (time,charge) mode!') - - pdf_frac_uncert[high_stats] = 1.0/np.sqrt(bincount[high_stats]) - - # PDF value for low stats bins - low_stats = ~high_stats & (hitcount > 0) & evhit - - nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content)) - - # Deal with the case where we did not even get min_bin_content events - # in the PDF but also clamp the lower range to ensure we don't index - # by a negative number in 2 lines - last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1) - distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry] - - if low_stats.any(): - if self.time_only: - pdf_value[low_stats] = (last_valid_entry[low_stats] + 1).astype(float) / hitcount[low_stats] / distance[low_stats] / 2.0 - else: - assert Exception('Unimplemented 2D (time,charge) mode!') - - pdf_frac_uncert[low_stats] = 1.0/np.sqrt(last_valid_entry[low_stats] + 1) - - # PDFs with no stats got zero by default during array creation - - print 'high_stats:', high_stats.sum(), 'low_stats', low_stats.sum() - - return hitcount, pdf_value, pdf_value * pdf_frac_uncert - -def create_cuda_context(device_id=None): - """Initialize and return a CUDA context on the specified device. 
- If device_id is None, the default device is used.""" - if device_id is None: - try: - context = pycuda.tools.make_default_context() - except cuda.LogicError: - # initialize cuda - cuda.init() - context = pycuda.tools.make_default_context() - else: - try: - device = cuda.Device(device_id) - except cuda.LogicError: - # initialize cuda - cuda.init() - device = cuda.Device(device_id) - context = device.make_context() - - context.set_cache_config(cuda.func_cache.PREFER_L1) - - return context diff --git a/chroma/gpu/__init__.py b/chroma/gpu/__init__.py new file mode 100644 index 0000000..8a87fc7 --- /dev/null +++ b/chroma/gpu/__init__.py @@ -0,0 +1,5 @@ +from chroma.gpu.tools import * +from chroma.gpu.geometry import * +from chroma.gpu.photon import * +from chroma.gpu.daq import * +from chroma.gpu.pdf import * diff --git a/chroma/gpu/daq.py b/chroma/gpu/daq.py new file mode 100644 index 0000000..82958dc --- /dev/null +++ b/chroma/gpu/daq.py @@ -0,0 +1,49 @@ +import numpy as np +from pycuda import gpuarray as ga + +from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, \ + chunk_iterator +from chroma import event + +class GPUChannels(object): + def __init__(self, t, q, flags): + self.t = t + self.q = q + self.flags = flags + + def get(self): + t = self.t.get() + q = self.q.get().astype(np.float32) + + # For now, assume all channels with small + # enough hit time were hit. + return event.Channels(t<1e8, t, q, self.flags.get()) + +class GPUDaq(object): + def __init__(self, gpu_geometry, max_pmt_id, pmt_rms=1.2e-9): + self.earliest_time_gpu = ga.empty(max_pmt_id+1, dtype=np.float32) + self.earliest_time_int_gpu = ga.empty(max_pmt_id+1, dtype=np.uint32) + self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu) + self.channel_q_gpu = ga.zeros_like(self.earliest_time_int_gpu) + self.daq_pmt_rms = pmt_rms + self.solid_id_map_gpu = gpu_geometry.solid_id_map + + self.module = get_cu_module('daq.cu', options=cuda_options, + include_source_directory=False) + self.gpu_funcs = GPUFuncs(self.module) + + def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024): + self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) + self.channel_q_gpu.fill(0) + self.channel_history_gpu.fill(0) + + n = len(gpuphotons.pos) + + for first_photon, photons_this_round, blocks in \ + chunk_iterator(n, nthreads_per_block, max_blocks): + self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), np.float32(self.daq_pmt_rms), np.int32(first_photon), np.int32(photons_this_round), gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, self.solid_id_map_gpu, np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.channel_q_gpu, self.channel_history_gpu, block=(nthreads_per_block,1,1), grid=(blocks,1)) + + self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) + + return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu) + diff --git a/chroma/gpu/geometry.py b/chroma/gpu/geometry.py new file mode 100644 index 0000000..a223006 --- /dev/null +++ b/chroma/gpu/geometry.py @@ -0,0 +1,174 @@ +import numpy as np +import pycuda.driver as cuda +from pycuda import gpuarray as ga +from pycuda import 
characterize + +from chroma.geometry import standard_wavelengths +from chroma.gpu.tools import get_cu_module, get_cu_source, cuda_options, \ + chunk_iterator, format_array, format_size, to_uint3, to_float3, \ + make_gpu_struct + +class GPUGeometry(object): + def __init__(self, geometry, wavelengths=None, print_usage=False): + if wavelengths is None: + wavelengths = standard_wavelengths + + try: + wavelength_step = np.unique(np.diff(wavelengths)).item() + except ValueError: + raise ValueError('wavelengths must be equally spaced apart.') + + geometry_source = get_cu_source('geometry.h') + material_struct_size = characterize.sizeof('Material', geometry_source) + surface_struct_size = characterize.sizeof('Surface', geometry_source) + geometry_struct_size = characterize.sizeof('Geometry', geometry_source) + + self.material_data = [] + self.material_ptrs = [] + + def interp_material_property(wavelengths, property): + # note that it is essential that the material properties be + # interpolated linearly. this fact is used in the propagation + # code to guarantee that probabilities still sum to one. + return np.interp(wavelengths, property[:,0], property[:,1]).astype(np.float32) + + for i in range(len(geometry.unique_materials)): + material = geometry.unique_materials[i] + + if material is None: + raise Exception('one or more triangles is missing a material.') + + refractive_index = interp_material_property(wavelengths, material.refractive_index) + refractive_index_gpu = ga.to_gpu(refractive_index) + absorption_length = interp_material_property(wavelengths, material.absorption_length) + absorption_length_gpu = ga.to_gpu(absorption_length) + scattering_length = interp_material_property(wavelengths, material.scattering_length) + scattering_length_gpu = ga.to_gpu(scattering_length) + + self.material_data.append(refractive_index_gpu) + self.material_data.append(absorption_length_gpu) + self.material_data.append(scattering_length_gpu) + + material_gpu = \ + make_gpu_struct(material_struct_size, + [refractive_index_gpu, absorption_length_gpu, + scattering_length_gpu, + np.uint32(len(wavelengths)), + np.float32(wavelength_step), + np.float32(wavelengths[0])]) + + self.material_ptrs.append(material_gpu) + + self.material_pointer_array = \ + make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs) + + self.surface_data = [] + self.surface_ptrs = [] + + for i in range(len(geometry.unique_surfaces)): + surface = geometry.unique_surfaces[i] + + if surface is None: + # need something to copy to the surface array struct + # that is the same size as a 64-bit pointer. + # this pointer will never be used by the simulation. 
+ self.surface_ptrs.append(np.uint64(0)) + continue + + detect = interp_material_property(wavelengths, surface.detect) + detect_gpu = ga.to_gpu(detect) + absorb = interp_material_property(wavelengths, surface.absorb) + absorb_gpu = ga.to_gpu(absorb) + reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse) + reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse) + reflect_specular = interp_material_property(wavelengths, surface.reflect_specular) + reflect_specular_gpu = ga.to_gpu(reflect_specular) + + self.surface_data.append(detect_gpu) + self.surface_data.append(absorb_gpu) + self.surface_data.append(reflect_diffuse_gpu) + self.surface_data.append(reflect_specular_gpu) + + surface_gpu = \ + make_gpu_struct(surface_struct_size, + [detect_gpu, absorb_gpu, + reflect_diffuse_gpu, + reflect_specular_gpu, + np.uint32(len(wavelengths)), + np.float32(wavelength_step), + np.float32(wavelengths[0])]) + + self.surface_ptrs.append(surface_gpu) + + self.surface_pointer_array = \ + make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs) + + self.vertices = ga.to_gpu(to_float3(geometry.mesh.vertices)) + self.triangles = ga.to_gpu(to_uint3(geometry.mesh.triangles)) + + material_codes = (((geometry.material1_index & 0xff) << 24) | + ((geometry.material2_index & 0xff) << 16) | + ((geometry.surface_index & 0xff) << 8)).astype(np.uint32) + + self.material_codes = ga.to_gpu(material_codes) + + self.lower_bounds = ga.to_gpu(to_float3(geometry.lower_bounds)) + self.upper_bounds = ga.to_gpu(to_float3(geometry.upper_bounds)) + self.colors = ga.to_gpu(geometry.colors.astype(np.uint32)) + self.node_map = ga.to_gpu(geometry.node_map.astype(np.uint32)) + self.node_map_end = ga.to_gpu(geometry.node_map_end.astype(np.uint32)) + self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32)) + + self.gpudata = make_gpu_struct(geometry_struct_size, + [self.vertices, self.triangles, + self.material_codes, + self.colors, self.lower_bounds, + self.upper_bounds, self.node_map, + self.node_map_end, + self.material_pointer_array, + self.surface_pointer_array, + np.uint32(geometry.start_node), + np.uint32(geometry.first_node)]) + + self.geometry = geometry + + if print_usage: + self.print_device_usage() + + def print_device_usage(self): + print 'device usage:' + print '-'*10 + print format_array('vertices', self.vertices) + print format_array('triangles', self.triangles) + print format_array('lower_bounds', self.lower_bounds) + print format_array('upper_bounds', self.upper_bounds) + print format_array('node_map', self.node_map) + print format_array('node_map_end', self.node_map_end) + print '%-15s %6s %6s' % ('total', '', format_size(self.vertices.nbytes + self.triangles.nbytes + self.lower_bounds.nbytes + self.upper_bounds.nbytes + self.node_map.nbytes + self.node_map_end.nbytes)) + print '-'*10 + free, total = cuda.mem_get_info() + print '%-15s %6s %6s' % ('device total', '', format_size(total)) + print '%-15s %6s %6s' % ('device used', '', format_size(total-free)) + print '%-15s %6s %6s' % ('device free', '', format_size(free)) + print + + def reset_colors(self): + self.colors.set_async(self.geometry.colors.astype(np.uint32)) + + def color_solids(self, solid_hit, colors, nblocks_per_thread=64, + max_blocks=1024): + solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool)) + solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32)) + + module = get_cu_module('mesh.h', options=cuda_options) + color_solids = module.get_function('color_solids') + + for first_triangle, triangles_this_round, 
blocks in \ + chunk_iterator(self.triangles.size, nblocks_per_thread, + max_blocks): + color_solids(np.int32(first_triangle), + np.int32(triangles_this_round), self.solid_id_map, + solid_hit_gpu, solid_colors_gpu, self.gpudata, + block=(nblocks_per_thread,1,1), + grid=(blocks,1)) + diff --git a/chroma/gpu/pdf.py b/chroma/gpu/pdf.py new file mode 100644 index 0000000..053d5f3 --- /dev/null +++ b/chroma/gpu/pdf.py @@ -0,0 +1,177 @@ +import numpy as np +from pycuda import gpuarray as ga + +from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs + +class GPUPDF(object): + def __init__(self): + self.module = get_cu_module('daq.cu', options=cuda_options, + include_source_directory=False) + self.gpu_funcs = GPUFuncs(self.module) + + def setup_pdf(self, max_pmt_id, tbins, trange, qbins, qrange): + """Setup GPU arrays to hold PDF information. + + max_pmt_id: int, largest PMT id # + tbins: number of time bins + trange: tuple of (min, max) time in PDF + qbins: number of charge bins + qrange: tuple of (min, max) charge in PDF + """ + self.events_in_histogram = 0 + self.hitcount_gpu = ga.zeros(max_pmt_id+1, dtype=np.uint32) + self.pdf_gpu = ga.zeros(shape=(max_pmt_id+1, tbins, qbins), + dtype=np.uint32) + self.tbins = tbins + self.trange = trange + self.qbins = qbins + self.qrange = qrange + + def clear_pdf(self): + """Rezero the PDF counters.""" + self.hitcount_gpu.fill(0) + self.pdf_gpu.fill(0) + + def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64): + self.gpu_funcs.bin_hits(np.int32(len(self.hitcount_gpu)), + gpuchannels.q, + gpuchannels.t, + self.hitcount_gpu, + np.int32(self.tbins), + np.float32(self.trange[0]), + np.float32(self.trange[1]), + np.int32(self.qbins), + np.float32(self.qrange[0]), + np.float32(self.qrange[1]), + self.pdf_gpu, + block=(nthreads_per_block,1,1), + grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) + + + self.events_in_histogram += 1 + + def get_pdfs(self): + """Returns the 1D hitcount array and the 3D [channel, time, charge] + histogram.""" + return self.hitcount_gpu.get(), self.pdf_gpu.get() + + def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth, + trange, min_qwidth, qrange, min_bin_content=10, + time_only=True): + """Setup GPU arrays to compute PDF values for the given event. + The pdf_eval calculation allows the PDF to be evaluated at a + single point for each channel as the Monte Carlo is run. The + effective bin size will be as small as (`min_twidth`, + `min_qwidth`) around the point of interest, but will be large + enough to ensure that `min_bin_content` Monte Carlo events + fall into the bin. + + event_hit: ndarray + Hit or not-hit status for each channel in the detector. + event_time: ndarray + Hit time for each channel in the detector. If channel + not hit, the time will be ignored. + event_charge: ndarray + Integrated charge for each channel in the detector. + If channel not hit, the charge will be ignored. + + min_twidth: float + Minimum bin size in the time dimension + trange: (float, float) + Range of time dimension in PDF + min_qwidth: float + Minimum bin size in charge dimension + qrange: (float, float) + Range of charge dimension in PDF + min_bin_content: int + The bin will be expanded to include at least this many events + time_only: bool + If True, only the time observable will be used in the PDF. 
+ """ + self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32)) + self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32)) + self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32)) + + self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) + self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) + self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content, + dtype=np.float32) + self.nearest_mc_gpu.fill(1e9) + + self.min_twidth = min_twidth + self.trange = trange + self.min_qwidth = min_qwidth + self.qrange = qrange + self.min_bin_content = min_bin_content + self.time_only = time_only + + def clear_pdf_eval(self): + "Reset PDF evaluation counters to start accumulating new Monte Carlo." + self.eval_hitcount_gpu.fill(0) + self.eval_bincount_gpu.fill(0) + self.nearest_mc_gpu.fill(1e9) + + def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64): + "Add the most recent results of run_daq() to the PDF evaluation." + self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only), + np.int32(len(self.event_hit_gpu)), + self.event_hit_gpu, + self.event_time_gpu, + self.event_charge_gpu, + gpuchannels.t, + gpuchannels.q, + self.eval_hitcount_gpu, + self.eval_bincount_gpu, + np.float32(self.min_twidth), + np.float32(self.trange[0]), + np.float32(self.trange[1]), + np.float32(self.min_qwidth), + np.float32(self.qrange[0]), + np.float32(self.qrange[1]), + self.nearest_mc_gpu, + np.int32(self.min_bin_content), + block=(nthreads_per_block,1,1), + grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) + + def get_pdf_eval(self): + evhit = self.event_hit_gpu.get().astype(bool) + hitcount = self.eval_hitcount_gpu.get() + bincount = self.eval_bincount_gpu.get() + + pdf_value = np.zeros(len(hitcount), dtype=float) + pdf_frac_uncert = np.zeros_like(pdf_value) + + # PDF value for high stats bins + high_stats = bincount >= self.min_bin_content + if high_stats.any(): + if self.time_only: + pdf_value[high_stats] = bincount[high_stats].astype(float) / hitcount[high_stats] / self.min_twidth + else: + assert Exception('Unimplemented 2D (time,charge) mode!') + + pdf_frac_uncert[high_stats] = 1.0/np.sqrt(bincount[high_stats]) + + # PDF value for low stats bins + low_stats = ~high_stats & (hitcount > 0) & evhit + + nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content)) + + # Deal with the case where we did not even get min_bin_content events + # in the PDF but also clamp the lower range to ensure we don't index + # by a negative number in 2 lines + last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1) + distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry] + + if low_stats.any(): + if self.time_only: + pdf_value[low_stats] = (last_valid_entry[low_stats] + 1).astype(float) / hitcount[low_stats] / distance[low_stats] / 2.0 + else: + assert Exception('Unimplemented 2D (time,charge) mode!') + + pdf_frac_uncert[low_stats] = 1.0/np.sqrt(last_valid_entry[low_stats] + 1) + + # PDFs with no stats got zero by default during array creation + + print 'high_stats:', high_stats.sum(), 'low_stats', low_stats.sum() + + return hitcount, pdf_value, pdf_value * pdf_frac_uncert diff --git a/chroma/gpu/photon.py b/chroma/gpu/photon.py new file mode 100644 index 0000000..2a9a1ff --- /dev/null +++ b/chroma/gpu/photon.py @@ -0,0 +1,175 @@ +import numpy as np +import sys +import gc +from pycuda import gpuarray as ga + +from chroma.tools import profile_if_possible +from chroma import event +from chroma.gpu.tools 
import get_cu_module, cuda_options, GPUFuncs, \ + chunk_iterator, to_float3 + + +class GPUPhotons(object): + def __init__(self, photons, ncopies=1): + """Load ``photons`` onto the GPU, replicating as requested. + + Args: + - photons: chroma.Event.Photons + Photon state information to load onto GPU + - ncopies: int, *optional* + Number of times to replicate the photons + on the GPU. This is used if you want + to propagate the same event many times, + for example in a likelihood calculation. + + The amount of GPU storage will be proportionally + larger if ncopies > 1, so be careful. + """ + nphotons = len(photons) + self.pos = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3) + self.dir = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3) + self.pol = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3) + self.wavelengths = ga.empty(shape=nphotons*ncopies, dtype=np.float32) + self.t = ga.empty(shape=nphotons*ncopies, dtype=np.float32) + self.last_hit_triangles = ga.empty(shape=nphotons*ncopies, dtype=np.int32) + self.flags = ga.empty(shape=nphotons*ncopies, dtype=np.uint32) + + # Assign the provided photons to the beginning (possibly + # the entire array if ncopies is 1 + self.pos[:nphotons].set(to_float3(photons.pos)) + self.dir[:nphotons].set(to_float3(photons.dir)) + self.pol[:nphotons].set(to_float3(photons.pol)) + self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32)) + self.t[:nphotons].set(photons.t.astype(np.float32)) + self.last_hit_triangles[:nphotons].set(photons.last_hit_triangles.astype(np.int32)) + self.flags[:nphotons].set(photons.flags.astype(np.uint32)) + + module = get_cu_module('propagate.cu', options=cuda_options) + self.gpu_funcs = GPUFuncs(module) + + # Replicate the photons to the rest of the slots if needed + if ncopies > 1: + max_blocks = 1024 + nthreads_per_block = 64 + for first_photon, photons_this_round, blocks in \ + chunk_iterator(nphotons, nthreads_per_block, max_blocks): + self.gpu_funcs.photon_duplicate(np.int32(first_photon), np.int32(photons_this_round), + self.pos, self.dir, self.wavelengths, self.pol, self.t, + self.flags, self.last_hit_triangles, + np.int32(ncopies-1), + np.int32(nphotons), + block=(nthreads_per_block,1,1), grid=(blocks, 1)) + + + # Save the duplication information for the iterate_copies() method + self.true_nphotons = nphotons + self.ncopies = ncopies + + def get(self): + pos = self.pos.get().view(np.float32).reshape((len(self.pos),3)) + dir = self.dir.get().view(np.float32).reshape((len(self.dir),3)) + pol = self.pol.get().view(np.float32).reshape((len(self.pol),3)) + wavelengths = self.wavelengths.get() + t = self.t.get() + last_hit_triangles = self.last_hit_triangles.get() + flags = self.flags.get() + return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags) + + def iterate_copies(self): + '''Returns an iterator that yields GPUPhotonsSlice objects + corresponding to the event copies stored in ``self``.''' + for i in xrange(self.ncopies): + window = slice(self.true_nphotons*i, self.true_nphotons*(i+1)) + yield GPUPhotonsSlice(pos=self.pos[window], + dir=self.dir[window], + pol=self.pol[window], + wavelengths=self.wavelengths[window], + t=self.t[window], + last_hit_triangles=self.last_hit_triangles[window], + flags=self.flags[window]) + + @profile_if_possible + def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64, + max_blocks=1024, max_steps=10): + """Propagate photons on GPU to termination or max_steps, whichever + comes first. 
+ + May be called repeatedly without reloading photon information if + single-stepping through photon history. + + ..warning:: + `rng_states` must have at least `nthreads_per_block`*`max_blocks` + number of curandStates. + """ + nphotons = self.pos.size + step = 0 + input_queue = np.empty(shape=nphotons+1, dtype=np.uint32) + input_queue[0] = 0 + # Order photons initially in the queue to put the clones next to each other + for copy in xrange(self.ncopies): + input_queue[1+copy::self.ncopies] = np.arange(self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons + input_queue_gpu = ga.to_gpu(input_queue) + output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32) + output_queue[0] = 1 + output_queue_gpu = ga.to_gpu(output_queue) + + while step < max_steps: + # Just finish the rest of the steps if the # of photons is low + if nphotons < nthreads_per_block * 16 * 8: + nsteps = max_steps - step + else: + nsteps = 1 + + for first_photon, photons_this_round, blocks in \ + chunk_iterator(nphotons, nthreads_per_block, max_blocks): + self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, np.int32(nsteps), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1)) + + step += nsteps + + if step < max_steps: + temp = input_queue_gpu + input_queue_gpu = output_queue_gpu + output_queue_gpu = temp + output_queue_gpu[:1].set(np.uint32(1)) + nphotons = input_queue_gpu[:1].get()[0] - 1 + + if ga.max(self.flags).get() & (1 << 31): + print >>sys.stderr, "WARNING: ABORTED PHOTONS" + + def __del__(self): + del self.pos + del self.dir + del self.pol + del self.wavelengths + del self.t + del self.flags + del self.last_hit_triangles + # Free up GPU memory quickly if now available + gc.collect() + +class GPUPhotonsSlice(GPUPhotons): + '''A `slice`-like view of a subrange of another GPU photons array. + Works exactly like an instance of GPUPhotons, but the GPU storage + is taken from another GPUPhotons instance. + + Returned by the GPUPhotons.iterate_copies() iterator.''' + def __init__(self, pos, dir, pol, wavelengths, t, last_hit_triangles, + flags): + '''Create new object using slices of GPUArrays from an instance + of GPUPhotons. 
NOTE THESE ARE NOT CPU ARRAYS!''' + self.pos = pos + self.dir = dir + self.pol = pol + self.wavelengths = wavelengths + self.t = t + self.last_hit_triangles = last_hit_triangles + self.flags = flags + + module = get_cu_module('propagate.cu', options=cuda_options) + self.gpu_funcs = GPUFuncs(module) + + self.true_nphotons = len(pos) + self.ncopies = 1 + + def __del__(self): + pass # Do nothing, because we don't own any of our GPU memory diff --git a/chroma/gpu/render.py b/chroma/gpu/render.py new file mode 100644 index 0000000..e645da4 --- /dev/null +++ b/chroma/gpu/render.py @@ -0,0 +1,66 @@ +import numpy as np +from pycuda import gpuarray as ga + +from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, \ + to_float3 + +class GPURays(object): + """The GPURays class holds arrays of ray positions and directions + on the GPU that are used to render a geometry.""" + def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64): + self.pos = ga.to_gpu(to_float3(pos)) + self.dir = ga.to_gpu(to_float3(dir)) + + self.max_alpha_depth = max_alpha_depth + + self.nblocks = nblocks + + transform_module = get_cu_module('transform.cu', options=cuda_options) + self.transform_funcs = GPUFuncs(transform_module) + + render_module = get_cu_module('render.cu', options=cuda_options) + self.render_funcs = GPUFuncs(render_module) + + self.dx = ga.empty(max_alpha_depth*self.pos.size, dtype=np.float32) + self.color = ga.empty(self.dx.size, dtype=ga.vec.float4) + self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32) + + def rotate(self, phi, n): + "Rotate by an angle phi around the axis `n`." + self.transform_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) + + def rotate_around_point(self, phi, n, point): + """"Rotate by an angle phi around the axis `n` passing through + the point `point`.""" + self.transform_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) + + def translate(self, v): + "Translate the ray positions by the vector `v`." + self.transform_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + + def render(self, gpu_geometry, pixels, alpha_depth=10, + keep_last_render=False): + """Render `gpu_geometry` and fill the GPU array `pixels` with pixel + colors.""" + if not keep_last_render: + self.dxlen.fill(0) + + if alpha_depth > self.max_alpha_depth: + raise Exception('alpha_depth > max_alpha_depth') + + if not isinstance(pixels, ga.GPUArray): + raise TypeError('`pixels` must be a %s instance.' 
% ga.GPUArray) + + if pixels.size != self.pos.size: + raise ValueError('`pixels`.size != number of rays') + + self.render_funcs.render(np.int32(self.pos.size), self.pos, self.dir, gpu_geometry.gpudata, np.uint32(alpha_depth), pixels, self.dx, self.dxlen, self.color, block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + + def snapshot(self, gpu_geometry, alpha_depth=10): + "Render `gpu_geometry` and return a numpy array of pixel colors." + pixels = ga.empty(self.pos.size, dtype=np.uint32) + self.render(gpu_geometry, pixels, alpha_depth) + return pixels.get() + diff --git a/chroma/gpu/tools.py b/chroma/gpu/tools.py new file mode 100644 index 0000000..a11a561 --- /dev/null +++ b/chroma/gpu/tools.py @@ -0,0 +1,180 @@ +import numpy as np +import pycuda.tools +from pycuda import characterize +import pycuda.driver as cuda +from pycuda import gpuarray as ga + +from chroma.cuda import srcdir + +cuda.init() + +# standard nvcc options +cuda_options = ('--use_fast_math',)#, '--ptxas-options=-v'] + +@pycuda.tools.context_dependent_memoize +def get_cu_module(name, options=None, include_source_directory=True): + """Returns a pycuda.compiler.SourceModule object from a CUDA source file + located in the chroma cuda directory at cuda/[name].""" + if options is None: + options = [] + elif isinstance(options, tuple): + options = list(options) + else: + raise TypeError('`options` must be a tuple.') + + if include_source_directory: + options += ['-I' + srcdir] + + with open('%s/%s' % (srcdir, name)) as f: + source = f.read() + + return pycuda.compiler.SourceModule(source, options=options, + no_extern_c=True) + +@pycuda.tools.memoize +def get_cu_source(name): + """Get the source code for a CUDA source file located in the chroma cuda + directory at src/[name].""" + with open('%s/%s' % (srcdir, name)) as f: + source = f.read() + return source + +class GPUFuncs(object): + """Simple container class for GPU functions as attributes.""" + def __init__(self, module): + self.module = module + self.funcs = {} + + def __getattr__(self, name): + try: + return self.funcs[name] + except KeyError: + f = self.module.get_function(name) + self.funcs[name] = f + return f + +init_rng_src = """ +#include <curand_kernel.h> + +extern "C" +{ + +__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x; + + if (id >= nthreads) + return; + + curand_init(seed, id, offset, &s[id]); +} + +} // extern "C" +""" + +def get_rng_states(size, seed=1): + "Return `size` number of CUDA random number generator states." + rng_states = cuda.mem_alloc(size*characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) + + module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True) + init_rng = module.get_function('init_rng') + + init_rng(np.int32(size), rng_states, np.uint64(seed), np.uint64(0), block=(64,1,1), grid=(size//64+1,1)) + + return rng_states + +def to_float3(arr): + "Returns an pycuda.gpuarray.vec.float3 array from an (N,3) array." + if not arr.flags['C_CONTIGUOUS']: + arr = np.asarray(arr, order='c') + return arr.astype(np.float32).view(ga.vec.float3)[:,0] + +def to_uint3(arr): + "Returns a pycuda.gpuarray.vec.uint3 array from an (N,3) array." 
+ if not arr.flags['C_CONTIGUOUS']: + arr = np.asarray(arr, order='c') + return arr.astype(np.uint32).view(ga.vec.uint3)[:,0] + +def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024): + """Iterator that yields tuples with the values requried to process + a long array in multiple kernel passes on the GPU. + + Each yielded value is of the form, + (first_index, elements_this_iteration, nblocks_this_iteration) + + Example: + >>> list(chunk_iterator(300, 32, 2)) + [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)] + """ + first = 0 + while first < nelements: + elements_left = nelements - first + blocks = int(elements_left // nthreads_per_block) + if elements_left % nthreads_per_block != 0: + blocks += 1 # Round up only if needed + blocks = min(max_blocks, blocks) + elements_this_round = min(elements_left, blocks * nthreads_per_block) + + yield (first, elements_this_round, blocks) + first += elements_this_round + +def create_cuda_context(device_id=None): + """Initialize and return a CUDA context on the specified device. + If device_id is None, the default device is used.""" + if device_id is None: + try: + context = pycuda.tools.make_default_context() + except cuda.LogicError: + # initialize cuda + cuda.init() + context = pycuda.tools.make_default_context() + else: + try: + device = cuda.Device(device_id) + except cuda.LogicError: + # initialize cuda + cuda.init() + device = cuda.Device(device_id) + context = device.make_context() + + context.set_cache_config(cuda.func_cache.PREFER_L1) + + return context + +def make_gpu_struct(size, members): + struct = cuda.mem_alloc(size) + + i = 0 + for member in members: + if isinstance(member, ga.GPUArray): + member = member.gpudata + + if isinstance(member, cuda.DeviceAllocation): + if i % 8: + raise Exception('cannot align 64-bit pointer. ' + 'arrange struct member variables in order of ' + 'decreasing size.') + + cuda.memcpy_htod(int(struct)+i, np.intp(int(member))) + i += 8 + elif np.isscalar(member): + cuda.memcpy_htod(int(struct)+i, member) + i += member.nbytes + else: + raise TypeError('expected a GPU device pointer or scalar type.') + + return struct + +def format_size(size): + if size < 1e3: + return '%.1f%s' % (size, ' ') + elif size < 1e6: + return '%.1f%s' % (size/1e3, 'K') + elif size < 1e9: + return '%.1f%s' % (size/1e6, 'M') + else: + return '%.1f%s' % (size/1e9, 'G') + +def format_array(name, array): + return '%-15s %6s %6s' % \ + (name, format_size(len(array)), format_size(array.nbytes)) |
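Taken together, the split preserves the original workflow: tools.py owns context/RNG setup and the GPU-struct helpers, geometry.py and photon.py hold the device-side state, and daq.py/pdf.py consume the propagated photons. A minimal end-to-end sketch follows, assuming a CUDA-capable device plus an already-built chroma Geometry (geometry), a chroma.event.Photons instance (photons), and a detector channel count (max_pmt_id); those three names are placeholders for illustration, not defined in this commit.

    from chroma import gpu

    context = gpu.create_cuda_context()          # chroma/gpu/tools.py
    rng_states = gpu.get_rng_states(64*1024)     # >= nthreads_per_block*max_blocks curand states

    gpu_geometry = gpu.GPUGeometry(geometry)     # chroma/gpu/geometry.py (geometry is a placeholder)
    gpu_photons = gpu.GPUPhotons(photons)        # chroma/gpu/photon.py (photons is a placeholder)
    gpu_photons.propagate(gpu_geometry, rng_states,
                          nthreads_per_block=64, max_blocks=1024, max_steps=10)

    daq = gpu.GPUDaq(gpu_geometry, max_pmt_id)   # chroma/gpu/daq.py (max_pmt_id is a placeholder)
    channels = daq.acquire(gpu_photons, rng_states).get()   # -> chroma.event.Channels

    context.pop()                                # release the pycuda context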