author    Stan Seibert <stan@mtrr.org>   2011-09-17 17:52:45 -0400
committer Stan Seibert <stan@mtrr.org>   2011-09-17 17:52:45 -0400
commit    db5a892ee57ff53f085477fdd77e648099353e88 (patch)
tree      94a8cb03dcf26aa23da66716fe5eb4e4936a50cc
parent    d37723d2e58c3baa0c973d03a79c4df06d32d6b7 (diff)
download  chroma-db5a892ee57ff53f085477fdd77e648099353e88.tar.gz
          chroma-db5a892ee57ff53f085477fdd77e648099353e88.tar.bz2
          chroma-db5a892ee57ff53f085477fdd77e648099353e88.zip
Split up chroma.gpu into separate modules. chroma.gpu.__init__ imports everything from the submodules, so usage is the same.
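To illustrate the claim that usage is unchanged, a minimal sketch (assuming a CUDA device is available; the star imports in the new chroma/gpu/__init__.py shown below re-export the submodule contents under the old names):

    from chroma import gpu

    # These names used to live in the monolithic chroma/gpu.py; after this
    # commit they come from chroma.gpu.tools but resolve exactly as before.
    context = gpu.create_cuda_context()
    module = gpu.get_cu_module('propagate.cu', options=gpu.cuda_options)
    funcs = gpu.GPUFuncs(module)
    context.pop()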
-rw-r--r--   chroma/gpu.py            799
-rw-r--r--   chroma/gpu/__init__.py     5
-rw-r--r--   chroma/gpu/daq.py         49
-rw-r--r--   chroma/gpu/geometry.py   174
-rw-r--r--   chroma/gpu/pdf.py        177
-rw-r--r--   chroma/gpu/photon.py     175
-rw-r--r--   chroma/gpu/render.py      66
-rw-r--r--   chroma/gpu/tools.py      180
8 files changed, 826 insertions(+), 799 deletions(-)
diff --git a/chroma/gpu.py b/chroma/gpu.py
deleted file mode 100644
index af83c75..0000000
--- a/chroma/gpu.py
+++ /dev/null
@@ -1,799 +0,0 @@
-import numpy as np
-import numpy.ma as ma
-from copy import copy
-from itertools import izip
-import os
-import sys
-import gc
-
-import pytools
-import pycuda.tools
-from pycuda.compiler import SourceModule
-from pycuda import characterize
-import pycuda.driver as cuda
-from pycuda import gpuarray as ga
-
-import chroma.cuda
-from chroma.tools import timeit, profile_if_possible
-from chroma.geometry import standard_wavelengths
-from chroma import event
-
-cuda.init()
-
-# standard nvcc options
-cuda_options = ('--use_fast_math',)#, '--ptxas-options=-v']
-
-@pycuda.tools.context_dependent_memoize
-def get_cu_module(name, options=None, include_source_directory=True):
- """Returns a pycuda.compiler.SourceModule object from a CUDA source file
- located in the chroma cuda directory at cuda/[name]."""
- if options is None:
- options = []
- elif isinstance(options, tuple):
- options = list(options)
- else:
- raise TypeError('`options` must be a tuple.')
-
- from chroma.cuda import srcdir
-
- if include_source_directory:
- options += ['-I' + srcdir]
-
- with open('%s/%s' % (srcdir, name)) as f:
- source = f.read()
-
- return pycuda.compiler.SourceModule(source, options=options,
- no_extern_c=True)
-
-@pycuda.tools.memoize
-def get_cu_source(name):
- """Get the source code for a CUDA source file located in the chroma cuda
- directory at src/[name]."""
- from chroma.cuda import srcdir
- with open('%s/%s' % (srcdir, name)) as f:
- source = f.read()
- return source
-
-class GPUFuncs(object):
- """Simple container class for GPU functions as attributes."""
- def __init__(self, module):
- self.module = module
- self.funcs = {}
-
- def __getattr__(self, name):
- try:
- return self.funcs[name]
- except KeyError:
- f = self.module.get_function(name)
- self.funcs[name] = f
- return f
-
-init_rng_src = """
-#include <curand_kernel.h>
-
-extern "C"
-{
-
-__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
-{
- int id = blockIdx.x*blockDim.x + threadIdx.x;
-
- if (id >= nthreads)
- return;
-
- curand_init(seed, id, offset, &s[id]);
-}
-
-} // extern "C"
-"""
-
-def get_rng_states(size, seed=1):
- "Return `size` number of CUDA random number generator states."
- rng_states = cuda.mem_alloc(size*characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
-
- module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True)
- init_rng = module.get_function('init_rng')
-
- init_rng(np.int32(size), rng_states, np.uint64(seed), np.uint64(0), block=(64,1,1), grid=(size//64+1,1))
-
- return rng_states
-
-def to_float3(arr):
- "Returns an pycuda.gpuarray.vec.float3 array from an (N,3) array."
- if not arr.flags['C_CONTIGUOUS']:
- arr = np.asarray(arr, order='c')
- return arr.astype(np.float32).view(ga.vec.float3)[:,0]
-
-def to_uint3(arr):
- "Returns a pycuda.gpuarray.vec.uint3 array from an (N,3) array."
- if not arr.flags['C_CONTIGUOUS']:
- arr = np.asarray(arr, order='c')
- return arr.astype(np.uint32).view(ga.vec.uint3)[:,0]
-
-def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024):
- """Iterator that yields tuples with the values requried to process
- a long array in multiple kernel passes on the GPU.
-
- Each yielded value is of the form,
- (first_index, elements_this_iteration, nblocks_this_iteration)
-
- Example:
- >>> list(chunk_iterator(300, 32, 2))
- [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)]
- """
- first = 0
- while first < nelements:
- elements_left = nelements - first
- blocks = int(elements_left // nthreads_per_block)
- if elements_left % nthreads_per_block != 0:
- blocks += 1 # Round up only if needed
- blocks = min(max_blocks, blocks)
- elements_this_round = min(elements_left, blocks * nthreads_per_block)
-
- yield (first, elements_this_round, blocks)
- first += elements_this_round
-
-class GPUPhotons(object):
- def __init__(self, photons, ncopies=1):
- """Load ``photons`` onto the GPU, replicating as requested.
-
- Args:
- - photons: chroma.Event.Photons
- Photon state information to load onto GPU
- - ncopies: int, *optional*
- Number of times to replicate the photons
- on the GPU. This is used if you want
- to propagate the same event many times,
- for example in a likelihood calculation.
-
- The amount of GPU storage will be proportionally
- larger if ncopies > 1, so be careful.
- """
- nphotons = len(photons)
- self.pos = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
- self.dir = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
- self.pol = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
- self.wavelengths = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
- self.t = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
- self.last_hit_triangles = ga.empty(shape=nphotons*ncopies, dtype=np.int32)
- self.flags = ga.empty(shape=nphotons*ncopies, dtype=np.uint32)
-
- # Assign the provided photons to the beginning (possibly
- # the entire array if ncopies is 1
- self.pos[:nphotons].set(to_float3(photons.pos))
- self.dir[:nphotons].set(to_float3(photons.dir))
- self.pol[:nphotons].set(to_float3(photons.pol))
- self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
- self.t[:nphotons].set(photons.t.astype(np.float32))
- self.last_hit_triangles[:nphotons].set(photons.last_hit_triangles.astype(np.int32))
- self.flags[:nphotons].set(photons.flags.astype(np.uint32))
-
- module = get_cu_module('propagate.cu', options=cuda_options)
- self.gpu_funcs = GPUFuncs(module)
-
- # Replicate the photons to the rest of the slots if needed
- if ncopies > 1:
- max_blocks = 1024
- nthreads_per_block = 64
- for first_photon, photons_this_round, blocks in \
- chunk_iterator(nphotons, nthreads_per_block, max_blocks):
- self.gpu_funcs.photon_duplicate(np.int32(first_photon), np.int32(photons_this_round),
- self.pos, self.dir, self.wavelengths, self.pol, self.t,
- self.flags, self.last_hit_triangles,
- np.int32(ncopies-1),
- np.int32(nphotons),
- block=(nthreads_per_block,1,1), grid=(blocks, 1))
-
-
- # Save the duplication information for the iterate_copies() method
- self.true_nphotons = nphotons
- self.ncopies = ncopies
-
- def get(self):
- pos = self.pos.get().view(np.float32).reshape((len(self.pos),3))
- dir = self.dir.get().view(np.float32).reshape((len(self.dir),3))
- pol = self.pol.get().view(np.float32).reshape((len(self.pol),3))
- wavelengths = self.wavelengths.get()
- t = self.t.get()
- last_hit_triangles = self.last_hit_triangles.get()
- flags = self.flags.get()
- return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags)
-
- def iterate_copies(self):
- '''Returns an iterator that yields GPUPhotonsSlice objects
- corresponding to the event copies stored in ``self``.'''
- for i in xrange(self.ncopies):
- window = slice(self.true_nphotons*i, self.true_nphotons*(i+1))
- yield GPUPhotonsSlice(pos=self.pos[window],
- dir=self.dir[window],
- pol=self.pol[window],
- wavelengths=self.wavelengths[window],
- t=self.t[window],
- last_hit_triangles=self.last_hit_triangles[window],
- flags=self.flags[window])
-
- @profile_if_possible
- def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
- max_blocks=1024, max_steps=10):
- """Propagate photons on GPU to termination or max_steps, whichever
- comes first.
-
- May be called repeatedly without reloading photon information if
- single-stepping through photon history.
-
- ..warning::
- `rng_states` must have at least `nthreads_per_block`*`max_blocks`
- number of curandStates.
- """
- nphotons = self.pos.size
- step = 0
- input_queue = np.empty(shape=nphotons+1, dtype=np.uint32)
- input_queue[0] = 0
- # Order photons initially in the queue to put the clones next to each other
- for copy in xrange(self.ncopies):
- input_queue[1+copy::self.ncopies] = np.arange(self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons
- input_queue_gpu = ga.to_gpu(input_queue)
- output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
- output_queue[0] = 1
- output_queue_gpu = ga.to_gpu(output_queue)
-
- while step < max_steps:
- # Just finish the rest of the steps if the # of photons is low
- if nphotons < nthreads_per_block * 16 * 8:
- nsteps = max_steps - step
- else:
- nsteps = 1
-
- for first_photon, photons_this_round, blocks in \
- chunk_iterator(nphotons, nthreads_per_block, max_blocks):
- self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, np.int32(nsteps), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1))
-
- step += nsteps
-
- if step < max_steps:
- temp = input_queue_gpu
- input_queue_gpu = output_queue_gpu
- output_queue_gpu = temp
- output_queue_gpu[:1].set(np.uint32(1))
- nphotons = input_queue_gpu[:1].get()[0] - 1
-
- if ga.max(self.flags).get() & (1 << 31):
- print >>sys.stderr, "WARNING: ABORTED PHOTONS"
-
- def __del__(self):
- del self.pos
- del self.dir
- del self.pol
- del self.wavelengths
- del self.t
- del self.flags
- del self.last_hit_triangles
- # Free up GPU memory quickly if now available
- gc.collect()
-
-class GPUPhotonsSlice(GPUPhotons):
- '''A `slice`-like view of a subrange of another GPU photons array.
- Works exactly like an instance of GPUPhotons, but the GPU storage
- is taken from another GPUPhotons instance.
-
- Returned by the GPUPhotons.iterate_copies() iterator.'''
- def __init__(self, pos, dir, pol, wavelengths, t, last_hit_triangles,
- flags):
- '''Create new object using slices of GPUArrays from an instance
- of GPUPhotons. NOTE THESE ARE NOT CPU ARRAYS!'''
- self.pos = pos
- self.dir = dir
- self.pol = pol
- self.wavelengths = wavelengths
- self.t = t
- self.last_hit_triangles = last_hit_triangles
- self.flags = flags
-
- module = get_cu_module('propagate.cu', options=cuda_options)
- self.gpu_funcs = GPUFuncs(module)
-
- self.true_nphotons = len(pos)
- self.ncopies = 1
-
- def __del__(self):
- pass # Do nothing, because we don't own any of our GPU memory
-
-class GPUChannels(object):
- def __init__(self, t, q, flags):
- self.t = t
- self.q = q
- self.flags = flags
-
- def get(self):
- t = self.t.get()
- q = self.q.get().astype(np.float32)
-
- # For now, assume all channels with small
- # enough hit time were hit.
- return event.Channels(t<1e8, t, q, self.flags.get())
-
-class GPURays(object):
- """The GPURays class holds arrays of ray positions and directions
- on the GPU that are used to render a geometry."""
- def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
- self.pos = ga.to_gpu(to_float3(pos))
- self.dir = ga.to_gpu(to_float3(dir))
-
- self.max_alpha_depth = max_alpha_depth
-
- self.nblocks = nblocks
-
- transform_module = get_cu_module('transform.cu', options=cuda_options)
- self.transform_funcs = GPUFuncs(transform_module)
-
- render_module = get_cu_module('render.cu', options=cuda_options)
- self.render_funcs = GPUFuncs(render_module)
-
- self.dx = ga.empty(max_alpha_depth*self.pos.size, dtype=np.float32)
- self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
- self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32)
-
- def rotate(self, phi, n):
- "Rotate by an angle phi around the axis `n`."
- self.transform_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
- self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
-
- def rotate_around_point(self, phi, n, point):
- """"Rotate by an angle phi around the axis `n` passing through
- the point `point`."""
- self.transform_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
- self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
-
- def translate(self, v):
- "Translate the ray positions by the vector `v`."
- self.transform_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
-
- def render(self, gpu_geometry, pixels, alpha_depth=10,
- keep_last_render=False):
- """Render `gpu_geometry` and fill the GPU array `pixels` with pixel
- colors."""
- if not keep_last_render:
- self.dxlen.fill(0)
-
- if alpha_depth > self.max_alpha_depth:
- raise Exception('alpha_depth > max_alpha_depth')
-
- if not isinstance(pixels, ga.GPUArray):
- raise TypeError('`pixels` must be a %s instance.' % ga.GPUArray)
-
- if pixels.size != self.pos.size:
- raise ValueError('`pixels`.size != number of rays')
-
- self.render_funcs.render(np.int32(self.pos.size), self.pos, self.dir, gpu_geometry.gpudata, np.uint32(alpha_depth), pixels, self.dx, self.dxlen, self.color, block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
-
- def snapshot(self, gpu_geometry, alpha_depth=10):
- "Render `gpu_geometry` and return a numpy array of pixel colors."
- pixels = ga.empty(self.pos.size, dtype=np.uint32)
- self.render(gpu_geometry, pixels, alpha_depth)
- return pixels.get()
-
-def make_gpu_struct(size, members):
- struct = cuda.mem_alloc(size)
-
- i = 0
- for member in members:
- if isinstance(member, ga.GPUArray):
- member = member.gpudata
-
- if isinstance(member, cuda.DeviceAllocation):
- if i % 8:
- raise Exception('cannot align 64-bit pointer. '
- 'arrange struct member variables in order of '
- 'decreasing size.')
-
- cuda.memcpy_htod(int(struct)+i, np.intp(int(member)))
- i += 8
- elif np.isscalar(member):
- cuda.memcpy_htod(int(struct)+i, member)
- i += member.nbytes
- else:
- raise TypeError('expected a GPU device pointer or scalar type.')
-
- return struct
-
-def format_size(size):
- if size < 1e3:
- return '%.1f%s' % (size, ' ')
- elif size < 1e6:
- return '%.1f%s' % (size/1e3, 'K')
- elif size < 1e9:
- return '%.1f%s' % (size/1e6, 'M')
- else:
- return '%.1f%s' % (size/1e9, 'G')
-
-def format_array(name, array):
- return '%-15s %6s %6s' % \
- (name, format_size(len(array)), format_size(array.nbytes))
-
-class GPUGeometry(object):
- def __init__(self, geometry, wavelengths=None, print_usage=False):
- if wavelengths is None:
- wavelengths = standard_wavelengths
-
- try:
- wavelength_step = np.unique(np.diff(wavelengths)).item()
- except ValueError:
- raise ValueError('wavelengths must be equally spaced apart.')
-
- geometry_source = get_cu_source('geometry.h')
- material_struct_size = characterize.sizeof('Material', geometry_source)
- surface_struct_size = characterize.sizeof('Surface', geometry_source)
- geometry_struct_size = characterize.sizeof('Geometry', geometry_source)
-
- self.material_data = []
- self.material_ptrs = []
-
- def interp_material_property(wavelengths, property):
- # note that it is essential that the material properties be
- # interpolated linearly. this fact is used in the propagation
- # code to guarantee that probabilities still sum to one.
- return np.interp(wavelengths, property[:,0], property[:,1]).astype(np.float32)
-
- for i in range(len(geometry.unique_materials)):
- material = geometry.unique_materials[i]
-
- if material is None:
- raise Exception('one or more triangles is missing a material.')
-
- refractive_index = interp_material_property(wavelengths, material.refractive_index)
- refractive_index_gpu = ga.to_gpu(refractive_index)
- absorption_length = interp_material_property(wavelengths, material.absorption_length)
- absorption_length_gpu = ga.to_gpu(absorption_length)
- scattering_length = interp_material_property(wavelengths, material.scattering_length)
- scattering_length_gpu = ga.to_gpu(scattering_length)
-
- self.material_data.append(refractive_index_gpu)
- self.material_data.append(absorption_length_gpu)
- self.material_data.append(scattering_length_gpu)
-
- material_gpu = \
- make_gpu_struct(material_struct_size,
- [refractive_index_gpu, absorption_length_gpu,
- scattering_length_gpu,
- np.uint32(len(wavelengths)),
- np.float32(wavelength_step),
- np.float32(wavelengths[0])])
-
- self.material_ptrs.append(material_gpu)
-
- self.material_pointer_array = \
- make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs)
-
- self.surface_data = []
- self.surface_ptrs = []
-
- for i in range(len(geometry.unique_surfaces)):
- surface = geometry.unique_surfaces[i]
-
- if surface is None:
- # need something to copy to the surface array struct
- # that is the same size as a 64-bit pointer.
- # this pointer will never be used by the simulation.
- self.surface_ptrs.append(np.uint64(0))
- continue
-
- detect = interp_material_property(wavelengths, surface.detect)
- detect_gpu = ga.to_gpu(detect)
- absorb = interp_material_property(wavelengths, surface.absorb)
- absorb_gpu = ga.to_gpu(absorb)
- reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse)
- reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse)
- reflect_specular = interp_material_property(wavelengths, surface.reflect_specular)
- reflect_specular_gpu = ga.to_gpu(reflect_specular)
-
- self.surface_data.append(detect_gpu)
- self.surface_data.append(absorb_gpu)
- self.surface_data.append(reflect_diffuse_gpu)
- self.surface_data.append(reflect_specular_gpu)
-
- surface_gpu = \
- make_gpu_struct(surface_struct_size,
- [detect_gpu, absorb_gpu,
- reflect_diffuse_gpu,
- reflect_specular_gpu,
- np.uint32(len(wavelengths)),
- np.float32(wavelength_step),
- np.float32(wavelengths[0])])
-
- self.surface_ptrs.append(surface_gpu)
-
- self.surface_pointer_array = \
- make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)
-
- self.vertices = ga.to_gpu(to_float3(geometry.mesh.vertices))
- self.triangles = ga.to_gpu(to_uint3(geometry.mesh.triangles))
-
- material_codes = (((geometry.material1_index & 0xff) << 24) |
- ((geometry.material2_index & 0xff) << 16) |
- ((geometry.surface_index & 0xff) << 8)).astype(np.uint32)
-
- self.material_codes = ga.to_gpu(material_codes)
-
- self.lower_bounds = ga.to_gpu(to_float3(geometry.lower_bounds))
- self.upper_bounds = ga.to_gpu(to_float3(geometry.upper_bounds))
- self.colors = ga.to_gpu(geometry.colors.astype(np.uint32))
- self.node_map = ga.to_gpu(geometry.node_map.astype(np.uint32))
- self.node_map_end = ga.to_gpu(geometry.node_map_end.astype(np.uint32))
- self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))
-
- self.gpudata = make_gpu_struct(geometry_struct_size,
- [self.vertices, self.triangles,
- self.material_codes,
- self.colors, self.lower_bounds,
- self.upper_bounds, self.node_map,
- self.node_map_end,
- self.material_pointer_array,
- self.surface_pointer_array,
- np.uint32(geometry.start_node),
- np.uint32(geometry.first_node)])
-
- self.geometry = geometry
-
- if print_usage:
- self.print_device_usage()
-
- def print_device_usage(self):
- print 'device usage:'
- print '-'*10
- print format_array('vertices', self.vertices)
- print format_array('triangles', self.triangles)
- print format_array('lower_bounds', self.lower_bounds)
- print format_array('upper_bounds', self.upper_bounds)
- print format_array('node_map', self.node_map)
- print format_array('node_map_end', self.node_map_end)
- print '%-15s %6s %6s' % ('total', '', format_size(self.vertices.nbytes + self.triangles.nbytes + self.lower_bounds.nbytes + self.upper_bounds.nbytes + self.node_map.nbytes + self.node_map_end.nbytes))
- print '-'*10
- free, total = cuda.mem_get_info()
- print '%-15s %6s %6s' % ('device total', '', format_size(total))
- print '%-15s %6s %6s' % ('device used', '', format_size(total-free))
- print '%-15s %6s %6s' % ('device free', '', format_size(free))
- print
-
- def reset_colors(self):
- self.colors.set_async(self.geometry.colors.astype(np.uint32))
-
- def color_solids(self, solid_hit, colors, nblocks_per_thread=64,
- max_blocks=1024):
- solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool))
- solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32))
-
- module = get_cu_module('mesh.h', options=cuda_options)
- color_solids = module.get_function('color_solids')
-
- for first_triangle, triangles_this_round, blocks in \
- chunk_iterator(self.triangles.size, nblocks_per_thread,
- max_blocks):
- color_solids(np.int32(first_triangle),
- np.int32(triangles_this_round), self.solid_id_map,
- solid_hit_gpu, solid_colors_gpu, self.gpudata,
- block=(nblocks_per_thread,1,1),
- grid=(blocks,1))
-
-class GPUDaq(object):
- def __init__(self, gpu_geometry, max_pmt_id, pmt_rms=1.2e-9):
- self.earliest_time_gpu = ga.empty(max_pmt_id+1, dtype=np.float32)
- self.earliest_time_int_gpu = ga.empty(max_pmt_id+1, dtype=np.uint32)
- self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
- self.channel_q_gpu = ga.zeros_like(self.earliest_time_int_gpu)
- self.daq_pmt_rms = pmt_rms
- self.solid_id_map_gpu = gpu_geometry.solid_id_map
-
- self.module = get_cu_module('daq.cu', options=cuda_options,
- include_source_directory=False)
- self.gpu_funcs = GPUFuncs(self.module)
-
- def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024):
- self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
- self.channel_q_gpu.fill(0)
- self.channel_history_gpu.fill(0)
-
- n = len(gpuphotons.pos)
-
- for first_photon, photons_this_round, blocks in \
- chunk_iterator(n, nthreads_per_block, max_blocks):
- self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), np.float32(self.daq_pmt_rms), np.int32(first_photon), np.int32(photons_this_round), gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, self.solid_id_map_gpu, np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.channel_q_gpu, self.channel_history_gpu, block=(nthreads_per_block,1,1), grid=(blocks,1))
-
- self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
-
- return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu)
-
-class GPUPDF(object):
- def __init__(self):
- self.module = get_cu_module('daq.cu', options=cuda_options,
- include_source_directory=False)
- self.gpu_funcs = GPUFuncs(self.module)
-
- def setup_pdf(self, max_pmt_id, tbins, trange, qbins, qrange):
- """Setup GPU arrays to hold PDF information.
-
- max_pmt_id: int, largest PMT id #
- tbins: number of time bins
- trange: tuple of (min, max) time in PDF
- qbins: number of charge bins
- qrange: tuple of (min, max) charge in PDF
- """
- self.events_in_histogram = 0
- self.hitcount_gpu = ga.zeros(max_pmt_id+1, dtype=np.uint32)
- self.pdf_gpu = ga.zeros(shape=(max_pmt_id+1, tbins, qbins),
- dtype=np.uint32)
- self.tbins = tbins
- self.trange = trange
- self.qbins = qbins
- self.qrange = qrange
-
- def clear_pdf(self):
- """Rezero the PDF counters."""
- self.hitcount_gpu.fill(0)
- self.pdf_gpu.fill(0)
-
- def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64):
- self.gpu_funcs.bin_hits(np.int32(len(self.hitcount_gpu)),
- gpuchannels.q,
- gpuchannels.t,
- self.hitcount_gpu,
- np.int32(self.tbins),
- np.float32(self.trange[0]),
- np.float32(self.trange[1]),
- np.int32(self.qbins),
- np.float32(self.qrange[0]),
- np.float32(self.qrange[1]),
- self.pdf_gpu,
- block=(nthreads_per_block,1,1),
- grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
-
-
- self.events_in_histogram += 1
-
- def get_pdfs(self):
- """Returns the 1D hitcount array and the 3D [channel, time, charge]
- histogram."""
- return self.hitcount_gpu.get(), self.pdf_gpu.get()
-
- def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth,
- trange, min_qwidth, qrange, min_bin_content=10,
- time_only=True):
- """Setup GPU arrays to compute PDF values for the given event.
- The pdf_eval calculation allows the PDF to be evaluated at a
- single point for each channel as the Monte Carlo is run. The
- effective bin size will be as small as (`min_twidth`,
- `min_qwidth`) around the point of interest, but will be large
- enough to ensure that `min_bin_content` Monte Carlo events
- fall into the bin.
-
- event_hit: ndarray
- Hit or not-hit status for each channel in the detector.
- event_time: ndarray
- Hit time for each channel in the detector. If channel
- not hit, the time will be ignored.
- event_charge: ndarray
- Integrated charge for each channel in the detector.
- If channel not hit, the charge will be ignored.
-
- min_twidth: float
- Minimum bin size in the time dimension
- trange: (float, float)
- Range of time dimension in PDF
- min_qwidth: float
- Minimum bin size in charge dimension
- qrange: (float, float)
- Range of charge dimension in PDF
- min_bin_content: int
- The bin will be expanded to include at least this many events
- time_only: bool
- If True, only the time observable will be used in the PDF.
- """
- self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
- self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
- self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))
-
- self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
- self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
- self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content,
- dtype=np.float32)
- self.nearest_mc_gpu.fill(1e9)
-
- self.min_twidth = min_twidth
- self.trange = trange
- self.min_qwidth = min_qwidth
- self.qrange = qrange
- self.min_bin_content = min_bin_content
- self.time_only = time_only
-
- def clear_pdf_eval(self):
- "Reset PDF evaluation counters to start accumulating new Monte Carlo."
- self.eval_hitcount_gpu.fill(0)
- self.eval_bincount_gpu.fill(0)
- self.nearest_mc_gpu.fill(1e9)
-
- def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64):
- "Add the most recent results of run_daq() to the PDF evaluation."
- self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only),
- np.int32(len(self.event_hit_gpu)),
- self.event_hit_gpu,
- self.event_time_gpu,
- self.event_charge_gpu,
- gpuchannels.t,
- gpuchannels.q,
- self.eval_hitcount_gpu,
- self.eval_bincount_gpu,
- np.float32(self.min_twidth),
- np.float32(self.trange[0]),
- np.float32(self.trange[1]),
- np.float32(self.min_qwidth),
- np.float32(self.qrange[0]),
- np.float32(self.qrange[1]),
- self.nearest_mc_gpu,
- np.int32(self.min_bin_content),
- block=(nthreads_per_block,1,1),
- grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
-
- def get_pdf_eval(self):
- evhit = self.event_hit_gpu.get().astype(bool)
- hitcount = self.eval_hitcount_gpu.get()
- bincount = self.eval_bincount_gpu.get()
-
- pdf_value = np.zeros(len(hitcount), dtype=float)
- pdf_frac_uncert = np.zeros_like(pdf_value)
-
- # PDF value for high stats bins
- high_stats = bincount >= self.min_bin_content
- if high_stats.any():
- if self.time_only:
- pdf_value[high_stats] = bincount[high_stats].astype(float) / hitcount[high_stats] / self.min_twidth
- else:
- assert Exception('Unimplemented 2D (time,charge) mode!')
-
- pdf_frac_uncert[high_stats] = 1.0/np.sqrt(bincount[high_stats])
-
- # PDF value for low stats bins
- low_stats = ~high_stats & (hitcount > 0) & evhit
-
- nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content))
-
- # Deal with the case where we did not even get min_bin_content events
- # in the PDF but also clamp the lower range to ensure we don't index
- # by a negative number in 2 lines
- last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1)
- distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry]
-
- if low_stats.any():
- if self.time_only:
- pdf_value[low_stats] = (last_valid_entry[low_stats] + 1).astype(float) / hitcount[low_stats] / distance[low_stats] / 2.0
- else:
- assert Exception('Unimplemented 2D (time,charge) mode!')
-
- pdf_frac_uncert[low_stats] = 1.0/np.sqrt(last_valid_entry[low_stats] + 1)
-
- # PDFs with no stats got zero by default during array creation
-
- print 'high_stats:', high_stats.sum(), 'low_stats', low_stats.sum()
-
- return hitcount, pdf_value, pdf_value * pdf_frac_uncert
-
-def create_cuda_context(device_id=None):
- """Initialize and return a CUDA context on the specified device.
- If device_id is None, the default device is used."""
- if device_id is None:
- try:
- context = pycuda.tools.make_default_context()
- except cuda.LogicError:
- # initialize cuda
- cuda.init()
- context = pycuda.tools.make_default_context()
- else:
- try:
- device = cuda.Device(device_id)
- except cuda.LogicError:
- # initialize cuda
- cuda.init()
- device = cuda.Device(device_id)
- context = device.make_context()
-
- context.set_cache_config(cuda.func_cache.PREFER_L1)
-
- return context
diff --git a/chroma/gpu/__init__.py b/chroma/gpu/__init__.py
new file mode 100644
index 0000000..8a87fc7
--- /dev/null
+++ b/chroma/gpu/__init__.py
@@ -0,0 +1,5 @@
+from chroma.gpu.tools import *
+from chroma.gpu.geometry import *
+from chroma.gpu.photon import *
+from chroma.gpu.daq import *
+from chroma.gpu.pdf import *
diff --git a/chroma/gpu/daq.py b/chroma/gpu/daq.py
new file mode 100644
index 0000000..82958dc
--- /dev/null
+++ b/chroma/gpu/daq.py
@@ -0,0 +1,49 @@
+import numpy as np
+from pycuda import gpuarray as ga
+
+from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, \
+ chunk_iterator
+from chroma import event
+
+class GPUChannels(object):
+ def __init__(self, t, q, flags):
+ self.t = t
+ self.q = q
+ self.flags = flags
+
+ def get(self):
+ t = self.t.get()
+ q = self.q.get().astype(np.float32)
+
+ # For now, assume all channels with small
+ # enough hit time were hit.
+ return event.Channels(t<1e8, t, q, self.flags.get())
+
+class GPUDaq(object):
+ def __init__(self, gpu_geometry, max_pmt_id, pmt_rms=1.2e-9):
+ self.earliest_time_gpu = ga.empty(max_pmt_id+1, dtype=np.float32)
+ self.earliest_time_int_gpu = ga.empty(max_pmt_id+1, dtype=np.uint32)
+ self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
+ self.channel_q_gpu = ga.zeros_like(self.earliest_time_int_gpu)
+ self.daq_pmt_rms = pmt_rms
+ self.solid_id_map_gpu = gpu_geometry.solid_id_map
+
+ self.module = get_cu_module('daq.cu', options=cuda_options,
+ include_source_directory=False)
+ self.gpu_funcs = GPUFuncs(self.module)
+
+ def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024):
+ self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
+ self.channel_q_gpu.fill(0)
+ self.channel_history_gpu.fill(0)
+
+ n = len(gpuphotons.pos)
+
+ for first_photon, photons_this_round, blocks in \
+ chunk_iterator(n, nthreads_per_block, max_blocks):
+ self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), np.float32(self.daq_pmt_rms), np.int32(first_photon), np.int32(photons_this_round), gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, self.solid_id_map_gpu, np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.channel_q_gpu, self.channel_history_gpu, block=(nthreads_per_block,1,1), grid=(blocks,1))
+
+ self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
+
+ return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu)
+
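For orientation, a rough sketch of how the new chroma.gpu.daq module is driven (gpu_geometry and gpu_photons are assumed to come from chroma.gpu.geometry and chroma.gpu.photon, and max_pmt_id=1000 is a placeholder value):

    from chroma.gpu.daq import GPUDaq
    from chroma.gpu.tools import get_rng_states

    rng_states = get_rng_states(64*1024)
    daq = GPUDaq(gpu_geometry, max_pmt_id=1000)           # gpu_geometry: GPUGeometry
    gpu_channels = daq.acquire(gpu_photons, rng_states)   # gpu_photons: GPUPhotons
    channels = gpu_channels.get()   # chroma.event.Channels (hit, t, q, flags)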
diff --git a/chroma/gpu/geometry.py b/chroma/gpu/geometry.py
new file mode 100644
index 0000000..a223006
--- /dev/null
+++ b/chroma/gpu/geometry.py
@@ -0,0 +1,174 @@
+import numpy as np
+import pycuda.driver as cuda
+from pycuda import gpuarray as ga
+from pycuda import characterize
+
+from chroma.geometry import standard_wavelengths
+from chroma.gpu.tools import get_cu_module, get_cu_source, cuda_options, \
+ chunk_iterator, format_array, format_size, to_uint3, to_float3, \
+ make_gpu_struct
+
+class GPUGeometry(object):
+ def __init__(self, geometry, wavelengths=None, print_usage=False):
+ if wavelengths is None:
+ wavelengths = standard_wavelengths
+
+ try:
+ wavelength_step = np.unique(np.diff(wavelengths)).item()
+ except ValueError:
+ raise ValueError('wavelengths must be equally spaced apart.')
+
+ geometry_source = get_cu_source('geometry.h')
+ material_struct_size = characterize.sizeof('Material', geometry_source)
+ surface_struct_size = characterize.sizeof('Surface', geometry_source)
+ geometry_struct_size = characterize.sizeof('Geometry', geometry_source)
+
+ self.material_data = []
+ self.material_ptrs = []
+
+ def interp_material_property(wavelengths, property):
+ # note that it is essential that the material properties be
+ # interpolated linearly. this fact is used in the propagation
+ # code to guarantee that probabilities still sum to one.
+ return np.interp(wavelengths, property[:,0], property[:,1]).astype(np.float32)
+
+ for i in range(len(geometry.unique_materials)):
+ material = geometry.unique_materials[i]
+
+ if material is None:
+ raise Exception('one or more triangles is missing a material.')
+
+ refractive_index = interp_material_property(wavelengths, material.refractive_index)
+ refractive_index_gpu = ga.to_gpu(refractive_index)
+ absorption_length = interp_material_property(wavelengths, material.absorption_length)
+ absorption_length_gpu = ga.to_gpu(absorption_length)
+ scattering_length = interp_material_property(wavelengths, material.scattering_length)
+ scattering_length_gpu = ga.to_gpu(scattering_length)
+
+ self.material_data.append(refractive_index_gpu)
+ self.material_data.append(absorption_length_gpu)
+ self.material_data.append(scattering_length_gpu)
+
+ material_gpu = \
+ make_gpu_struct(material_struct_size,
+ [refractive_index_gpu, absorption_length_gpu,
+ scattering_length_gpu,
+ np.uint32(len(wavelengths)),
+ np.float32(wavelength_step),
+ np.float32(wavelengths[0])])
+
+ self.material_ptrs.append(material_gpu)
+
+ self.material_pointer_array = \
+ make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs)
+
+ self.surface_data = []
+ self.surface_ptrs = []
+
+ for i in range(len(geometry.unique_surfaces)):
+ surface = geometry.unique_surfaces[i]
+
+ if surface is None:
+ # need something to copy to the surface array struct
+ # that is the same size as a 64-bit pointer.
+ # this pointer will never be used by the simulation.
+ self.surface_ptrs.append(np.uint64(0))
+ continue
+
+ detect = interp_material_property(wavelengths, surface.detect)
+ detect_gpu = ga.to_gpu(detect)
+ absorb = interp_material_property(wavelengths, surface.absorb)
+ absorb_gpu = ga.to_gpu(absorb)
+ reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse)
+ reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse)
+ reflect_specular = interp_material_property(wavelengths, surface.reflect_specular)
+ reflect_specular_gpu = ga.to_gpu(reflect_specular)
+
+ self.surface_data.append(detect_gpu)
+ self.surface_data.append(absorb_gpu)
+ self.surface_data.append(reflect_diffuse_gpu)
+ self.surface_data.append(reflect_specular_gpu)
+
+ surface_gpu = \
+ make_gpu_struct(surface_struct_size,
+ [detect_gpu, absorb_gpu,
+ reflect_diffuse_gpu,
+ reflect_specular_gpu,
+ np.uint32(len(wavelengths)),
+ np.float32(wavelength_step),
+ np.float32(wavelengths[0])])
+
+ self.surface_ptrs.append(surface_gpu)
+
+ self.surface_pointer_array = \
+ make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)
+
+ self.vertices = ga.to_gpu(to_float3(geometry.mesh.vertices))
+ self.triangles = ga.to_gpu(to_uint3(geometry.mesh.triangles))
+
+ material_codes = (((geometry.material1_index & 0xff) << 24) |
+ ((geometry.material2_index & 0xff) << 16) |
+ ((geometry.surface_index & 0xff) << 8)).astype(np.uint32)
+
+ self.material_codes = ga.to_gpu(material_codes)
+
+ self.lower_bounds = ga.to_gpu(to_float3(geometry.lower_bounds))
+ self.upper_bounds = ga.to_gpu(to_float3(geometry.upper_bounds))
+ self.colors = ga.to_gpu(geometry.colors.astype(np.uint32))
+ self.node_map = ga.to_gpu(geometry.node_map.astype(np.uint32))
+ self.node_map_end = ga.to_gpu(geometry.node_map_end.astype(np.uint32))
+ self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))
+
+ self.gpudata = make_gpu_struct(geometry_struct_size,
+ [self.vertices, self.triangles,
+ self.material_codes,
+ self.colors, self.lower_bounds,
+ self.upper_bounds, self.node_map,
+ self.node_map_end,
+ self.material_pointer_array,
+ self.surface_pointer_array,
+ np.uint32(geometry.start_node),
+ np.uint32(geometry.first_node)])
+
+ self.geometry = geometry
+
+ if print_usage:
+ self.print_device_usage()
+
+ def print_device_usage(self):
+ print 'device usage:'
+ print '-'*10
+ print format_array('vertices', self.vertices)
+ print format_array('triangles', self.triangles)
+ print format_array('lower_bounds', self.lower_bounds)
+ print format_array('upper_bounds', self.upper_bounds)
+ print format_array('node_map', self.node_map)
+ print format_array('node_map_end', self.node_map_end)
+ print '%-15s %6s %6s' % ('total', '', format_size(self.vertices.nbytes + self.triangles.nbytes + self.lower_bounds.nbytes + self.upper_bounds.nbytes + self.node_map.nbytes + self.node_map_end.nbytes))
+ print '-'*10
+ free, total = cuda.mem_get_info()
+ print '%-15s %6s %6s' % ('device total', '', format_size(total))
+ print '%-15s %6s %6s' % ('device used', '', format_size(total-free))
+ print '%-15s %6s %6s' % ('device free', '', format_size(free))
+ print
+
+ def reset_colors(self):
+ self.colors.set_async(self.geometry.colors.astype(np.uint32))
+
+ def color_solids(self, solid_hit, colors, nblocks_per_thread=64,
+ max_blocks=1024):
+ solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool))
+ solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32))
+
+ module = get_cu_module('mesh.h', options=cuda_options)
+ color_solids = module.get_function('color_solids')
+
+ for first_triangle, triangles_this_round, blocks in \
+ chunk_iterator(self.triangles.size, nblocks_per_thread,
+ max_blocks):
+ color_solids(np.int32(first_triangle),
+ np.int32(triangles_this_round), self.solid_id_map,
+ solid_hit_gpu, solid_colors_gpu, self.gpudata,
+ block=(nblocks_per_thread,1,1),
+ grid=(blocks,1))
+
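A short sketch of how the new geometry module is used (geometry is assumed to be a fully built chroma.geometry.Geometry instance):

    from chroma.gpu.geometry import GPUGeometry

    gpu_geometry = GPUGeometry(geometry, print_usage=True)
    # gpu_geometry.gpudata is the device-side Geometry struct that the
    # propagation, DAQ, and render kernels take as an argument.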
diff --git a/chroma/gpu/pdf.py b/chroma/gpu/pdf.py
new file mode 100644
index 0000000..053d5f3
--- /dev/null
+++ b/chroma/gpu/pdf.py
@@ -0,0 +1,177 @@
+import numpy as np
+from pycuda import gpuarray as ga
+
+from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs
+
+class GPUPDF(object):
+ def __init__(self):
+ self.module = get_cu_module('daq.cu', options=cuda_options,
+ include_source_directory=False)
+ self.gpu_funcs = GPUFuncs(self.module)
+
+ def setup_pdf(self, max_pmt_id, tbins, trange, qbins, qrange):
+ """Setup GPU arrays to hold PDF information.
+
+ max_pmt_id: int, largest PMT id #
+ tbins: number of time bins
+ trange: tuple of (min, max) time in PDF
+ qbins: number of charge bins
+ qrange: tuple of (min, max) charge in PDF
+ """
+ self.events_in_histogram = 0
+ self.hitcount_gpu = ga.zeros(max_pmt_id+1, dtype=np.uint32)
+ self.pdf_gpu = ga.zeros(shape=(max_pmt_id+1, tbins, qbins),
+ dtype=np.uint32)
+ self.tbins = tbins
+ self.trange = trange
+ self.qbins = qbins
+ self.qrange = qrange
+
+ def clear_pdf(self):
+ """Rezero the PDF counters."""
+ self.hitcount_gpu.fill(0)
+ self.pdf_gpu.fill(0)
+
+ def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64):
+ self.gpu_funcs.bin_hits(np.int32(len(self.hitcount_gpu)),
+ gpuchannels.q,
+ gpuchannels.t,
+ self.hitcount_gpu,
+ np.int32(self.tbins),
+ np.float32(self.trange[0]),
+ np.float32(self.trange[1]),
+ np.int32(self.qbins),
+ np.float32(self.qrange[0]),
+ np.float32(self.qrange[1]),
+ self.pdf_gpu,
+ block=(nthreads_per_block,1,1),
+ grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
+
+
+ self.events_in_histogram += 1
+
+ def get_pdfs(self):
+ """Returns the 1D hitcount array and the 3D [channel, time, charge]
+ histogram."""
+ return self.hitcount_gpu.get(), self.pdf_gpu.get()
+
+ def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth,
+ trange, min_qwidth, qrange, min_bin_content=10,
+ time_only=True):
+ """Setup GPU arrays to compute PDF values for the given event.
+ The pdf_eval calculation allows the PDF to be evaluated at a
+ single point for each channel as the Monte Carlo is run. The
+ effective bin size will be as small as (`min_twidth`,
+ `min_qwidth`) around the point of interest, but will be large
+ enough to ensure that `min_bin_content` Monte Carlo events
+ fall into the bin.
+
+ event_hit: ndarray
+ Hit or not-hit status for each channel in the detector.
+ event_time: ndarray
+ Hit time for each channel in the detector. If channel
+ not hit, the time will be ignored.
+ event_charge: ndarray
+ Integrated charge for each channel in the detector.
+ If channel not hit, the charge will be ignored.
+
+ min_twidth: float
+ Minimum bin size in the time dimension
+ trange: (float, float)
+ Range of time dimension in PDF
+ min_qwidth: float
+ Minimum bin size in charge dimension
+ qrange: (float, float)
+ Range of charge dimension in PDF
+ min_bin_content: int
+ The bin will be expanded to include at least this many events
+ time_only: bool
+ If True, only the time observable will be used in the PDF.
+ """
+ self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
+ self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
+ self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))
+
+ self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
+ self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
+ self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content,
+ dtype=np.float32)
+ self.nearest_mc_gpu.fill(1e9)
+
+ self.min_twidth = min_twidth
+ self.trange = trange
+ self.min_qwidth = min_qwidth
+ self.qrange = qrange
+ self.min_bin_content = min_bin_content
+ self.time_only = time_only
+
+ def clear_pdf_eval(self):
+ "Reset PDF evaluation counters to start accumulating new Monte Carlo."
+ self.eval_hitcount_gpu.fill(0)
+ self.eval_bincount_gpu.fill(0)
+ self.nearest_mc_gpu.fill(1e9)
+
+ def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64):
+ "Add the most recent results of run_daq() to the PDF evaluation."
+ self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only),
+ np.int32(len(self.event_hit_gpu)),
+ self.event_hit_gpu,
+ self.event_time_gpu,
+ self.event_charge_gpu,
+ gpuchannels.t,
+ gpuchannels.q,
+ self.eval_hitcount_gpu,
+ self.eval_bincount_gpu,
+ np.float32(self.min_twidth),
+ np.float32(self.trange[0]),
+ np.float32(self.trange[1]),
+ np.float32(self.min_qwidth),
+ np.float32(self.qrange[0]),
+ np.float32(self.qrange[1]),
+ self.nearest_mc_gpu,
+ np.int32(self.min_bin_content),
+ block=(nthreads_per_block,1,1),
+ grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
+
+ def get_pdf_eval(self):
+ evhit = self.event_hit_gpu.get().astype(bool)
+ hitcount = self.eval_hitcount_gpu.get()
+ bincount = self.eval_bincount_gpu.get()
+
+ pdf_value = np.zeros(len(hitcount), dtype=float)
+ pdf_frac_uncert = np.zeros_like(pdf_value)
+
+ # PDF value for high stats bins
+ high_stats = bincount >= self.min_bin_content
+ if high_stats.any():
+ if self.time_only:
+ pdf_value[high_stats] = bincount[high_stats].astype(float) / hitcount[high_stats] / self.min_twidth
+ else:
+                raise Exception('Unimplemented 2D (time,charge) mode!')
+
+ pdf_frac_uncert[high_stats] = 1.0/np.sqrt(bincount[high_stats])
+
+ # PDF value for low stats bins
+ low_stats = ~high_stats & (hitcount > 0) & evhit
+
+ nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content))
+
+ # Deal with the case where we did not even get min_bin_content events
+ # in the PDF but also clamp the lower range to ensure we don't index
+ # by a negative number in 2 lines
+ last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1)
+ distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry]
+
+ if low_stats.any():
+ if self.time_only:
+ pdf_value[low_stats] = (last_valid_entry[low_stats] + 1).astype(float) / hitcount[low_stats] / distance[low_stats] / 2.0
+ else:
+                raise Exception('Unimplemented 2D (time,charge) mode!')
+
+ pdf_frac_uncert[low_stats] = 1.0/np.sqrt(last_valid_entry[low_stats] + 1)
+
+ # PDFs with no stats got zero by default during array creation
+
+ print 'high_stats:', high_stats.sum(), 'low_stats', low_stats.sum()
+
+ return hitcount, pdf_value, pdf_value * pdf_frac_uncert
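A sketch of the histogram-mode workflow in chroma.gpu.pdf (daq, gpu_photons, and rng_states are assumed to come from the other gpu submodules; the binning values are placeholders):

    from chroma.gpu.pdf import GPUPDF

    pdf = GPUPDF()
    pdf.setup_pdf(max_pmt_id=1000, tbins=100, trange=(0.0, 100e-9),
                  qbins=10, qrange=(0.0, 10.0))
    for i in xrange(100):   # one DAQ pass per simulated event
        gpu_channels = daq.acquire(gpu_photons, rng_states)
        pdf.add_hits_to_pdf(gpu_channels)
    hitcount, histogram = pdf.get_pdfs()   # shapes (nchannels,), (nchannels, tbins, qbins)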
diff --git a/chroma/gpu/photon.py b/chroma/gpu/photon.py
new file mode 100644
index 0000000..2a9a1ff
--- /dev/null
+++ b/chroma/gpu/photon.py
@@ -0,0 +1,175 @@
+import numpy as np
+import sys
+import gc
+from pycuda import gpuarray as ga
+
+from chroma.tools import profile_if_possible
+from chroma import event
+from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, \
+ chunk_iterator, to_float3
+
+
+class GPUPhotons(object):
+ def __init__(self, photons, ncopies=1):
+ """Load ``photons`` onto the GPU, replicating as requested.
+
+ Args:
+ - photons: chroma.Event.Photons
+ Photon state information to load onto GPU
+ - ncopies: int, *optional*
+ Number of times to replicate the photons
+ on the GPU. This is used if you want
+ to propagate the same event many times,
+ for example in a likelihood calculation.
+
+ The amount of GPU storage will be proportionally
+ larger if ncopies > 1, so be careful.
+ """
+ nphotons = len(photons)
+ self.pos = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
+ self.dir = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
+ self.pol = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
+ self.wavelengths = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
+ self.t = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
+ self.last_hit_triangles = ga.empty(shape=nphotons*ncopies, dtype=np.int32)
+ self.flags = ga.empty(shape=nphotons*ncopies, dtype=np.uint32)
+
+ # Assign the provided photons to the beginning (possibly
+        # the entire array if ncopies is 1)
+ self.pos[:nphotons].set(to_float3(photons.pos))
+ self.dir[:nphotons].set(to_float3(photons.dir))
+ self.pol[:nphotons].set(to_float3(photons.pol))
+ self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
+ self.t[:nphotons].set(photons.t.astype(np.float32))
+ self.last_hit_triangles[:nphotons].set(photons.last_hit_triangles.astype(np.int32))
+ self.flags[:nphotons].set(photons.flags.astype(np.uint32))
+
+ module = get_cu_module('propagate.cu', options=cuda_options)
+ self.gpu_funcs = GPUFuncs(module)
+
+ # Replicate the photons to the rest of the slots if needed
+ if ncopies > 1:
+ max_blocks = 1024
+ nthreads_per_block = 64
+ for first_photon, photons_this_round, blocks in \
+ chunk_iterator(nphotons, nthreads_per_block, max_blocks):
+ self.gpu_funcs.photon_duplicate(np.int32(first_photon), np.int32(photons_this_round),
+ self.pos, self.dir, self.wavelengths, self.pol, self.t,
+ self.flags, self.last_hit_triangles,
+ np.int32(ncopies-1),
+ np.int32(nphotons),
+ block=(nthreads_per_block,1,1), grid=(blocks, 1))
+
+
+ # Save the duplication information for the iterate_copies() method
+ self.true_nphotons = nphotons
+ self.ncopies = ncopies
+
+ def get(self):
+ pos = self.pos.get().view(np.float32).reshape((len(self.pos),3))
+ dir = self.dir.get().view(np.float32).reshape((len(self.dir),3))
+ pol = self.pol.get().view(np.float32).reshape((len(self.pol),3))
+ wavelengths = self.wavelengths.get()
+ t = self.t.get()
+ last_hit_triangles = self.last_hit_triangles.get()
+ flags = self.flags.get()
+ return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags)
+
+ def iterate_copies(self):
+ '''Returns an iterator that yields GPUPhotonsSlice objects
+ corresponding to the event copies stored in ``self``.'''
+ for i in xrange(self.ncopies):
+ window = slice(self.true_nphotons*i, self.true_nphotons*(i+1))
+ yield GPUPhotonsSlice(pos=self.pos[window],
+ dir=self.dir[window],
+ pol=self.pol[window],
+ wavelengths=self.wavelengths[window],
+ t=self.t[window],
+ last_hit_triangles=self.last_hit_triangles[window],
+ flags=self.flags[window])
+
+ @profile_if_possible
+ def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
+ max_blocks=1024, max_steps=10):
+ """Propagate photons on GPU to termination or max_steps, whichever
+ comes first.
+
+ May be called repeatedly without reloading photon information if
+ single-stepping through photon history.
+
+        .. warning::
+ `rng_states` must have at least `nthreads_per_block`*`max_blocks`
+ number of curandStates.
+ """
+ nphotons = self.pos.size
+ step = 0
+ input_queue = np.empty(shape=nphotons+1, dtype=np.uint32)
+ input_queue[0] = 0
+ # Order photons initially in the queue to put the clones next to each other
+ for copy in xrange(self.ncopies):
+ input_queue[1+copy::self.ncopies] = np.arange(self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons
+ input_queue_gpu = ga.to_gpu(input_queue)
+ output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
+ output_queue[0] = 1
+ output_queue_gpu = ga.to_gpu(output_queue)
+
+ while step < max_steps:
+ # Just finish the rest of the steps if the # of photons is low
+ if nphotons < nthreads_per_block * 16 * 8:
+ nsteps = max_steps - step
+ else:
+ nsteps = 1
+
+ for first_photon, photons_this_round, blocks in \
+ chunk_iterator(nphotons, nthreads_per_block, max_blocks):
+ self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, np.int32(nsteps), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1))
+
+ step += nsteps
+
+ if step < max_steps:
+ temp = input_queue_gpu
+ input_queue_gpu = output_queue_gpu
+ output_queue_gpu = temp
+ output_queue_gpu[:1].set(np.uint32(1))
+ nphotons = input_queue_gpu[:1].get()[0] - 1
+
+ if ga.max(self.flags).get() & (1 << 31):
+ print >>sys.stderr, "WARNING: ABORTED PHOTONS"
+
+ def __del__(self):
+ del self.pos
+ del self.dir
+ del self.pol
+ del self.wavelengths
+ del self.t
+ del self.flags
+ del self.last_hit_triangles
+ # Free up GPU memory quickly if now available
+ gc.collect()
+
+class GPUPhotonsSlice(GPUPhotons):
+ '''A `slice`-like view of a subrange of another GPU photons array.
+ Works exactly like an instance of GPUPhotons, but the GPU storage
+ is taken from another GPUPhotons instance.
+
+ Returned by the GPUPhotons.iterate_copies() iterator.'''
+ def __init__(self, pos, dir, pol, wavelengths, t, last_hit_triangles,
+ flags):
+ '''Create new object using slices of GPUArrays from an instance
+ of GPUPhotons. NOTE THESE ARE NOT CPU ARRAYS!'''
+ self.pos = pos
+ self.dir = dir
+ self.pol = pol
+ self.wavelengths = wavelengths
+ self.t = t
+ self.last_hit_triangles = last_hit_triangles
+ self.flags = flags
+
+ module = get_cu_module('propagate.cu', options=cuda_options)
+ self.gpu_funcs = GPUFuncs(module)
+
+ self.true_nphotons = len(pos)
+ self.ncopies = 1
+
+ def __del__(self):
+ pass # Do nothing, because we don't own any of our GPU memory
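A rough sketch of propagating photons with the new module (photons is assumed to be a chroma.event.Photons instance and gpu_geometry a GPUGeometry; note the rng_states sizing requirement from the propagate() docstring):

    from chroma.gpu.photon import GPUPhotons
    from chroma.gpu.tools import get_rng_states

    nthreads_per_block, max_blocks = 64, 1024
    rng_states = get_rng_states(nthreads_per_block*max_blocks)
    gpu_photons = GPUPhotons(photons, ncopies=1)
    gpu_photons.propagate(gpu_geometry, rng_states,
                          nthreads_per_block=nthreads_per_block,
                          max_blocks=max_blocks, max_steps=10)
    propagated = gpu_photons.get()   # back on the host as chroma.event.Photons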
diff --git a/chroma/gpu/render.py b/chroma/gpu/render.py
new file mode 100644
index 0000000..e645da4
--- /dev/null
+++ b/chroma/gpu/render.py
@@ -0,0 +1,66 @@
+import numpy as np
+from pycuda import gpuarray as ga
+
+from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, \
+ to_float3
+
+class GPURays(object):
+ """The GPURays class holds arrays of ray positions and directions
+ on the GPU that are used to render a geometry."""
+ def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
+ self.pos = ga.to_gpu(to_float3(pos))
+ self.dir = ga.to_gpu(to_float3(dir))
+
+ self.max_alpha_depth = max_alpha_depth
+
+ self.nblocks = nblocks
+
+ transform_module = get_cu_module('transform.cu', options=cuda_options)
+ self.transform_funcs = GPUFuncs(transform_module)
+
+ render_module = get_cu_module('render.cu', options=cuda_options)
+ self.render_funcs = GPUFuncs(render_module)
+
+ self.dx = ga.empty(max_alpha_depth*self.pos.size, dtype=np.float32)
+ self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
+ self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32)
+
+ def rotate(self, phi, n):
+ "Rotate by an angle phi around the axis `n`."
+ self.transform_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+ self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
+
+ def rotate_around_point(self, phi, n, point):
+        """Rotate by an angle phi around the axis `n` passing through
+ the point `point`."""
+ self.transform_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+ self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
+
+ def translate(self, v):
+ "Translate the ray positions by the vector `v`."
+ self.transform_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+
+ def render(self, gpu_geometry, pixels, alpha_depth=10,
+ keep_last_render=False):
+ """Render `gpu_geometry` and fill the GPU array `pixels` with pixel
+ colors."""
+ if not keep_last_render:
+ self.dxlen.fill(0)
+
+ if alpha_depth > self.max_alpha_depth:
+ raise Exception('alpha_depth > max_alpha_depth')
+
+ if not isinstance(pixels, ga.GPUArray):
+ raise TypeError('`pixels` must be a %s instance.' % ga.GPUArray)
+
+ if pixels.size != self.pos.size:
+ raise ValueError('`pixels`.size != number of rays')
+
+ self.render_funcs.render(np.int32(self.pos.size), self.pos, self.dir, gpu_geometry.gpudata, np.uint32(alpha_depth), pixels, self.dx, self.dxlen, self.color, block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+
+ def snapshot(self, gpu_geometry, alpha_depth=10):
+ "Render `gpu_geometry` and return a numpy array of pixel colors."
+ pixels = ga.empty(self.pos.size, dtype=np.uint32)
+ self.render(gpu_geometry, pixels, alpha_depth)
+ return pixels.get()
+
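A minimal sketch of rendering with chroma.gpu.render (gpu_geometry is assumed to exist; the ray positions and directions here are arbitrary placeholders, one ray per pixel):

    import numpy as np
    from chroma.gpu.render import GPURays

    n = 640*480
    pos = np.zeros((n, 3), dtype=np.float32)                     # all rays start at the origin
    dir = np.tile([0.0, 0.0, 1.0], (n, 1)).astype(np.float32)    # all rays look down +z
    rays = GPURays(pos, dir)
    pixels = rays.snapshot(gpu_geometry)   # numpy array of uint32 pixel colors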
diff --git a/chroma/gpu/tools.py b/chroma/gpu/tools.py
new file mode 100644
index 0000000..a11a561
--- /dev/null
+++ b/chroma/gpu/tools.py
@@ -0,0 +1,180 @@
+import numpy as np
+import pycuda.tools
+import pycuda.compiler
+from pycuda import characterize
+import pycuda.driver as cuda
+from pycuda import gpuarray as ga
+
+from chroma.cuda import srcdir
+
+cuda.init()
+
+# standard nvcc options
+cuda_options = ('--use_fast_math',)#, '--ptxas-options=-v']
+
+@pycuda.tools.context_dependent_memoize
+def get_cu_module(name, options=None, include_source_directory=True):
+ """Returns a pycuda.compiler.SourceModule object from a CUDA source file
+ located in the chroma cuda directory at cuda/[name]."""
+ if options is None:
+ options = []
+ elif isinstance(options, tuple):
+ options = list(options)
+ else:
+ raise TypeError('`options` must be a tuple.')
+
+ if include_source_directory:
+ options += ['-I' + srcdir]
+
+ with open('%s/%s' % (srcdir, name)) as f:
+ source = f.read()
+
+ return pycuda.compiler.SourceModule(source, options=options,
+ no_extern_c=True)
+
+@pycuda.tools.memoize
+def get_cu_source(name):
+ """Get the source code for a CUDA source file located in the chroma cuda
+    directory at cuda/[name]."""
+ with open('%s/%s' % (srcdir, name)) as f:
+ source = f.read()
+ return source
+
+class GPUFuncs(object):
+ """Simple container class for GPU functions as attributes."""
+ def __init__(self, module):
+ self.module = module
+ self.funcs = {}
+
+ def __getattr__(self, name):
+ try:
+ return self.funcs[name]
+ except KeyError:
+ f = self.module.get_function(name)
+ self.funcs[name] = f
+ return f
+
+init_rng_src = """
+#include <curand_kernel.h>
+
+extern "C"
+{
+
+__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
+{
+ int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+ if (id >= nthreads)
+ return;
+
+ curand_init(seed, id, offset, &s[id]);
+}
+
+} // extern "C"
+"""
+
+def get_rng_states(size, seed=1):
+ "Return `size` number of CUDA random number generator states."
+ rng_states = cuda.mem_alloc(size*characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
+
+ module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True)
+ init_rng = module.get_function('init_rng')
+
+ init_rng(np.int32(size), rng_states, np.uint64(seed), np.uint64(0), block=(64,1,1), grid=(size//64+1,1))
+
+ return rng_states
+
+def to_float3(arr):
+    "Returns a pycuda.gpuarray.vec.float3 array from an (N,3) array."
+ if not arr.flags['C_CONTIGUOUS']:
+ arr = np.asarray(arr, order='c')
+ return arr.astype(np.float32).view(ga.vec.float3)[:,0]
+
+def to_uint3(arr):
+ "Returns a pycuda.gpuarray.vec.uint3 array from an (N,3) array."
+ if not arr.flags['C_CONTIGUOUS']:
+ arr = np.asarray(arr, order='c')
+ return arr.astype(np.uint32).view(ga.vec.uint3)[:,0]
+
+def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024):
+    """Iterator that yields tuples with the values required to process
+ a long array in multiple kernel passes on the GPU.
+
+ Each yielded value is of the form,
+ (first_index, elements_this_iteration, nblocks_this_iteration)
+
+ Example:
+ >>> list(chunk_iterator(300, 32, 2))
+ [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)]
+ """
+ first = 0
+ while first < nelements:
+ elements_left = nelements - first
+ blocks = int(elements_left // nthreads_per_block)
+ if elements_left % nthreads_per_block != 0:
+ blocks += 1 # Round up only if needed
+ blocks = min(max_blocks, blocks)
+ elements_this_round = min(elements_left, blocks * nthreads_per_block)
+
+ yield (first, elements_this_round, blocks)
+ first += elements_this_round
+
+def create_cuda_context(device_id=None):
+ """Initialize and return a CUDA context on the specified device.
+ If device_id is None, the default device is used."""
+ if device_id is None:
+ try:
+ context = pycuda.tools.make_default_context()
+ except cuda.LogicError:
+ # initialize cuda
+ cuda.init()
+ context = pycuda.tools.make_default_context()
+ else:
+ try:
+ device = cuda.Device(device_id)
+ except cuda.LogicError:
+ # initialize cuda
+ cuda.init()
+ device = cuda.Device(device_id)
+ context = device.make_context()
+
+ context.set_cache_config(cuda.func_cache.PREFER_L1)
+
+ return context
+
+def make_gpu_struct(size, members):
+ struct = cuda.mem_alloc(size)
+
+ i = 0
+ for member in members:
+ if isinstance(member, ga.GPUArray):
+ member = member.gpudata
+
+ if isinstance(member, cuda.DeviceAllocation):
+ if i % 8:
+ raise Exception('cannot align 64-bit pointer. '
+ 'arrange struct member variables in order of '
+ 'decreasing size.')
+
+ cuda.memcpy_htod(int(struct)+i, np.intp(int(member)))
+ i += 8
+ elif np.isscalar(member):
+ cuda.memcpy_htod(int(struct)+i, member)
+ i += member.nbytes
+ else:
+ raise TypeError('expected a GPU device pointer or scalar type.')
+
+ return struct
+
+def format_size(size):
+ if size < 1e3:
+ return '%.1f%s' % (size, ' ')
+ elif size < 1e6:
+ return '%.1f%s' % (size/1e3, 'K')
+ elif size < 1e9:
+ return '%.1f%s' % (size/1e6, 'M')
+ else:
+ return '%.1f%s' % (size/1e9, 'G')
+
+def format_array(name, array):
+ return '%-15s %6s %6s' % \
+ (name, format_size(len(array)), format_size(array.nbytes))
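Finally, a sketch of the context and chunking helpers that moved into chroma.gpu.tools (the element count and launch parameters are placeholder values):

    from chroma.gpu.tools import create_cuda_context, get_rng_states, chunk_iterator

    context = create_cuda_context()          # default device, PREFER_L1 cache config
    rng_states = get_rng_states(64*1024, seed=1)
    for first, n, nblocks in chunk_iterator(1000000, nthreads_per_block=64,
                                            max_blocks=1024):
        pass   # each pass covers elements [first, first+n) using nblocks blocks
    context.pop()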