summaryrefslogtreecommitdiff
path: root/gpu.py
diff options
context:
space:
mode:
Diffstat (limited to 'gpu.py')
-rw-r--r--gpu.py655
1 files changed, 338 insertions, 317 deletions
diff --git a/gpu.py b/gpu.py
index 585ab5f..721ea67 100644
--- a/gpu.py
+++ b/gpu.py
@@ -1,67 +1,215 @@
import numpy as np
import numpy.ma as ma
-from pycuda.tools import make_default_context
-from pycuda.compiler import SourceModule
-from pycuda.characterize import sizeof
-import pycuda.driver as cuda
-from pycuda import gpuarray
from copy import copy
from itertools import izip
-from chroma.tools import timeit
from os.path import dirname
+import sys
+
+import pytools
+import pycuda.tools
+from pycuda.compiler import SourceModule
+import pycuda.characterize
+import pycuda.driver as cuda
+from pycuda import gpuarray as ga
import chroma.src
+from chroma.tools import timeit
from chroma.geometry import standard_wavelengths
from chroma.color import map_to_color
-import sys
-import event
+from chroma import event
cuda.init()
+#@pycuda.tools.context_dependent_memoize
+def get_cu_module(name, options=None, include_source_directory=True):
+ """Returns a pycuda.compiler.SourceModule object from a CUDA source file
+ located in the chroma src directory at src/[name].cu."""
+ if options is None:
+ options = []
+
+ srcdir = dirname(chroma.src.__file__)
+
+ if include_source_directory:
+ options += ['-I' + srcdir]
+
+ with open('%s/%s.cu' % (srcdir, name)) as f:
+ source = f.read()
+
+ return pycuda.compiler.SourceModule(source, options=options,
+ no_extern_c=True)
+
+class GPUFuncs(object):
+ """Simple container class for GPU functions as attributes."""
+ def __init__(self, module):
+ self.module = module
+ self.funcs = {}
+
+ def __getattr__(self, name):
+ try:
+ return self.funcs[name]
+ except KeyError:
+ f = self.module.get_function(name)
+ self.funcs[name] = f
+ return f
+
+init_rng_src = """
+#include <curand_kernel.h>
+
+extern "C"
+{
+
+__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
+{
+ int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+ if (id >= nthreads)
+ return;
+
+ curand_init(seed, id, offset, &s[id]);
+}
+
+} // extern "C"
+"""
+
+def get_rng_states(size, seed=1):
+ "Return `size` number of CUDA random number generator states."
+ rng_states = cuda.mem_alloc(size*pycuda.characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
+
+ module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True)
+ init_rng = module.get_function('init_rng')
+
+ init_rng(np.int32(size), rng_states, np.uint64(seed), np.uint64(0), block=(64,1,1), grid=(size//64+1,1))
+
+ return rng_states
+
def to_float3(arr):
- return arr.astype(np.float32).view(gpuarray.vec.float3)[:,0]
-
-def boolean_argsort(condition):
- '''Returns two arrays of indicies indicating which elements of the
- boolean array `condition` should be exchanged so that all of the
- True entries occur before the false entries.
-
- Returns: tuple (a,b, ntrue) which give the indices for elements
- to exchange in a and b, and the number of true entries in the array
- in ntrue. This function in general requires fewer swaps than
- numpy.argsort.
- '''
-
- true_indices = condition.nonzero()[0][::-1] # reverse
- false_indices = (~condition).nonzero()[0]
-
- length = min(len(true_indices), len(false_indices))
-
- cut_index = np.searchsorted((true_indices[:length] - false_indices[:length]) <= 0, True)
- return (true_indices[:cut_index], false_indices[:cut_index], len(true_indices))
-
-
-def chunk_iterator(nelements, nthreads_per_block, max_blocks):
- '''Iterator that yields tuples with the values requried to process
+ "Returns an pycuda.gpuarray.vec.float3 array from an (N,3) array."
+ if not arr.flags['C_CONTIGUOUS']:
+ arr = np.asarray(arr, order='c')
+ return arr.astype(np.float32).view(ga.vec.float3)[:,0]
+
+def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024):
+ """Iterator that yields tuples with the values requried to process
a long array in multiple kernel passes on the GPU.
- Each yielded value is (first_index, elements_this_iteration, nblocks_this_iteration)
+ Each yielded value is of the form,
+ (first_index, elements_this_iteration, nblocks_this_iteration)
- >>> list(chunk_iterator(300, 32, 2))
- [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)]
- '''
+ Example:
+ >>> list(chunk_iterator(300, 32, 2))
+ [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)]
+ """
first = 0
while first < nelements:
elements_left = nelements - first
blocks = int(elements_left // nthreads_per_block)
if elements_left % nthreads_per_block != 0:
- blocks += 1 #Round up only if needed
+ blocks += 1 # Round up only if needed
blocks = min(max_blocks, blocks)
elements_this_round = min(elements_left, blocks * nthreads_per_block)
yield (first, elements_this_round, blocks)
first += elements_this_round
-
+
+class GPUPhotons(object):
+ def __init__(self, photons):
+ self.pos = ga.to_gpu(to_float3(photons.pos))
+ self.dir = ga.to_gpu(to_float3(photons.dir))
+ self.pol = ga.to_gpu(to_float3(photons.pol))
+ self.wavelengths = ga.to_gpu(photons.wavelengths.astype(np.float32))
+ self.t = ga.to_gpu(photons.t.astype(np.float32))
+ self.last_hit_triangles = ga.to_gpu(photons.last_hit_triangles.astype(np.int32))
+ self.flags = ga.to_gpu(photons.flags.astype(np.uint32))
+
+ def get(self):
+ pos = self.pos.get().view(np.float32).reshape((len(self.pos),3))
+ dir = self.dir.get().view(np.float32).reshape((len(self.dir),3))
+ pol = self.pol.get().view(np.float32).reshape((len(self.pol),3))
+ wavelengths = self.wavelengths.get()
+ t = self.t.get()
+ last_hit_triangles = self.last_hit_triangles.get()
+ flags = self.flags.get()
+ return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags)
+
+class GPUChannels(object):
+ def __init__(self, t, q, flags):
+ self.t = t
+ self.q = q
+ self.flags = flags
+
+ def get(self):
+ t = self.t.get()
+ q = self.q.get().astype(np.float32)
+
+ # For now, assume all channels with small
+ # enough hit time were hit.
+ return event.Channels(t<1e8, t, q, self.flags.get())
+
+def propagate(gpu, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, max_steps=10):
+ """Propagate photons on GPU to termination or max_steps, whichever
+ comes first.
+
+ May be called repeatedly without reloading photon information if
+ single-stepping through photon history.
+
+ ..warning::
+ `rng_states` must have at least `nthreads_per_block`*`max_blocks`
+ number of curandStates.
+ """
+ nphotons = gpuphotons.pos.size
+ step = 0
+ input_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
+ input_queue[1:] = np.arange(nphotons, dtype=np.uint32)
+ input_queue_gpu = ga.to_gpu(input_queue)
+ output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
+ output_queue[0] = 1
+ output_queue_gpu = ga.to_gpu(output_queue)
+
+ propagate = gpu.module.get_function('propagate')
+
+ while step < max_steps:
+ # Just finish the rest of the steps if the # of photons is low
+ if nphotons < nthreads_per_block * 16 * 8:
+ nsteps = max_steps - step
+ else:
+ nsteps = 1
+
+ for first_photon, photons_this_round, blocks in \
+ chunk_iterator(nphotons, nthreads_per_block, max_blocks):
+ propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, gpuphotons.pos, gpuphotons.dir, gpuphotons.wavelengths, gpuphotons.pol, gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, np.int32(nsteps), block=(nthreads_per_block,1,1), grid=(blocks, 1))
+
+ step += nsteps
+
+ if step < max_steps:
+ temp = input_queue_gpu
+ input_queue_gpu = output_queue_gpu
+ output_queue_gpu = temp
+ output_queue_gpu[:1].set(np.uint32(1))
+ nphotons = input_queue_gpu[:1].get()[0] - 1
+
+ if ga.max(gpuphotons.flags).get() & (1 << 31):
+ print >>sys.stderr, "WARNING: ABORTED PHOTONS"
+
+class GPURays(object):
+ def __init__(self, pos, dir, nblocks=64):
+ self.pos = ga.to_gpu(to_float3(pos))
+ self.dir = ga.to_gpu(to_float3(dir))
+
+ self.nblocks = nblocks
+
+ self.module = get_cu_module('transform')
+ self.gpu_funcs = GPUFuncs(self.module)
+
+ def rotate(self, phi, n):
+ self.gpu_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+ self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
+
+ def rotate_around_point(self, phi, n, point):
+ self.gpu_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+ self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
+
+ def translate(self, v):
+ self.gpu_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
def format_size(size):
if size < 1e3:
@@ -77,317 +225,165 @@ def format_array(name, array):
return '%-15s %6s %6s' % \
(name, format_size(len(array)), format_size(array.nbytes))
-class CUDAFuncs(object):
- '''simple container class for GPU functions as attributes'''
- def __init__(self, cuda_module, func_names):
- '''Extract __global__ functions listed in func_names from
- the PyCUDA module object. The functions are assigned
- to attributes of this object with the same name.'''
- for name in func_names:
- setattr(self, name, cuda_module.get_function(name))
-
-class GPU(object):
- def __init__(self, device_id=None, nthreads_per_block=64, max_blocks=1024):
- '''Initialize a GPU context on the specified device. If device_id==None,
- the default device is used.'''
-
- if device_id is None:
- self.context = make_default_context()
- else:
- device = cuda.Device(device_id)
- self.context = device.make_context()
-
- print 'device %s' % self.context.get_device().name()
-
- self.nthreads_per_block = nthreads_per_block
- self.max_blocks = max_blocks
-
- self.context.set_cache_config(cuda.func_cache.PREFER_L1)
-
- cuda_options = ['-I' + dirname(chroma.src.__file__), '--use_fast_math', '--ptxas-options=-v']
-
- self.module = SourceModule(chroma.src.kernel, options=cuda_options, no_extern_c=True)
- self.kernels = CUDAFuncs(self.module, ['ray_trace', 'ray_trace_alpha', 'distance_to_mesh', 'rotate', 'rotate_around_point', 'translate', 'update_xyz_lookup', 'update_xyz_image', 'process_image', 'init_rng'])
-
- self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids'])
-
- self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate', 'swap'])
- self.daq_module = SourceModule(chroma.src.daq, options=cuda_options, no_extern_c=True)
- self.daq_funcs = CUDAFuncs(self.daq_module,
- ['reset_earliest_time_int', 'run_daq',
- 'convert_sortable_int_to_float', 'bin_hits',
- 'accumulate_pdf_eval'])
+class GPUGeometry(object):
+ def __init__(self, gpu, geometry, load=True, activate=True, print_usage=True):
+ self.geometry = geometry
- def print_device_usage(self):
- print 'device usage:'
- print format_array('vertices', self.vertices_gpu)
- print format_array('triangles', self.triangles_gpu)
- print format_array('lower_bounds', self.lower_bounds_gpu)
- print format_array('upper_bounds', self.upper_bounds_gpu)
- print format_array('node_map', self.node_map_gpu)
- print format_array('node_map_end', self.node_map_end_gpu)
- print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes))
+ self.module = gpu.module
+ self.gpu_funcs = GPUFuncs(gpu.module)
- def load_geometry(self, geometry, print_usage=True):
- if not hasattr(geometry, 'mesh'):
- print 'WARNING: building geometry with 8-bits'
- geometry.build(bits=8)
+ if load:
+ self.load(activate, print_usage)
- self.geo_funcs.set_wavelength_range(np.float32(standard_wavelengths[0]), np.float32(standard_wavelengths[-1]), np.float32(standard_wavelengths[1]-standard_wavelengths[0]), np.uint32(standard_wavelengths.size), block=(1,1,1), grid=(1,1))
+ def load(self, activate=True, print_usage=True):
+ self.gpu_funcs.set_wavelength_range(np.float32(standard_wavelengths[0]), np.float32(standard_wavelengths[-1]), np.float32(standard_wavelengths[1]-standard_wavelengths[0]), np.uint32(standard_wavelengths.size), block=(1,1,1), grid=(1,1))
self.materials = []
- for i in range(len(geometry.unique_materials)):
- material = copy(geometry.unique_materials[i])
+ for i in range(len(self.geometry.unique_materials)):
+ material = copy(self.geometry.unique_materials[i])
if material is None:
raise Exception('one or more triangles is missing a material.')
- material.refractive_index_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, material.refractive_index[:,0], material.refractive_index[:,1]).astype(np.float32))
- material.absorption_length_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, material.absorption_length[:,0], material.absorption_length[:,1]).astype(np.float32))
- material.scattering_length_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, material.scattering_length[:,0], material.scattering_length[:,1]).astype(np.float32))
+ material.refractive_index_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.refractive_index[:,0], material.refractive_index[:,1]).astype(np.float32))
+ material.absorption_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.absorption_length[:,0], material.absorption_length[:,1]).astype(np.float32))
+ material.scattering_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.scattering_length[:,0], material.scattering_length[:,1]).astype(np.float32))
- self.geo_funcs.set_material(np.int32(i), material.refractive_index_gpu, material.absorption_length_gpu, material.scattering_length_gpu, block=(1,1,1), grid=(1,1))
+ self.gpu_funcs.set_material(np.int32(i), material.refractive_index_gpu, material.absorption_length_gpu, material.scattering_length_gpu, block=(1,1,1), grid=(1,1))
self.materials.append(material)
self.surfaces = []
- for i in range(len(geometry.unique_surfaces)):
- surface = copy(geometry.unique_surfaces[i])
+ for i in range(len(self.geometry.unique_surfaces)):
+ surface = copy(self.geometry.unique_surfaces[i])
if surface is None:
continue
- surface.detect_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.detect[:,0], surface.detect[:,1]).astype(np.float32))
- surface.absorb_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.absorb[:,0], surface.absorb[:,1]).astype(np.float32))
- surface.reflect_diffuse_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.reflect_diffuse[:,0], surface.reflect_diffuse[:,1]).astype(np.float32))
- surface.reflect_specular_gpu = gpuarray.to_gpu(np.interp(standard_wavelengths, surface.reflect_specular[:,0], surface.reflect_specular[:,1]).astype(np.float32))
+ surface.detect_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.detect[:,0], surface.detect[:,1]).astype(np.float32))
+ surface.absorb_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.absorb[:,0], surface.absorb[:,1]).astype(np.float32))
+ surface.reflect_diffuse_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_diffuse[:,0], surface.reflect_diffuse[:,1]).astype(np.float32))
+ surface.reflect_specular_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_specular[:,0], surface.reflect_specular[:,1]).astype(np.float32))
- self.geo_funcs.set_surface(np.int32(i), surface.detect_gpu, surface.absorb_gpu, surface.reflect_diffuse_gpu, surface.reflect_specular_gpu, block=(1,1,1), grid=(1,1))
+ self.gpu_funcs.set_surface(np.int32(i), surface.detect_gpu, surface.absorb_gpu, surface.reflect_diffuse_gpu, surface.reflect_specular_gpu, block=(1,1,1), grid=(1,1))
self.surfaces.append(surface)
- self.vertices_gpu = gpuarray.to_gpu(to_float3(geometry.mesh.vertices))
+ self.vertices_gpu = ga.to_gpu(to_float3(self.geometry.mesh.vertices))
triangles = \
- np.empty(len(geometry.mesh.triangles), dtype=gpuarray.vec.uint4)
- triangles['x'] = geometry.mesh.triangles[:,0]
- triangles['y'] = geometry.mesh.triangles[:,1]
- triangles['z'] = geometry.mesh.triangles[:,2]
- triangles['w'] = ((geometry.material1_index & 0xff) << 24) | ((geometry.material2_index & 0xff) << 16) | ((geometry.surface_index & 0xff) << 8)
- self.triangles_gpu = gpuarray.to_gpu(triangles)
+ np.empty(len(self.geometry.mesh.triangles), dtype=ga.vec.uint4)
+ triangles['x'] = self.geometry.mesh.triangles[:,0]
+ triangles['y'] = self.geometry.mesh.triangles[:,1]
+ triangles['z'] = self.geometry.mesh.triangles[:,2]
+ triangles['w'] = ((self.geometry.material1_index & 0xff) << 24) | ((self.geometry.material2_index & 0xff) << 16) | ((self.geometry.surface_index & 0xff) << 8)
+ self.triangles_gpu = ga.to_gpu(triangles)
- self.lower_bounds_gpu = gpuarray.to_gpu(to_float3(geometry.lower_bounds))
+ self.lower_bounds_gpu = ga.to_gpu(to_float3(self.geometry.lower_bounds))
- self.upper_bounds_gpu = gpuarray.to_gpu(to_float3(geometry.upper_bounds))
+ self.upper_bounds_gpu = ga.to_gpu(to_float3(self.geometry.upper_bounds))
- self.colors_gpu = gpuarray.to_gpu(geometry.colors.astype(np.uint32))
- self.node_map_gpu = gpuarray.to_gpu(geometry.node_map.astype(np.uint32))
- self.node_map_end_gpu = gpuarray.to_gpu(geometry.node_map_end.astype(np.uint32))
- self.solid_id_map_gpu = gpuarray.to_gpu(geometry.solid_id.astype(np.uint32))
-
- self.geo_funcs.set_global_mesh_variables(self.triangles_gpu, self.vertices_gpu, self.colors_gpu, np.uint32(geometry.node_map.size-1), np.uint32(geometry.first_node), self.lower_bounds_gpu, self.upper_bounds_gpu, block=(1,1,1), grid=(1,1))
+ self.colors_gpu = ga.to_gpu(self.geometry.colors.astype(np.uint32))
+ self.node_map_gpu = ga.to_gpu(self.geometry.node_map.astype(np.uint32))
+ self.node_map_end_gpu = ga.to_gpu(self.geometry.node_map_end.astype(np.uint32))
+ self.solid_id_map_gpu = ga.to_gpu(self.geometry.solid_id.astype(np.uint32))
self.node_map_tex = self.module.get_texref('node_map')
self.node_map_end_tex = self.module.get_texref('node_map_end')
+ if print_usage:
+ self.print_device_usage()
+
+ if activate:
+ self.activate()
+
+ def activate(self):
+ self.gpu_funcs.set_global_mesh_variables(self.triangles_gpu, self.vertices_gpu, self.colors_gpu, np.uint32(self.geometry.node_map.size-1), np.uint32(self.geometry.first_node), self.lower_bounds_gpu, self.upper_bounds_gpu, block=(1,1,1), grid=(1,1))
+
self.node_map_tex.set_address(self.node_map_gpu.gpudata, self.node_map_gpu.nbytes)
self.node_map_end_tex.set_address(self.node_map_end_gpu.gpudata, self.node_map_end_gpu.nbytes)
self.node_map_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1)
self.node_map_end_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1)
- self.geometry = geometry
-
- if print_usage:
- print 'average of %f child nodes per node' % (np.mean(geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]))
- print '%i nodes with one child' % (np.count_nonzero((geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]) == 1))
- print '%i leaf nodes with one child' % (np.count_nonzero((geometry.node_map_end[:geometry.first_node] - geometry.node_map[:geometry.first_node]) == 1))
- self.print_device_usage()
+ def print_device_usage(self):
+ print 'device usage:'
+ print '-'*10
+ print format_array('vertices', self.vertices_gpu)
+ print format_array('triangles', self.triangles_gpu)
+ print format_array('lower_bounds', self.lower_bounds_gpu)
+ print format_array('upper_bounds', self.upper_bounds_gpu)
+ print format_array('node_map', self.node_map_gpu)
+ print format_array('node_map_end', self.node_map_end_gpu)
+ print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes))
+ print '-'*10
+ free, total = cuda.mem_get_info()
+ print '%-15s %6s %6s' % ('device total', '', format_size(total))
+ print '%-15s %6s %6s' % ('device used', '', format_size(total-free))
+ print '%-15s %6s %6s' % ('device free', '', format_size(free))
+ print
def reset_colors(self):
self.colors_gpu.set_async(self.geometry.colors.astype(np.uint32))
def color_solids(self, solid_hit, colors):
- solid_hit_gpu = gpuarray.to_gpu(np.array(solid_hit, dtype=np.bool))
- solid_colors_gpu = gpuarray.to_gpu(np.array(colors, dtype=np.uint32))
+ solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool))
+ solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32))
- for first_triangle, triangles_this_round, blocks in chunk_iterator(self.triangles_gpu.size, self.nthreads_per_block, self.max_blocks):
- self.geo_funcs.color_solids(np.int32(first_triangle),
+ for first_triangle, triangles_this_round, blocks in \
+ chunk_iterator(self.triangles_gpu.size):
+ self.gpu_funcs.color_solids(np.int32(first_triangle),
np.int32(triangles_this_round),
self.solid_id_map_gpu,
solid_hit_gpu,
solid_colors_gpu,
- block=(self.nthreads_per_block,1,1),
+ block=(64,1,1),
grid=(blocks,1))
- self.context.synchronize()
-
- def setup_propagate(self, seed=1):
- self.rng_states_gpu = cuda.mem_alloc(self.nthreads_per_block * self.max_blocks
- * sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
- self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthreads_per_block), self.rng_states_gpu,
- np.uint64(seed), np.uint64(0),
- block=(self.nthreads_per_block,1,1),
- grid=(self.max_blocks,1))
- #self.context.synchronize()
-
- def load_photons(self, photons):
- '''Load photons onto the GPU from an event.Photons object.
-
- If photon.histories or photon.last_hit_triangles are set to none,
- they will be initialized to 0 and -1 on the GPU, respectively.
- '''
- self.nphotons = len(photons.positions)
- assert len(photons.directions) == self.nphotons
- assert len(photons.polarizations) == self.nphotons
- assert len(photons.wavelengths) == self.nphotons
-
- self.positions_gpu = gpuarray.to_gpu(to_float3(photons.positions))
- self.directions_gpu = gpuarray.to_gpu(to_float3(photons.directions))
- self.polarizations_gpu = gpuarray.to_gpu(to_float3(photons.polarizations))
- self.wavelengths_gpu = gpuarray.to_gpu(photons.wavelengths.astype(np.float32))
-
- if photons.times is not None:
- self.times_gpu = gpuarray.to_gpu(photons.times.astype(np.float32))
- else:
- self.times_gpu = gpuarray.zeros(self.nphotons, dtype=np.float32)
- if photons.histories is not None:
- self.histories_gpu = gpuarray.to_gpu(photons.histories.astype(np.uint32))
- else:
- self.histories_gpu = gpuarray.zeros(self.nphotons, dtype=np.uint32)
-
- if photons.last_hit_triangles is not None:
- self.last_hit_triangles_gpu = gpuarray.to_gpu(photons.last_hit_triangles.astype(np.int32))
- else:
- self.last_hit_triangles_gpu = gpuarray.empty(self.nphotons, dtype=np.int32)
- self.last_hit_triangles_gpu.fill(-1)
-
-
- def propagate(self, max_steps=10):
- '''Propagate photons on GPU to termination or max_steps, whichever comes first.
-
- May be called repeatedly without reloading photon information if single-stepping
- through photon history.
- '''
- nphotons = self.nphotons
- step = 0
- input_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32)
- input_queue[1:] = np.arange(self.nphotons, dtype=np.uint32)
- input_queue_gpu = gpuarray.to_gpu(input_queue)
- output_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32)
- output_queue[0] = 1
- output_queue_gpu = gpuarray.to_gpu(output_queue)
-
-
- while step < max_steps:
- ## Just finish the rest of the steps if the # of photons is low
- if nphotons < self.nthreads_per_block * 16 * 8:
- nsteps = max_steps - step
- else:
- nsteps = 1
-
- for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthreads_per_block, self.max_blocks):
- self.prop_funcs.propagate(np.int32(first_photon),
- np.int32(photons_this_round),
- input_queue_gpu[1:],
- output_queue_gpu,
- self.rng_states_gpu, self.positions_gpu,
- self.directions_gpu,
- self.wavelengths_gpu, self.polarizations_gpu,
- self.times_gpu, self.histories_gpu,
- self.last_hit_triangles_gpu,
- np.int32(nsteps),
- block=(self.nthreads_per_block,1,1),
- grid=(blocks, 1))
-
- step += nsteps
-
- if step < max_steps:
- temp = input_queue_gpu
- input_queue_gpu = output_queue_gpu
- output_queue_gpu = temp
- output_queue_gpu[:1].set(np.uint32(1))
- nphotons = input_queue_gpu[:1].get()[0] - 1
-
- if gpuarray.max(self.histories_gpu).get() & (1 << 31):
- print >>sys.stderr, "WARNING: ABORTED PHOTONS IN THIS EVENT"
-
- if 'profile' in __builtins__:
- self.context.synchronize()
-
- def get_photons(self):
- '''Returns a dictionary of current photon state information.
-
- Contents of dictionary have the same names as the parameters to load_photons().
- '''
- return event.Photons(positions=self.positions_gpu.get().view(np.float32).reshape(self.positions_gpu.size, 3),
- directions=self.directions_gpu.get().view(np.float32).reshape(self.directions_gpu.size, 3),
- polarizations=self.polarizations_gpu.get().view(np.float32).reshape(self.polarizations_gpu.size, 3),
- wavelengths=self.wavelengths_gpu.get(),
- times=self.times_gpu.get(),
- histories=self.histories_gpu.get(),
- last_hit_triangles=self.last_hit_triangles_gpu.get())
-
- def setup_daq(self, max_pmt_id, pmt_rms=1.2e-9):
- self.earliest_time_gpu = gpuarray.GPUArray(shape=(max_pmt_id+1,), dtype=np.float32)
- self.earliest_time_int_gpu = gpuarray.GPUArray(shape=self.earliest_time_gpu.shape,
- dtype=np.uint32)
- self.channel_history_gpu = gpuarray.zeros_like(self.earliest_time_int_gpu)
- self.channel_q_gpu = gpuarray.zeros_like(self.earliest_time_int_gpu)
+class GPUDaq(object):
+ def __init__(self, gpu_geometry, max_pmt_id, pmt_rms=1.2e-9):
+ self.earliest_time_gpu = ga.empty(max_pmt_id+1, dtype=np.float32)
+ self.earliest_time_int_gpu = ga.empty(max_pmt_id+1, dtype=np.uint32)
+ self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
+ self.channel_q_gpu = ga.zeros_like(self.earliest_time_int_gpu)
self.daq_pmt_rms = pmt_rms
+ self.solid_id_map_gpu = gpu_geometry.solid_id_map_gpu
+
+ self.module = get_cu_module('daq', include_source_directory=False)
+ self.gpu_funcs = GPUFuncs(self.module)
- def run_daq(self):
- self.daq_funcs.reset_earliest_time_int(np.float32(1e9),
- np.int32(len(self.earliest_time_int_gpu)),
- self.earliest_time_int_gpu,
- block=(self.nthreads_per_block,1,1),
- grid=(len(self.earliest_time_int_gpu)//self.nthreads_per_block+1,1))
+ def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024):
+ self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
self.channel_q_gpu.fill(0)
self.channel_history_gpu.fill(0)
- #self.context.synchronize()
- for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthreads_per_block, self.max_blocks):
- self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2),
- np.float32(self.daq_pmt_rms),
- np.int32(first_photon),
- np.int32(photons_this_round),
- self.times_gpu, self.histories_gpu,
- self.last_hit_triangles_gpu,
- self.solid_id_map_gpu,
- np.int32(len(self.earliest_time_int_gpu)),
- self.earliest_time_int_gpu,
- self.channel_q_gpu,
- self.channel_history_gpu,
- block=(self.nthreads_per_block,1,1), grid=(blocks,1))
- #self.context.synchronize()
- self.daq_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)),
- self.earliest_time_int_gpu,
- self.earliest_time_gpu,
- block=(self.nthreads_per_block,1,1),
- grid=(len(self.earliest_time_int_gpu)//self.nthreads_per_block+1,1))
- if 'profile' in __builtins__:
- self.context.synchronize()
-
+ n = len(gpuphotons.pos)
- def get_hits(self):
- t = self.earliest_time_gpu.get()
- # For now, assume all channels with small enough hit time were hit
- return event.Channels(hit=t<1e8, t=t,
- q=self.channel_q_gpu.get().astype(np.float32),
- histories=self.channel_history_gpu.get())
+ for first_photon, photons_this_round, blocks in \
+ chunk_iterator(n, nthreads_per_block, max_blocks):
+ self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), np.float32(self.daq_pmt_rms), np.int32(first_photon), np.int32(photons_this_round), gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, self.solid_id_map_gpu, np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.channel_q_gpu, self.channel_history_gpu, block=(nthreads_per_block,1,1), grid=(blocks,1))
+
+ self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
+
+ return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu)
+
+class GPUPDF(object):
+ def __init__(self):
+ self.module = get_cu_module('daq')
+ self.gpu_funcs = GPUFuncs(self.module)
def setup_pdf(self, max_pmt_id, tbins, trange, qbins, qrange):
- '''Setup GPU arrays to hold PDF information.
+ """Setup GPU arrays to hold PDF information.
max_pmt_id: int, largest PMT id #
tbins: number of time bins
trange: tuple of (min, max) time in PDF
qbins: number of charge bins
qrange: tuple of (min, max) charge in PDF
- '''
+ """
self.events_in_histogram = 0
- self.hitcount_gpu = gpuarray.zeros(max_pmt_id+1, dtype=np.uint32)
- self.pdf_gpu = gpuarray.zeros(shape=(max_pmt_id+1, tbins, qbins),
+ self.hitcount_gpu = ga.zeros(max_pmt_id+1, dtype=np.uint32)
+ self.pdf_gpu = ga.zeros(shape=(max_pmt_id+1, tbins, qbins),
dtype=np.uint32)
self.tbins = tbins
self.trange = trange
@@ -395,16 +391,14 @@ class GPU(object):
self.qrange = qrange
def clear_pdf(self):
- '''Rezero the PDF counters.'''
+ """Rezero the PDF counters."""
self.hitcount_gpu.fill(0)
self.pdf_gpu.fill(0)
- def add_hits_to_pdf(self):
- '''Put the most recent results of run_daq() into the PDF.'''
-
- self.daq_funcs.bin_hits(np.int32(len(self.hitcount_gpu)),
- self.channel_q_gpu,
- self.earliest_time_gpu,
+ def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64):
+ self.gpu_funcs.bin_hits(np.int32(len(self.hitcount_gpu)),
+ gpuchannels.q,
+ gpuchannels.t,
self.hitcount_gpu,
np.int32(self.tbins),
np.float32(self.trange[0]),
@@ -413,20 +407,21 @@ class GPU(object):
np.float32(self.qrange[0]),
np.float32(self.qrange[1]),
self.pdf_gpu,
- block=(self.nthreads_per_block,1,1),
- grid=(len(self.earliest_time_gpu)//self.nthreads_per_block+1,1))
+ block=(nthreads_per_block,1,1),
+ grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
self.events_in_histogram += 1
def get_pdfs(self):
- '''Returns the 1D hitcount array and the 3D [channel, time, charge] histogram'''
+ """Returns the 1D hitcount array and the 3D [channel, time, charge]
+ histogram."""
return self.hitcount_gpu.get(), self.pdf_gpu.get()
- def setup_pdf_eval(self, event_hit, event_time, event_charge,
- min_twidth, trange, min_qwidth, qrange, min_bin_content=10,
+ def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth,
+ trange, min_qwidth, qrange, min_bin_content=10,
time_only=True):
- '''Setup GPU arrays to compute PDF values for the given event.
+ """Setup GPU arrays to compute PDF values for the given event.
The pdf_eval calculation allows the PDF to be evaluated at a
single point for each channel as the Monte Carlo is run. The
effective bin size will be as small as (`min_twidth`,
@@ -455,14 +450,14 @@ class GPU(object):
The bin will be expanded to include at least this many events
time_only: bool
If True, only the time observable will be used in the PDF.
- '''
- self.event_hit_gpu = gpuarray.to_gpu(event_hit.astype(np.uint32))
- self.event_time_gpu = gpuarray.to_gpu(event_time.astype(np.float32))
- self.event_charge_gpu = gpuarray.to_gpu(event_charge.astype(np.float32))
-
- self.eval_hitcount_gpu = gpuarray.zeros(len(event_hit), dtype=np.uint32)
- self.eval_bincount_gpu = gpuarray.zeros(len(event_hit), dtype=np.uint32)
- self.nearest_mc_gpu = gpuarray.empty(shape=len(event_hit) * min_bin_content,
+ """
+ self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
+ self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
+ self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))
+
+ self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
+ self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
+ self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content,
dtype=np.float32)
self.nearest_mc_gpu.fill(1e9)
@@ -474,21 +469,20 @@ class GPU(object):
self.time_only = time_only
def clear_pdf_eval(self):
- '''Reset PDF evaluation counters to start accumulating new Monte Carlo.'''
+ "Reset PDF evaluation counters to start accumulating new Monte Carlo."
self.eval_hitcount_gpu.fill(0)
self.eval_bincount_gpu.fill(0)
self.nearest_mc_gpu.fill(1e9)
- def accumulate_pdf_eval(self):
- '''Add the most recent results of run_daq() to the PDF evaluation.'''
-
- self.daq_funcs.accumulate_pdf_eval(np.int32(self.time_only),
+ def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64):
+ "Add the most recent results of run_daq() to the PDF evaluation."
+ self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only),
np.int32(len(self.event_hit_gpu)),
self.event_hit_gpu,
self.event_time_gpu,
self.event_charge_gpu,
- self.earliest_time_gpu,
- self.channel_q_gpu,
+ gpuchannels.t,
+ gpuchannels.q,
self.eval_hitcount_gpu,
self.eval_bincount_gpu,
np.float32(self.min_twidth),
@@ -499,8 +493,8 @@ class GPU(object):
np.float32(self.qrange[1]),
self.nearest_mc_gpu,
np.int32(self.min_bin_content),
- block=(self.nthreads_per_block,1,1),
- grid=(len(self.earliest_time_gpu)//self.nthreads_per_block+1,1))
+ block=(nthreads_per_block,1,1),
+ grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
def get_pdf_eval(self):
evhit = self.event_hit_gpu.get().astype(bool)
@@ -525,8 +519,9 @@ class GPU(object):
nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content))
- # Deal with the case where we did not even get min_bin_content events in the PDF
- # but also clamp the lower range to ensure we don't index by a negative number in 2 lines
+ # Deal with the case where we did not even get min_bin_content events
+ # in the PDF but also clamp the lower range to ensure we don't index
+ # by a negative number in 2 lines
last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1)
distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry]
@@ -544,5 +539,31 @@ class GPU(object):
return hitcount, pdf_value, pdf_value * pdf_frac_uncert
+class GPU(object):
+ def __init__(self, device_id=None):
+ """Initialize a GPU context on the specified device.
+ If device_id is None, the default device is used."""
+
+ if device_id is None:
+ self.context = pycuda.tools.make_default_context()
+ else:
+ device = cuda.Device(device_id)
+ self.context = device.make_context()
+
+ print 'device %s' % self.context.get_device().name()
+
+ self.print_mem_info()
+
+ self.context.set_cache_config(cuda.func_cache.PREFER_L1)
+
+ cuda_options = ['--use_fast_math']#, '--ptxas-options=-v']
+
+ self.module = get_cu_module('kernel', options=cuda_options)
+
+ def print_mem_info(self):
+ free, total = cuda.mem_get_info()
+
+ print '%.1f %% of device memory is free.' % ((free/float(total))*100)
+
def __del__(self):
self.context.pop()