summaryrefslogtreecommitdiff
path: root/gpu.py
diff options
context:
space:
mode:
Diffstat (limited to 'gpu.py')
-rw-r--r--gpu.py413
1 files changed, 273 insertions, 140 deletions
diff --git a/gpu.py b/gpu.py
index 721ea67..de99e0a 100644
--- a/gpu.py
+++ b/gpu.py
@@ -8,7 +8,7 @@ import sys
import pytools
import pycuda.tools
from pycuda.compiler import SourceModule
-import pycuda.characterize
+from pycuda import characterize
import pycuda.driver as cuda
from pycuda import gpuarray as ga
@@ -20,24 +20,37 @@ from chroma import event
cuda.init()
-#@pycuda.tools.context_dependent_memoize
+# standard nvcc options
+cuda_options = ('--use_fast_math',)#, '--ptxas-options=-v']
+
+@pycuda.tools.context_dependent_memoize
def get_cu_module(name, options=None, include_source_directory=True):
"""Returns a pycuda.compiler.SourceModule object from a CUDA source file
located in the chroma src directory at src/[name].cu."""
if options is None:
options = []
+ elif isinstance(options, tuple):
+ options = list(options)
+ else:
+ raise TypeError('`options` must be a tuple.')
srcdir = dirname(chroma.src.__file__)
if include_source_directory:
options += ['-I' + srcdir]
- with open('%s/%s.cu' % (srcdir, name)) as f:
+ with open('%s/%s' % (srcdir, name)) as f:
source = f.read()
return pycuda.compiler.SourceModule(source, options=options,
no_extern_c=True)
+def get_cu_source(name):
+ srcdir = dirname(chroma.src.__file__)
+ with open('%s/%s' % (srcdir, name)) as f:
+ source = f.read()
+ return source
+
class GPUFuncs(object):
"""Simple container class for GPU functions as attributes."""
def __init__(self, module):
@@ -73,7 +86,7 @@ __global__ void init_rng(int nthreads, curandState *s, unsigned long long seed,
def get_rng_states(size, seed=1):
"Return `size` number of CUDA random number generator states."
- rng_states = cuda.mem_alloc(size*pycuda.characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
+ rng_states = cuda.mem_alloc(size*characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True)
init_rng = module.get_function('init_rng')
@@ -88,6 +101,12 @@ def to_float3(arr):
arr = np.asarray(arr, order='c')
return arr.astype(np.float32).view(ga.vec.float3)[:,0]
+def to_uint3(arr):
+ "Returns a pycuda.gpuarray.vec.uint3 array from an (N,3) array."
+ if not arr.flags['C_CONTIGUOUS']:
+ arr = np.asarray(arr, order='c')
+ return arr.astype(np.uint32).view(ga.vec.uint3)[:,0]
+
def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024):
"""Iterator that yields tuples with the values requried to process
a long array in multiple kernel passes on the GPU.
@@ -121,6 +140,11 @@ class GPUPhotons(object):
self.last_hit_triangles = ga.to_gpu(photons.last_hit_triangles.astype(np.int32))
self.flags = ga.to_gpu(photons.flags.astype(np.uint32))
+ #cuda_options = ('--use_fast_math', '-w')#, '--ptxas-options=-v']
+
+ module = get_cu_module('propagate.cu', options=cuda_options)
+ self.gpu_funcs = GPUFuncs(module)
+
def get(self):
pos = self.pos.get().view(np.float32).reshape((len(self.pos),3))
dir = self.dir.get().view(np.float32).reshape((len(self.dir),3))
@@ -131,6 +155,50 @@ class GPUPhotons(object):
flags = self.flags.get()
return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags)
+ def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
+ max_blocks=1024, max_steps=10):
+ """Propagate photons on GPU to termination or max_steps, whichever
+ comes first.
+
+ May be called repeatedly without reloading photon information if
+ single-stepping through photon history.
+
+ ..warning::
+ `rng_states` must have at least `nthreads_per_block`*`max_blocks`
+ number of curandStates.
+ """
+ nphotons = self.pos.size
+ step = 0
+ input_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
+ input_queue[1:] = np.arange(nphotons, dtype=np.uint32)
+ input_queue_gpu = ga.to_gpu(input_queue)
+ output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
+ output_queue[0] = 1
+ output_queue_gpu = ga.to_gpu(output_queue)
+
+ while step < max_steps:
+ # Just finish the rest of the steps if the # of photons is low
+ if nphotons < nthreads_per_block * 16 * 8:
+ nsteps = max_steps - step
+ else:
+ nsteps = 1
+
+ for first_photon, photons_this_round, blocks in \
+ chunk_iterator(nphotons, nthreads_per_block, max_blocks):
+ self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, np.int32(nsteps), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1))
+
+ step += nsteps
+
+ if step < max_steps:
+ temp = input_queue_gpu
+ input_queue_gpu = output_queue_gpu
+ output_queue_gpu = temp
+ output_queue_gpu[:1].set(np.uint32(1))
+ nphotons = input_queue_gpu[:1].get()[0] - 1
+
+ if ga.max(self.flags).get() & (1 << 31):
+ print >>sys.stderr, "WARNING: ABORTED PHOTONS"
+
class GPUChannels(object):
def __init__(self, t, q, flags):
self.t = t
@@ -145,71 +213,89 @@ class GPUChannels(object):
# enough hit time were hit.
return event.Channels(t<1e8, t, q, self.flags.get())
-def propagate(gpu, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, max_steps=10):
- """Propagate photons on GPU to termination or max_steps, whichever
- comes first.
-
- May be called repeatedly without reloading photon information if
- single-stepping through photon history.
-
- ..warning::
- `rng_states` must have at least `nthreads_per_block`*`max_blocks`
- number of curandStates.
- """
- nphotons = gpuphotons.pos.size
- step = 0
- input_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
- input_queue[1:] = np.arange(nphotons, dtype=np.uint32)
- input_queue_gpu = ga.to_gpu(input_queue)
- output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
- output_queue[0] = 1
- output_queue_gpu = ga.to_gpu(output_queue)
-
- propagate = gpu.module.get_function('propagate')
-
- while step < max_steps:
- # Just finish the rest of the steps if the # of photons is low
- if nphotons < nthreads_per_block * 16 * 8:
- nsteps = max_steps - step
- else:
- nsteps = 1
-
- for first_photon, photons_this_round, blocks in \
- chunk_iterator(nphotons, nthreads_per_block, max_blocks):
- propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, gpuphotons.pos, gpuphotons.dir, gpuphotons.wavelengths, gpuphotons.pol, gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, np.int32(nsteps), block=(nthreads_per_block,1,1), grid=(blocks, 1))
-
- step += nsteps
-
- if step < max_steps:
- temp = input_queue_gpu
- input_queue_gpu = output_queue_gpu
- output_queue_gpu = temp
- output_queue_gpu[:1].set(np.uint32(1))
- nphotons = input_queue_gpu[:1].get()[0] - 1
-
- if ga.max(gpuphotons.flags).get() & (1 << 31):
- print >>sys.stderr, "WARNING: ABORTED PHOTONS"
-
class GPURays(object):
- def __init__(self, pos, dir, nblocks=64):
+ """The GPURays class holds arrays of ray positions and directions
+ on the GPU that are used to render a geometry."""
+ def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
self.pos = ga.to_gpu(to_float3(pos))
self.dir = ga.to_gpu(to_float3(dir))
+ self.max_alpha_depth = max_alpha_depth
+
self.nblocks = nblocks
- self.module = get_cu_module('transform')
- self.gpu_funcs = GPUFuncs(self.module)
+ transform_module = get_cu_module('transform.cu', options=cuda_options)
+ self.transform_funcs = GPUFuncs(transform_module)
+
+ render_module = get_cu_module('render.cu', options=cuda_options)
+ self.render_funcs = GPUFuncs(render_module)
+
+ self.dx = ga.empty(max_alpha_depth*self.pos.size, dtype=np.float32)
+ self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
+ self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32)
def rotate(self, phi, n):
- self.gpu_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
- self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
+ "Rotate by an angle phi around the axis `n`."
+ self.transform_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+ self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
def rotate_around_point(self, phi, n, point):
- self.gpu_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
- self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
+ """"Rotate by an angle phi around the axis `n` passing through
+ the point `point`."""
+ self.transform_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+ self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))
def translate(self, v):
- self.gpu_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+ "Translate the ray positions by the vector `v`."
+ self.transform_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+
+ def render(self, gpu_geometry, pixels, alpha_depth=10,
+ keep_last_render=False):
+ """Render `gpu_geometry` and fill the GPU array `pixels` with pixel
+ colors."""
+ if not keep_last_render:
+ self.dxlen.fill(0)
+
+ if alpha_depth > self.max_alpha_depth:
+ raise Exception('alpha_depth > max_alpha_depth')
+
+ if not isinstance(pixels, ga.GPUArray):
+ raise TypeError('`pixels` must be a %s instance.' % ga.GPUArray)
+
+ if pixels.size != self.pos.size:
+ raise ValueError('`pixels`.size != number of rays')
+
+ self.render_funcs.render(np.int32(self.pos.size), self.pos, self.dir, gpu_geometry.gpudata, np.uint32(alpha_depth), pixels, self.dx, self.dxlen, self.color, block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
+
+ def snapshot(self, gpu_geometry, alpha_depth=10):
+ "Render `gpu_geometry` and return a numpy array of pixel colors."
+ pixels = ga.empty(self.pos.size, dtype=np.uint32)
+ self.render(gpu_geometry, pixels, alpha_depth)
+ return pixels.get()
+
+def make_gpu_struct(size, members):
+ struct = cuda.mem_alloc(size)
+
+ i = 0
+ for member in members:
+ if isinstance(member, ga.GPUArray):
+ member = member.gpudata
+
+ if isinstance(member, cuda.DeviceAllocation):
+ if i % 8:
+ raise Exception('cannot align 64-bit pointer. '
+ 'arrange struct member variables in order of '
+ 'decreasing size.')
+
+ cuda.memcpy_htod(int(struct)+i, np.intp(int(member)))
+ i += 8
+ elif np.isscalar(member):
+ cuda.memcpy_htod(int(struct)+i, member)
+ i += member.nbytes
+ else:
+ raise TypeError('expected a GPU device pointer or scalar type.')
+
+ return struct
def format_size(size):
if size < 1e3:
@@ -226,96 +312,142 @@ def format_array(name, array):
(name, format_size(len(array)), format_size(array.nbytes))
class GPUGeometry(object):
- def __init__(self, gpu, geometry, load=True, activate=True, print_usage=True):
- self.geometry = geometry
+ def __init__(self, geometry, wavelengths=None, print_usage=True):
+ if wavelengths is None:
+ wavelengths = standard_wavelengths
+
+ try:
+ wavelength_step = np.unique(np.diff(wavelengths)).item()
+ except ValueError:
+ raise ValueError('wavelengths must be equally spaced apart.')
- self.module = gpu.module
- self.gpu_funcs = GPUFuncs(gpu.module)
+ geometry_source = get_cu_source('geometry.h')
+ material_struct_size = characterize.sizeof('Material', geometry_source)
+ surface_struct_size = characterize.sizeof('Surface', geometry_source)
+ geometry_struct_size = characterize.sizeof('Geometry', geometry_source)
- if load:
- self.load(activate, print_usage)
+ self.material_data = []
+ self.material_ptrs = []
- def load(self, activate=True, print_usage=True):
- self.gpu_funcs.set_wavelength_range(np.float32(standard_wavelengths[0]), np.float32(standard_wavelengths[-1]), np.float32(standard_wavelengths[1]-standard_wavelengths[0]), np.uint32(standard_wavelengths.size), block=(1,1,1), grid=(1,1))
+ def interp_material_property(wavelengths, property):
+ # note that it is essential that the material properties be
+ # interpolated linearly. this fact is used in the propagation
+ # code to guarantee that probabilities still sum to one.
+ return np.interp(wavelengths, property[:,0], property[:,1]).astype(np.float32)
- self.materials = []
- for i in range(len(self.geometry.unique_materials)):
- material = copy(self.geometry.unique_materials[i])
+ for i in range(len(geometry.unique_materials)):
+ material = geometry.unique_materials[i]
if material is None:
raise Exception('one or more triangles is missing a material.')
- material.refractive_index_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.refractive_index[:,0], material.refractive_index[:,1]).astype(np.float32))
- material.absorption_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.absorption_length[:,0], material.absorption_length[:,1]).astype(np.float32))
- material.scattering_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.scattering_length[:,0], material.scattering_length[:,1]).astype(np.float32))
+ refractive_index = interp_material_property(wavelengths, material.refractive_index)
+ refractive_index_gpu = ga.to_gpu(refractive_index)
+ absorption_length = interp_material_property(wavelengths, material.absorption_length)
+ absorption_length_gpu = ga.to_gpu(absorption_length)
+ scattering_length = interp_material_property(wavelengths, material.scattering_length)
+ scattering_length_gpu = ga.to_gpu(scattering_length)
- self.gpu_funcs.set_material(np.int32(i), material.refractive_index_gpu, material.absorption_length_gpu, material.scattering_length_gpu, block=(1,1,1), grid=(1,1))
+ self.material_data.append(refractive_index_gpu)
+ self.material_data.append(absorption_length_gpu)
+ self.material_data.append(scattering_length_gpu)
- self.materials.append(material)
+ material_gpu = \
+ make_gpu_struct(material_struct_size,
+ [refractive_index_gpu, absorption_length_gpu,
+ scattering_length_gpu,
+ np.uint32(len(wavelengths)),
+ np.float32(wavelength_step),
+ np.float32(wavelengths[0])])
- self.surfaces = []
- for i in range(len(self.geometry.unique_surfaces)):
- surface = copy(self.geometry.unique_surfaces[i])
-
- if surface is None:
- continue
+ self.material_ptrs.append(material_gpu)
- surface.detect_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.detect[:,0], surface.detect[:,1]).astype(np.float32))
- surface.absorb_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.absorb[:,0], surface.absorb[:,1]).astype(np.float32))
- surface.reflect_diffuse_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_diffuse[:,0], surface.reflect_diffuse[:,1]).astype(np.float32))
- surface.reflect_specular_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_specular[:,0], surface.reflect_specular[:,1]).astype(np.float32))
+ self.material_pointer_array = \
+ make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs)
- self.gpu_funcs.set_surface(np.int32(i), surface.detect_gpu, surface.absorb_gpu, surface.reflect_diffuse_gpu, surface.reflect_specular_gpu, block=(1,1,1), grid=(1,1))
+ self.surface_data = []
+ self.surface_ptrs = []
- self.surfaces.append(surface)
+ for i in range(len(geometry.unique_surfaces)):
+ surface = geometry.unique_surfaces[i]
- self.vertices_gpu = ga.to_gpu(to_float3(self.geometry.mesh.vertices))
-
- triangles = \
- np.empty(len(self.geometry.mesh.triangles), dtype=ga.vec.uint4)
- triangles['x'] = self.geometry.mesh.triangles[:,0]
- triangles['y'] = self.geometry.mesh.triangles[:,1]
- triangles['z'] = self.geometry.mesh.triangles[:,2]
- triangles['w'] = ((self.geometry.material1_index & 0xff) << 24) | ((self.geometry.material2_index & 0xff) << 16) | ((self.geometry.surface_index & 0xff) << 8)
- self.triangles_gpu = ga.to_gpu(triangles)
-
- self.lower_bounds_gpu = ga.to_gpu(to_float3(self.geometry.lower_bounds))
-
- self.upper_bounds_gpu = ga.to_gpu(to_float3(self.geometry.upper_bounds))
+ if surface is None:
+ # need something to copy to the surface array struct
+ # that is the same size as a 64-bit pointer.
+ # this pointer will never be used by the simulation.
+ self.surface_ptrs.append(np.uint64(0))
+ continue
- self.colors_gpu = ga.to_gpu(self.geometry.colors.astype(np.uint32))
- self.node_map_gpu = ga.to_gpu(self.geometry.node_map.astype(np.uint32))
- self.node_map_end_gpu = ga.to_gpu(self.geometry.node_map_end.astype(np.uint32))
- self.solid_id_map_gpu = ga.to_gpu(self.geometry.solid_id.astype(np.uint32))
+ detect = interp_material_property(wavelengths, surface.detect)
+ detect_gpu = ga.to_gpu(detect)
+ absorb = interp_material_property(wavelengths, surface.absorb)
+ absorb_gpu = ga.to_gpu(absorb)
+ reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse)
+ reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse)
+ reflect_specular = interp_material_property(wavelengths, surface.reflect_specular)
+ reflect_specular_gpu = ga.to_gpu(reflect_specular)
+
+ self.surface_data.append(detect_gpu)
+ self.surface_data.append(absorb_gpu)
+ self.surface_data.append(reflect_diffuse_gpu)
+ self.surface_data.append(reflect_specular_gpu)
+
+ surface_gpu = \
+ make_gpu_struct(surface_struct_size,
+ [detect_gpu, absorb_gpu,
+ reflect_diffuse_gpu,
+ reflect_specular_gpu,
+ np.uint32(len(wavelengths)),
+ np.float32(wavelength_step),
+ np.float32(wavelengths[0])])
+
+ self.surface_ptrs.append(surface_gpu)
+
+ self.surface_pointer_array = \
+ make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)
+
+ self.vertices = ga.to_gpu(to_float3(geometry.mesh.vertices))
+ self.triangles = ga.to_gpu(to_uint3(geometry.mesh.triangles))
+
+ material_codes = (((geometry.material1_index & 0xff) << 24) |
+ ((geometry.material2_index & 0xff) << 16) |
+ ((geometry.surface_index & 0xff) << 8)).astype(np.uint32)
+
+ self.material_codes = ga.to_gpu(material_codes)
+
+ self.lower_bounds = ga.to_gpu(to_float3(geometry.lower_bounds))
+ self.upper_bounds = ga.to_gpu(to_float3(geometry.upper_bounds))
+ self.colors = ga.to_gpu(geometry.colors.astype(np.uint32))
+ self.node_map = ga.to_gpu(geometry.node_map.astype(np.uint32))
+ self.node_map_end = ga.to_gpu(geometry.node_map_end.astype(np.uint32))
+ self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))
+
+ self.gpudata = make_gpu_struct(geometry_struct_size,
+ [self.vertices, self.triangles,
+ self.material_codes,
+ self.colors, self.lower_bounds,
+ self.upper_bounds, self.node_map,
+ self.node_map_end,
+ self.material_pointer_array,
+ self.surface_pointer_array,
+ np.uint32(geometry.start_node),
+ np.uint32(geometry.first_node)])
- self.node_map_tex = self.module.get_texref('node_map')
- self.node_map_end_tex = self.module.get_texref('node_map_end')
+ self.geometry = geometry
if print_usage:
self.print_device_usage()
- if activate:
- self.activate()
-
- def activate(self):
- self.gpu_funcs.set_global_mesh_variables(self.triangles_gpu, self.vertices_gpu, self.colors_gpu, np.uint32(self.geometry.node_map.size-1), np.uint32(self.geometry.first_node), self.lower_bounds_gpu, self.upper_bounds_gpu, block=(1,1,1), grid=(1,1))
-
- self.node_map_tex.set_address(self.node_map_gpu.gpudata, self.node_map_gpu.nbytes)
- self.node_map_end_tex.set_address(self.node_map_end_gpu.gpudata, self.node_map_end_gpu.nbytes)
-
- self.node_map_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1)
- self.node_map_end_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1)
-
def print_device_usage(self):
print 'device usage:'
print '-'*10
- print format_array('vertices', self.vertices_gpu)
- print format_array('triangles', self.triangles_gpu)
- print format_array('lower_bounds', self.lower_bounds_gpu)
- print format_array('upper_bounds', self.upper_bounds_gpu)
- print format_array('node_map', self.node_map_gpu)
- print format_array('node_map_end', self.node_map_end_gpu)
- print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes))
+ print format_array('vertices', self.vertices)
+ print format_array('triangles', self.triangles)
+ print format_array('lower_bounds', self.lower_bounds)
+ print format_array('upper_bounds', self.upper_bounds)
+ print format_array('node_map', self.node_map)
+ print format_array('node_map_end', self.node_map_end)
+ print '%-15s %6s %6s' % ('total', '', format_size(self.vertices.nbytes + self.triangles.nbytes + self.lower_bounds.nbytes + self.upper_bounds.nbytes + self.node_map.nbytes + self.node_map_end.nbytes))
print '-'*10
free, total = cuda.mem_get_info()
print '%-15s %6s %6s' % ('device total', '', format_size(total))
@@ -324,21 +456,24 @@ class GPUGeometry(object):
print
def reset_colors(self):
- self.colors_gpu.set_async(self.geometry.colors.astype(np.uint32))
+ self.colors.set_async(self.geometry.colors.astype(np.uint32))
- def color_solids(self, solid_hit, colors):
+ def color_solids(self, solid_hit, colors, nblocks_per_thread=64,
+ max_blocks=1024):
solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool))
solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32))
+ module = get_cu_module('mesh.h', options=cuda_options)
+ color_solids = module.get_function('color_solids')
+
for first_triangle, triangles_this_round, blocks in \
- chunk_iterator(self.triangles_gpu.size):
- self.gpu_funcs.color_solids(np.int32(first_triangle),
- np.int32(triangles_this_round),
- self.solid_id_map_gpu,
- solid_hit_gpu,
- solid_colors_gpu,
- block=(64,1,1),
- grid=(blocks,1))
+ chunk_iterator(self.triangles.size, nblocks_per_thread,
+ max_blocks):
+ color_solids(np.int32(first_triangle),
+ np.int32(triangles_this_round), self.solid_id_map,
+ solid_hit_gpu, solid_colors_gpu, self.gpudata,
+ block=(nblocks_per_thread,1,1),
+ grid=(blocks,1))
class GPUDaq(object):
def __init__(self, gpu_geometry, max_pmt_id, pmt_rms=1.2e-9):
@@ -347,9 +482,10 @@ class GPUDaq(object):
self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
self.channel_q_gpu = ga.zeros_like(self.earliest_time_int_gpu)
self.daq_pmt_rms = pmt_rms
- self.solid_id_map_gpu = gpu_geometry.solid_id_map_gpu
+ self.solid_id_map_gpu = gpu_geometry.solid_id_map
- self.module = get_cu_module('daq', include_source_directory=False)
+ self.module = get_cu_module('daq.cu', options=cuda_options,
+ include_source_directory=False)
self.gpu_funcs = GPUFuncs(self.module)
def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024):
@@ -369,7 +505,8 @@ class GPUDaq(object):
class GPUPDF(object):
def __init__(self):
- self.module = get_cu_module('daq')
+ self.module = get_cu_module('daq.cu', options=cuda_options,
+ include_source_directory=False)
self.gpu_funcs = GPUFuncs(self.module)
def setup_pdf(self, max_pmt_id, tbins, trange, qbins, qrange):
@@ -556,10 +693,6 @@ class GPU(object):
self.context.set_cache_config(cuda.func_cache.PREFER_L1)
- cuda_options = ['--use_fast_math']#, '--ptxas-options=-v']
-
- self.module = get_cu_module('kernel', options=cuda_options)
-
def print_mem_info(self):
free, total = cuda.mem_get_info()