diff options
Diffstat (limited to 'gpu.py')
-rw-r--r-- | gpu.py | 413 |
1 files changed, 273 insertions, 140 deletions
@@ -8,7 +8,7 @@ import sys import pytools import pycuda.tools from pycuda.compiler import SourceModule -import pycuda.characterize +from pycuda import characterize import pycuda.driver as cuda from pycuda import gpuarray as ga @@ -20,24 +20,37 @@ from chroma import event cuda.init() -#@pycuda.tools.context_dependent_memoize +# standard nvcc options +cuda_options = ('--use_fast_math',)#, '--ptxas-options=-v'] + +@pycuda.tools.context_dependent_memoize def get_cu_module(name, options=None, include_source_directory=True): """Returns a pycuda.compiler.SourceModule object from a CUDA source file located in the chroma src directory at src/[name].cu.""" if options is None: options = [] + elif isinstance(options, tuple): + options = list(options) + else: + raise TypeError('`options` must be a tuple.') srcdir = dirname(chroma.src.__file__) if include_source_directory: options += ['-I' + srcdir] - with open('%s/%s.cu' % (srcdir, name)) as f: + with open('%s/%s' % (srcdir, name)) as f: source = f.read() return pycuda.compiler.SourceModule(source, options=options, no_extern_c=True) +def get_cu_source(name): + srcdir = dirname(chroma.src.__file__) + with open('%s/%s' % (srcdir, name)) as f: + source = f.read() + return source + class GPUFuncs(object): """Simple container class for GPU functions as attributes.""" def __init__(self, module): @@ -73,7 +86,7 @@ __global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, def get_rng_states(size, seed=1): "Return `size` number of CUDA random number generator states." - rng_states = cuda.mem_alloc(size*pycuda.characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) + rng_states = cuda.mem_alloc(size*characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True) init_rng = module.get_function('init_rng') @@ -88,6 +101,12 @@ def to_float3(arr): arr = np.asarray(arr, order='c') return arr.astype(np.float32).view(ga.vec.float3)[:,0] +def to_uint3(arr): + "Returns a pycuda.gpuarray.vec.uint3 array from an (N,3) array." + if not arr.flags['C_CONTIGUOUS']: + arr = np.asarray(arr, order='c') + return arr.astype(np.uint32).view(ga.vec.uint3)[:,0] + def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024): """Iterator that yields tuples with the values requried to process a long array in multiple kernel passes on the GPU. @@ -121,6 +140,11 @@ class GPUPhotons(object): self.last_hit_triangles = ga.to_gpu(photons.last_hit_triangles.astype(np.int32)) self.flags = ga.to_gpu(photons.flags.astype(np.uint32)) + #cuda_options = ('--use_fast_math', '-w')#, '--ptxas-options=-v'] + + module = get_cu_module('propagate.cu', options=cuda_options) + self.gpu_funcs = GPUFuncs(module) + def get(self): pos = self.pos.get().view(np.float32).reshape((len(self.pos),3)) dir = self.dir.get().view(np.float32).reshape((len(self.dir),3)) @@ -131,6 +155,50 @@ class GPUPhotons(object): flags = self.flags.get() return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags) + def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64, + max_blocks=1024, max_steps=10): + """Propagate photons on GPU to termination or max_steps, whichever + comes first. + + May be called repeatedly without reloading photon information if + single-stepping through photon history. + + ..warning:: + `rng_states` must have at least `nthreads_per_block`*`max_blocks` + number of curandStates. + """ + nphotons = self.pos.size + step = 0 + input_queue = np.zeros(shape=nphotons+1, dtype=np.uint32) + input_queue[1:] = np.arange(nphotons, dtype=np.uint32) + input_queue_gpu = ga.to_gpu(input_queue) + output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32) + output_queue[0] = 1 + output_queue_gpu = ga.to_gpu(output_queue) + + while step < max_steps: + # Just finish the rest of the steps if the # of photons is low + if nphotons < nthreads_per_block * 16 * 8: + nsteps = max_steps - step + else: + nsteps = 1 + + for first_photon, photons_this_round, blocks in \ + chunk_iterator(nphotons, nthreads_per_block, max_blocks): + self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, np.int32(nsteps), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1)) + + step += nsteps + + if step < max_steps: + temp = input_queue_gpu + input_queue_gpu = output_queue_gpu + output_queue_gpu = temp + output_queue_gpu[:1].set(np.uint32(1)) + nphotons = input_queue_gpu[:1].get()[0] - 1 + + if ga.max(self.flags).get() & (1 << 31): + print >>sys.stderr, "WARNING: ABORTED PHOTONS" + class GPUChannels(object): def __init__(self, t, q, flags): self.t = t @@ -145,71 +213,89 @@ class GPUChannels(object): # enough hit time were hit. return event.Channels(t<1e8, t, q, self.flags.get()) -def propagate(gpu, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, max_steps=10): - """Propagate photons on GPU to termination or max_steps, whichever - comes first. - - May be called repeatedly without reloading photon information if - single-stepping through photon history. - - ..warning:: - `rng_states` must have at least `nthreads_per_block`*`max_blocks` - number of curandStates. - """ - nphotons = gpuphotons.pos.size - step = 0 - input_queue = np.zeros(shape=nphotons+1, dtype=np.uint32) - input_queue[1:] = np.arange(nphotons, dtype=np.uint32) - input_queue_gpu = ga.to_gpu(input_queue) - output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32) - output_queue[0] = 1 - output_queue_gpu = ga.to_gpu(output_queue) - - propagate = gpu.module.get_function('propagate') - - while step < max_steps: - # Just finish the rest of the steps if the # of photons is low - if nphotons < nthreads_per_block * 16 * 8: - nsteps = max_steps - step - else: - nsteps = 1 - - for first_photon, photons_this_round, blocks in \ - chunk_iterator(nphotons, nthreads_per_block, max_blocks): - propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, gpuphotons.pos, gpuphotons.dir, gpuphotons.wavelengths, gpuphotons.pol, gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, np.int32(nsteps), block=(nthreads_per_block,1,1), grid=(blocks, 1)) - - step += nsteps - - if step < max_steps: - temp = input_queue_gpu - input_queue_gpu = output_queue_gpu - output_queue_gpu = temp - output_queue_gpu[:1].set(np.uint32(1)) - nphotons = input_queue_gpu[:1].get()[0] - 1 - - if ga.max(gpuphotons.flags).get() & (1 << 31): - print >>sys.stderr, "WARNING: ABORTED PHOTONS" - class GPURays(object): - def __init__(self, pos, dir, nblocks=64): + """The GPURays class holds arrays of ray positions and directions + on the GPU that are used to render a geometry.""" + def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64): self.pos = ga.to_gpu(to_float3(pos)) self.dir = ga.to_gpu(to_float3(dir)) + self.max_alpha_depth = max_alpha_depth + self.nblocks = nblocks - self.module = get_cu_module('transform') - self.gpu_funcs = GPUFuncs(self.module) + transform_module = get_cu_module('transform.cu', options=cuda_options) + self.transform_funcs = GPUFuncs(transform_module) + + render_module = get_cu_module('render.cu', options=cuda_options) + self.render_funcs = GPUFuncs(render_module) + + self.dx = ga.empty(max_alpha_depth*self.pos.size, dtype=np.float32) + self.color = ga.empty(self.dx.size, dtype=ga.vec.float4) + self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32) def rotate(self, phi, n): - self.gpu_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) - self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) + "Rotate by an angle phi around the axis `n`." + self.transform_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) def rotate_around_point(self, phi, n, point): - self.gpu_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) - self.gpu_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) + """"Rotate by an angle phi around the axis `n` passing through + the point `point`.""" + self.transform_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1)) def translate(self, v): - self.gpu_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + "Translate the ray positions by the vector `v`." + self.transform_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + + def render(self, gpu_geometry, pixels, alpha_depth=10, + keep_last_render=False): + """Render `gpu_geometry` and fill the GPU array `pixels` with pixel + colors.""" + if not keep_last_render: + self.dxlen.fill(0) + + if alpha_depth > self.max_alpha_depth: + raise Exception('alpha_depth > max_alpha_depth') + + if not isinstance(pixels, ga.GPUArray): + raise TypeError('`pixels` must be a %s instance.' % ga.GPUArray) + + if pixels.size != self.pos.size: + raise ValueError('`pixels`.size != number of rays') + + self.render_funcs.render(np.int32(self.pos.size), self.pos, self.dir, gpu_geometry.gpudata, np.uint32(alpha_depth), pixels, self.dx, self.dxlen, self.color, block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1)) + + def snapshot(self, gpu_geometry, alpha_depth=10): + "Render `gpu_geometry` and return a numpy array of pixel colors." + pixels = ga.empty(self.pos.size, dtype=np.uint32) + self.render(gpu_geometry, pixels, alpha_depth) + return pixels.get() + +def make_gpu_struct(size, members): + struct = cuda.mem_alloc(size) + + i = 0 + for member in members: + if isinstance(member, ga.GPUArray): + member = member.gpudata + + if isinstance(member, cuda.DeviceAllocation): + if i % 8: + raise Exception('cannot align 64-bit pointer. ' + 'arrange struct member variables in order of ' + 'decreasing size.') + + cuda.memcpy_htod(int(struct)+i, np.intp(int(member))) + i += 8 + elif np.isscalar(member): + cuda.memcpy_htod(int(struct)+i, member) + i += member.nbytes + else: + raise TypeError('expected a GPU device pointer or scalar type.') + + return struct def format_size(size): if size < 1e3: @@ -226,96 +312,142 @@ def format_array(name, array): (name, format_size(len(array)), format_size(array.nbytes)) class GPUGeometry(object): - def __init__(self, gpu, geometry, load=True, activate=True, print_usage=True): - self.geometry = geometry + def __init__(self, geometry, wavelengths=None, print_usage=True): + if wavelengths is None: + wavelengths = standard_wavelengths + + try: + wavelength_step = np.unique(np.diff(wavelengths)).item() + except ValueError: + raise ValueError('wavelengths must be equally spaced apart.') - self.module = gpu.module - self.gpu_funcs = GPUFuncs(gpu.module) + geometry_source = get_cu_source('geometry.h') + material_struct_size = characterize.sizeof('Material', geometry_source) + surface_struct_size = characterize.sizeof('Surface', geometry_source) + geometry_struct_size = characterize.sizeof('Geometry', geometry_source) - if load: - self.load(activate, print_usage) + self.material_data = [] + self.material_ptrs = [] - def load(self, activate=True, print_usage=True): - self.gpu_funcs.set_wavelength_range(np.float32(standard_wavelengths[0]), np.float32(standard_wavelengths[-1]), np.float32(standard_wavelengths[1]-standard_wavelengths[0]), np.uint32(standard_wavelengths.size), block=(1,1,1), grid=(1,1)) + def interp_material_property(wavelengths, property): + # note that it is essential that the material properties be + # interpolated linearly. this fact is used in the propagation + # code to guarantee that probabilities still sum to one. + return np.interp(wavelengths, property[:,0], property[:,1]).astype(np.float32) - self.materials = [] - for i in range(len(self.geometry.unique_materials)): - material = copy(self.geometry.unique_materials[i]) + for i in range(len(geometry.unique_materials)): + material = geometry.unique_materials[i] if material is None: raise Exception('one or more triangles is missing a material.') - material.refractive_index_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.refractive_index[:,0], material.refractive_index[:,1]).astype(np.float32)) - material.absorption_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.absorption_length[:,0], material.absorption_length[:,1]).astype(np.float32)) - material.scattering_length_gpu = ga.to_gpu(np.interp(standard_wavelengths, material.scattering_length[:,0], material.scattering_length[:,1]).astype(np.float32)) + refractive_index = interp_material_property(wavelengths, material.refractive_index) + refractive_index_gpu = ga.to_gpu(refractive_index) + absorption_length = interp_material_property(wavelengths, material.absorption_length) + absorption_length_gpu = ga.to_gpu(absorption_length) + scattering_length = interp_material_property(wavelengths, material.scattering_length) + scattering_length_gpu = ga.to_gpu(scattering_length) - self.gpu_funcs.set_material(np.int32(i), material.refractive_index_gpu, material.absorption_length_gpu, material.scattering_length_gpu, block=(1,1,1), grid=(1,1)) + self.material_data.append(refractive_index_gpu) + self.material_data.append(absorption_length_gpu) + self.material_data.append(scattering_length_gpu) - self.materials.append(material) + material_gpu = \ + make_gpu_struct(material_struct_size, + [refractive_index_gpu, absorption_length_gpu, + scattering_length_gpu, + np.uint32(len(wavelengths)), + np.float32(wavelength_step), + np.float32(wavelengths[0])]) - self.surfaces = [] - for i in range(len(self.geometry.unique_surfaces)): - surface = copy(self.geometry.unique_surfaces[i]) - - if surface is None: - continue + self.material_ptrs.append(material_gpu) - surface.detect_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.detect[:,0], surface.detect[:,1]).astype(np.float32)) - surface.absorb_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.absorb[:,0], surface.absorb[:,1]).astype(np.float32)) - surface.reflect_diffuse_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_diffuse[:,0], surface.reflect_diffuse[:,1]).astype(np.float32)) - surface.reflect_specular_gpu = ga.to_gpu(np.interp(standard_wavelengths, surface.reflect_specular[:,0], surface.reflect_specular[:,1]).astype(np.float32)) + self.material_pointer_array = \ + make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs) - self.gpu_funcs.set_surface(np.int32(i), surface.detect_gpu, surface.absorb_gpu, surface.reflect_diffuse_gpu, surface.reflect_specular_gpu, block=(1,1,1), grid=(1,1)) + self.surface_data = [] + self.surface_ptrs = [] - self.surfaces.append(surface) + for i in range(len(geometry.unique_surfaces)): + surface = geometry.unique_surfaces[i] - self.vertices_gpu = ga.to_gpu(to_float3(self.geometry.mesh.vertices)) - - triangles = \ - np.empty(len(self.geometry.mesh.triangles), dtype=ga.vec.uint4) - triangles['x'] = self.geometry.mesh.triangles[:,0] - triangles['y'] = self.geometry.mesh.triangles[:,1] - triangles['z'] = self.geometry.mesh.triangles[:,2] - triangles['w'] = ((self.geometry.material1_index & 0xff) << 24) | ((self.geometry.material2_index & 0xff) << 16) | ((self.geometry.surface_index & 0xff) << 8) - self.triangles_gpu = ga.to_gpu(triangles) - - self.lower_bounds_gpu = ga.to_gpu(to_float3(self.geometry.lower_bounds)) - - self.upper_bounds_gpu = ga.to_gpu(to_float3(self.geometry.upper_bounds)) + if surface is None: + # need something to copy to the surface array struct + # that is the same size as a 64-bit pointer. + # this pointer will never be used by the simulation. + self.surface_ptrs.append(np.uint64(0)) + continue - self.colors_gpu = ga.to_gpu(self.geometry.colors.astype(np.uint32)) - self.node_map_gpu = ga.to_gpu(self.geometry.node_map.astype(np.uint32)) - self.node_map_end_gpu = ga.to_gpu(self.geometry.node_map_end.astype(np.uint32)) - self.solid_id_map_gpu = ga.to_gpu(self.geometry.solid_id.astype(np.uint32)) + detect = interp_material_property(wavelengths, surface.detect) + detect_gpu = ga.to_gpu(detect) + absorb = interp_material_property(wavelengths, surface.absorb) + absorb_gpu = ga.to_gpu(absorb) + reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse) + reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse) + reflect_specular = interp_material_property(wavelengths, surface.reflect_specular) + reflect_specular_gpu = ga.to_gpu(reflect_specular) + + self.surface_data.append(detect_gpu) + self.surface_data.append(absorb_gpu) + self.surface_data.append(reflect_diffuse_gpu) + self.surface_data.append(reflect_specular_gpu) + + surface_gpu = \ + make_gpu_struct(surface_struct_size, + [detect_gpu, absorb_gpu, + reflect_diffuse_gpu, + reflect_specular_gpu, + np.uint32(len(wavelengths)), + np.float32(wavelength_step), + np.float32(wavelengths[0])]) + + self.surface_ptrs.append(surface_gpu) + + self.surface_pointer_array = \ + make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs) + + self.vertices = ga.to_gpu(to_float3(geometry.mesh.vertices)) + self.triangles = ga.to_gpu(to_uint3(geometry.mesh.triangles)) + + material_codes = (((geometry.material1_index & 0xff) << 24) | + ((geometry.material2_index & 0xff) << 16) | + ((geometry.surface_index & 0xff) << 8)).astype(np.uint32) + + self.material_codes = ga.to_gpu(material_codes) + + self.lower_bounds = ga.to_gpu(to_float3(geometry.lower_bounds)) + self.upper_bounds = ga.to_gpu(to_float3(geometry.upper_bounds)) + self.colors = ga.to_gpu(geometry.colors.astype(np.uint32)) + self.node_map = ga.to_gpu(geometry.node_map.astype(np.uint32)) + self.node_map_end = ga.to_gpu(geometry.node_map_end.astype(np.uint32)) + self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32)) + + self.gpudata = make_gpu_struct(geometry_struct_size, + [self.vertices, self.triangles, + self.material_codes, + self.colors, self.lower_bounds, + self.upper_bounds, self.node_map, + self.node_map_end, + self.material_pointer_array, + self.surface_pointer_array, + np.uint32(geometry.start_node), + np.uint32(geometry.first_node)]) - self.node_map_tex = self.module.get_texref('node_map') - self.node_map_end_tex = self.module.get_texref('node_map_end') + self.geometry = geometry if print_usage: self.print_device_usage() - if activate: - self.activate() - - def activate(self): - self.gpu_funcs.set_global_mesh_variables(self.triangles_gpu, self.vertices_gpu, self.colors_gpu, np.uint32(self.geometry.node_map.size-1), np.uint32(self.geometry.first_node), self.lower_bounds_gpu, self.upper_bounds_gpu, block=(1,1,1), grid=(1,1)) - - self.node_map_tex.set_address(self.node_map_gpu.gpudata, self.node_map_gpu.nbytes) - self.node_map_end_tex.set_address(self.node_map_end_gpu.gpudata, self.node_map_end_gpu.nbytes) - - self.node_map_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1) - self.node_map_end_tex.set_format(cuda.array_format.UNSIGNED_INT32, 1) - def print_device_usage(self): print 'device usage:' print '-'*10 - print format_array('vertices', self.vertices_gpu) - print format_array('triangles', self.triangles_gpu) - print format_array('lower_bounds', self.lower_bounds_gpu) - print format_array('upper_bounds', self.upper_bounds_gpu) - print format_array('node_map', self.node_map_gpu) - print format_array('node_map_end', self.node_map_end_gpu) - print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes)) + print format_array('vertices', self.vertices) + print format_array('triangles', self.triangles) + print format_array('lower_bounds', self.lower_bounds) + print format_array('upper_bounds', self.upper_bounds) + print format_array('node_map', self.node_map) + print format_array('node_map_end', self.node_map_end) + print '%-15s %6s %6s' % ('total', '', format_size(self.vertices.nbytes + self.triangles.nbytes + self.lower_bounds.nbytes + self.upper_bounds.nbytes + self.node_map.nbytes + self.node_map_end.nbytes)) print '-'*10 free, total = cuda.mem_get_info() print '%-15s %6s %6s' % ('device total', '', format_size(total)) @@ -324,21 +456,24 @@ class GPUGeometry(object): print def reset_colors(self): - self.colors_gpu.set_async(self.geometry.colors.astype(np.uint32)) + self.colors.set_async(self.geometry.colors.astype(np.uint32)) - def color_solids(self, solid_hit, colors): + def color_solids(self, solid_hit, colors, nblocks_per_thread=64, + max_blocks=1024): solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool)) solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32)) + module = get_cu_module('mesh.h', options=cuda_options) + color_solids = module.get_function('color_solids') + for first_triangle, triangles_this_round, blocks in \ - chunk_iterator(self.triangles_gpu.size): - self.gpu_funcs.color_solids(np.int32(first_triangle), - np.int32(triangles_this_round), - self.solid_id_map_gpu, - solid_hit_gpu, - solid_colors_gpu, - block=(64,1,1), - grid=(blocks,1)) + chunk_iterator(self.triangles.size, nblocks_per_thread, + max_blocks): + color_solids(np.int32(first_triangle), + np.int32(triangles_this_round), self.solid_id_map, + solid_hit_gpu, solid_colors_gpu, self.gpudata, + block=(nblocks_per_thread,1,1), + grid=(blocks,1)) class GPUDaq(object): def __init__(self, gpu_geometry, max_pmt_id, pmt_rms=1.2e-9): @@ -347,9 +482,10 @@ class GPUDaq(object): self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu) self.channel_q_gpu = ga.zeros_like(self.earliest_time_int_gpu) self.daq_pmt_rms = pmt_rms - self.solid_id_map_gpu = gpu_geometry.solid_id_map_gpu + self.solid_id_map_gpu = gpu_geometry.solid_id_map - self.module = get_cu_module('daq', include_source_directory=False) + self.module = get_cu_module('daq.cu', options=cuda_options, + include_source_directory=False) self.gpu_funcs = GPUFuncs(self.module) def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024): @@ -369,7 +505,8 @@ class GPUDaq(object): class GPUPDF(object): def __init__(self): - self.module = get_cu_module('daq') + self.module = get_cu_module('daq.cu', options=cuda_options, + include_source_directory=False) self.gpu_funcs = GPUFuncs(self.module) def setup_pdf(self, max_pmt_id, tbins, trange, qbins, qrange): @@ -556,10 +693,6 @@ class GPU(object): self.context.set_cache_config(cuda.func_cache.PREFER_L1) - cuda_options = ['--use_fast_math']#, '--ptxas-options=-v'] - - self.module = get_cu_module('kernel', options=cuda_options) - def print_mem_info(self): free, total = cuda.mem_get_info() |