import numpy as np import time import pycuda.driver as cuda from pycuda.compiler import SourceModule from pycuda import gpuarray import threading import Queue import src class Job(object): def __init__(self, positions, directions, wavelengths, polarizations, times, states, last_hit_triangles, max_steps=100): if positions.dtype is not gpuarray.vec.float3: if len(positions.shape) != 2 or positions.shape[-1] != 3: raise Exception('shape mismatch') self.positions = np.empty(positions.shape[0], gpuarray.vec.float3) self.positions['x'] = positions[:,0] self.positions['y'] = positions[:,1] self.positions['z'] = positions[:,2] else: self.positions = positions if directions.dtype is not gpuarray.vec.float3: if len(directions.shape) != 2 or directions.shape[-1] != 3: raise Exception('shape mismatch') self.directions = \ np.empty(directions.shape[0], gpuarray.vec.float3) self.directions['x'] = directions[:,0] self.directions['y'] = directions[:,1] self.directions['z'] = directions[:,2] else: self.directions = directions if polarizations.dtype is not gpuarray.vec.float3: if len(polarizations.shape) != 2 or polarizations.shape[-1] != 3: raise Exception('shape mismatch') self.polarizations = \ np.empty(polarizations.shape[0], gpuarray.vec.float3) self.polarizations['x'] = polarizations[:,0] self.polarizations['y'] = polarizations[:,1] self.polarizations['z'] = polarizations[:,2] else: self.polarizations = polarizations self.wavelengths = np.asarray(wavelengths, dtype=np.float32) self.times = np.asarray(times, dtype=np.float32) self.states = np.asarray(states, dtype=np.int32) self.last_hit_triangles = \ np.asarray(last_hit_triangles, dtype=np.int32) self.max_steps = max_steps class GPUThread(threading.Thread): def __init__(self, device_id, geometry, jobs, output, nblocks=64): threading.Thread.__init__(self) self.device_id = device_id self.geometry = geometry self.jobs = jobs self.output = output self.nblocks = nblocks self._stop = threading.Event() def stop(self): self._stop.set() def stopped(self): return self._stop.is_set() def run(self): device = cuda.Device(self.device_id) context = device.make_context() module = SourceModule(src.kernel, options=['-I' + src.dir], no_extern_c=True, cache_dir=False) propagate = module.get_function('propagate') texrefs = self.geometry.load(module) while not self.stopped(): try: job = self.jobs.get(block=False, timeout=0.5) except Queue.Empty: continue positions_gpu = cuda.to_device(job.positions) directions_gpu = cuda.to_device(job.directions) polarizations_gpu = cuda.to_device(job.polarizations) wavelengths_gpu = cuda.to_device(job.wavelengths) times_gpu = cuda.to_device(job.times) states_gpu = cuda.to_device(job.states) last_hit_triangles_gpu = cuda.to_device(job.last_hit_triangles) nphotons = len(job.positions) t0 = time.time() propagate(np.int32(nphotons), positions_gpu, directions_gpu, wavelengths_gpu, polarizations_gpu, times_gpu, states_gpu, last_hit_triangles_gpu, np.int32(self.geometry.node_map.size-1), np.int32(self.geometry.first_node), np.int32(job.max_steps), block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1)) cuda.Context.synchronize() elapsed = time.time() - t0 #print 'device %i; elapsed %f sec' % (self.device_id, elapsed) cuda.memcpy_dtoh(job.positions, positions_gpu) cuda.memcpy_dtoh(job.directions, directions_gpu) cuda.memcpy_dtoh(job.wavelengths, wavelengths_gpu) cuda.memcpy_dtoh(job.polarizations, polarizations_gpu) cuda.memcpy_dtoh(job.times, times_gpu) cuda.memcpy_dtoh(job.states, states_gpu) cuda.memcpy_dtoh(job.last_hit_triangles, last_hit_triangles_gpu) self.output.put(job) self.jobs.task_done() context.pop()