import numpy as np from copy import copy import time import pycuda.driver as cuda from pycuda.characterize import sizeof from pycuda.compiler import SourceModule from pycuda import gpuarray import threading import Queue import src class GPUThread(threading.Thread): def __init__(self, device_id, geometry, jobs, output, nblocks=64, max_nthreads=100000): threading.Thread.__init__(self) self.device_id = device_id self.geometry = copy(geometry) self.jobs = jobs self.output = output self.nblocks = nblocks self.max_nthreads = max_nthreads self._stop = threading.Event() def stop(self): self._stop.set() def stopped(self): return self._stop.is_set() def run(self): device = cuda.Device(self.device_id) context = device.make_context() module = SourceModule(src.kernel, options=['-I' + src.dir], no_extern_c=True, cache_dir=False) propagate = module.get_function('propagate') fill_float = module.get_function('fill_float') fill_float3 = module.get_function('fill_float3') fill_uniform = module.get_function('fill_uniform') fill_uniform_sphere = module.get_function('fill_uniform_sphere') init_rng = module.get_function('init_rng') rng_states_gpu = cuda.mem_alloc(self.max_nthreads*sizeof('curandStateXORWOW', '#include ')) init_rng(np.int32(self.max_nthreads), rng_states_gpu, np.int32(self.device_id), np.int32(0), block=(self.nblocks,1,1), grid=(self.max_nthreads//self.nblocks+1,1)) texrefs = self.geometry.load(module) daq_module = SourceModule(src.daq, options=['-I' + src.dir], no_extern_c=True, cache_dir=False) reset_earliest_time_int = daq_module.get_function('reset_earliest_time_int') run_daq = daq_module.get_function('run_daq') convert_sortable_int_to_float = daq_module.get_function('convert_sortable_int_to_float') earliest_time_gpu = gpuarray.GPUArray(shape=(max(self.geometry.pmtids)+1,), dtype=np.float32) earliest_time_int_gpu = gpuarray.GPUArray(shape=earliest_time_gpu.shape, dtype=np.uint32) solid_map_gpu = gpuarray.to_gpu(self.geometry.solid_id.astype(np.int32)) while not self.stopped(): try: position, nphotons = self.jobs.get(block=False, timeout=0.1) except Queue.Empty: continue t0 = time.time() gpu_kwargs = {'block': (self.nblocks,1,1), 'grid': (nphotons//self.nblocks+1,1)} positions_gpu = cuda.mem_alloc(np.dtype(gpuarray.vec.float3).itemsize*nphotons) fill_float3(np.int32(nphotons), positions_gpu, position, **gpu_kwargs) directions_gpu = cuda.mem_alloc(np.dtype(gpuarray.vec.float3).itemsize*nphotons) fill_uniform_sphere(np.int32(nphotons), rng_states_gpu, directions_gpu, **gpu_kwargs) wavelengths_gpu = cuda.mem_alloc(np.dtype(np.float32).itemsize*nphotons) fill_uniform(np.int32(nphotons), rng_states_gpu, wavelengths_gpu, np.float32(200), np.float32(800), **gpu_kwargs) polarizations_gpu = cuda.mem_alloc(np.dtype(gpuarray.vec.float3).itemsize*nphotons) fill_uniform_sphere(np.int32(nphotons), rng_states_gpu, polarizations_gpu, **gpu_kwargs) times_gpu = cuda.mem_alloc(np.dtype(np.float32).itemsize*nphotons) fill_float(np.int32(nphotons), times_gpu, np.float32(0), **gpu_kwargs) states_gpu = cuda.mem_alloc(np.dtype(np.int32).itemsize*nphotons) fill_float(np.int32(nphotons), states_gpu, np.int32(-1), **gpu_kwargs) last_hit_triangles_gpu = cuda.mem_alloc(np.dtype(np.int32).itemsize*nphotons) fill_float(np.int32(nphotons), last_hit_triangles_gpu, np.int32(-1), **gpu_kwargs) max_steps = 10 propagate(np.int32(nphotons), rng_states_gpu, positions_gpu, directions_gpu, wavelengths_gpu, polarizations_gpu, times_gpu, states_gpu, last_hit_triangles_gpu, np.int32(self.geometry.node_map.size-1), np.int32(self.geometry.first_node), np.int32(max_steps), block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1)) reset_earliest_time_int(np.float32(1e9), np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, block=(self.nblocks,1,1), grid=(len(earliest_time_int_gpu)//self.nblocks+1,1)) run_daq(rng_states_gpu, np.int32(2), np.float32(1.2e-9), np.int32(nphotons), times_gpu, states_gpu, last_hit_triangles_gpu, solid_map_gpu, np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1)) convert_sortable_int_to_float(np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, earliest_time_gpu, block=(self.nblocks,1,1), grid=(len(earliest_time_int_gpu)//self.nblocks+1,1)) cuda.Context.synchronize() elapsed = time.time() - t0 #print 'device %i; elapsed %f sec' % (self.device_id, elapsed) self.output.put(earliest_time_gpu.get()) self.jobs.task_done() context.pop()