import numpy as np from copy import copy import time import pycuda.driver as cuda from pycuda.characterize import sizeof from pycuda.compiler import SourceModule from pycuda import gpuarray import threading import Queue import src class GPUThread(threading.Thread): def __init__(self, device_id, geometry, jobs, output, nblocks=64, max_nthreads=100000): threading.Thread.__init__(self) self.device_id = device_id self.geometry = copy(geometry) self.jobs = jobs self.output = output self.nblocks = nblocks self.max_nthreads = max_nthreads self._stop = threading.Event() def stop(self): self._stop.set() def stopped(self): return self._stop.is_set() def run(self): device = cuda.Device(self.device_id) context = device.make_context() module = SourceModule(src.kernel, options=['-I' + src.dir], no_extern_c=True, cache_dir=False) propagate = module.get_function('propagate') fill_uniform = module.get_function('fill_uniform') fill_uniform_sphere = module.get_function('fill_uniform_sphere') init_rng = module.get_function('init_rng') rng_states_gpu = cuda.mem_alloc(self.max_nthreads*sizeof('curandStateXORWOW', '#include ')) init_rng(np.int32(self.max_nthreads), rng_states_gpu, np.int32(self.device_id), np.int32(0), block=(self.nblocks,1,1), grid=(self.max_nthreads//self.nblocks+1,1)) self.geometry.load(module) daq_module = SourceModule(src.daq, options=['-I' + src.dir], no_extern_c=True, cache_dir=False) reset_earliest_time_int = daq_module.get_function('reset_earliest_time_int') run_daq = daq_module.get_function('run_daq') convert_sortable_int_to_float = daq_module.get_function('convert_sortable_int_to_float') earliest_time_gpu = gpuarray.GPUArray(shape=(max(self.geometry.pmtids)+1,), dtype=np.float32) earliest_time_int_gpu = gpuarray.GPUArray(shape=earliest_time_gpu.shape, dtype=np.uint32) solid_map_gpu = gpuarray.to_gpu(self.geometry.solid_id.astype(np.int32)) while not self.stopped(): try: position, nphotons = self.jobs.get(block=False, timeout=0.1) except Queue.Empty: continue t0 = time.time() gpu_kwargs = {'block': (self.nblocks,1,1), 'grid': (nphotons//self.nblocks+1,1)} positions_gpu = gpuarray.empty(nphotons, dtype=gpuarray.vec.float3) positions_gpu.fill(position) directions_gpu = gpuarray.empty(nphotons, dtype=gpuarray.vec.float3) fill_uniform_sphere(np.int32(nphotons), rng_states_gpu, directions_gpu, **gpu_kwargs) wavelengths_gpu = gpuarray.empty(nphotons, dtype=np.float32) fill_uniform(np.int32(nphotons), rng_states_gpu, wavelengths_gpu, np.float32(200), np.float32(800), **gpu_kwargs) polarizations_gpu = gpuarray.empty(nphotons, dtype=gpuarray.vec.float3) fill_uniform_sphere(np.int32(nphotons), rng_states_gpu, polarizations_gpu, **gpu_kwargs) times_gpu = gpuarray.zeros(nphotons, dtype=np.float32) states_gpu = gpuarray.empty(nphotons, dtype=np.int32) states_gpu.fill(-1) last_hit_triangles_gpu = gpuarray.empty(nphotons, dtype=np.int32) last_hit_triangles_gpu.fill(-1) max_steps = 10 propagate(np.int32(nphotons), rng_states_gpu, positions_gpu, directions_gpu, wavelengths_gpu, polarizations_gpu, times_gpu, states_gpu, last_hit_triangles_gpu, np.int32(self.geometry.node_map.size-1), np.int32(self.geometry.first_node), np.int32(max_steps), **gpu_kwargs) reset_earliest_time_int(np.float32(1e9), np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, block=(self.nblocks,1,1), grid=(len(earliest_time_int_gpu)//self.nblocks+1,1)) run_daq(rng_states_gpu, np.int32(2), np.float32(1.2e-9), np.int32(nphotons), times_gpu, states_gpu, last_hit_triangles_gpu, solid_map_gpu, np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1)) convert_sortable_int_to_float(np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, earliest_time_gpu, block=(self.nblocks,1,1), grid=(len(earliest_time_int_gpu)//self.nblocks+1,1)) cuda.Context.synchronize() elapsed = time.time() - t0 #print 'device %i; elapsed %f sec' % (self.device_id, elapsed) self.output.put(earliest_time_gpu.get()) self.jobs.task_done() context.pop()