1 files changed, 144 insertions, 0 deletions
diff --git a/gputhread.py b/gputhread.py
new file mode 100644
index 0000000..8258f63
--- /dev/null
+++ b/gputhread.py
@@ -0,0 +1,144 @@
+import numpy as np
+import time
+import pycuda.driver as cuda
+from pycuda.compiler import SourceModule
+from pycuda import gpuarray
+import threading
+import Queue
+import src
+
+class Job(object):
+    def __init__(self, positions, directions, wavelengths, polarizations,
+                 times, states, last_hit_triangles, max_steps=100):
+        if positions.dtype is not gpuarray.vec.float3:
+            if len(positions.shape) != 2 or positions.shape[-1] != 3:
+                raise Exception('shape mismatch')
+
+            self.positions = np.empty(positions.shape[0], gpuarray.vec.float3)
+            self.positions['x'] = positions[:,0]
+            self.positions['y'] = positions[:,1]
+            self.positions['z'] = positions[:,2]
+        else:
+            self.positions = positions
+
+        if directions.dtype is not gpuarray.vec.float3:
+            if len(directions.shape) != 2 or directions.shape[-1] != 3:
+                raise Exception('shape mismatch')
+
+            self.directions = \
+                np.empty(directions.shape[0], gpuarray.vec.float3)
+            self.directions['x'] = directions[:,0]
+            self.directions['y'] = directions[:,1]
+            self.directions['z'] = directions[:,2]
+        else:
+            self.directions = directions
+
+        if polarizations.dtype is not gpuarray.vec.float3:
+            if len(polarizations.shape) != 2 or polarizations.shape[-1] != 3:
+                raise Exception('shape mismatch')
+
+            self.polarizations = \
+                np.empty(polarizations.shape[0], gpuarray.vec.float3)
+            self.polarizations['x'] = polarizations[:,0]
+            self.polarizations['y'] = polarizations[:,1]
+            self.polarizations['z'] = polarizations[:,2]
+        else:
+            self.polarizations = polarizations
+
+        self.wavelengths = np.asarray(wavelengths, dtype=np.float32)
+        self.times = np.asarray(times, dtype=np.float32)
+        self.states = np.asarray(states, dtype=np.int32)
+        self.last_hit_triangles = \
+            np.asarray(last_hit_triangles, dtype=np.int32)
+
+        self.max_steps = max_steps
+
+class GPUThread(threading.Thread):
+    def __init__(self, device_id, geometry, jobs, output, nblocks=64):
+        threading.Thread.__init__(self)
+
+        self.device_id = device_id
+        self.geometry = geometry
+        self.jobs = jobs
+        self.output = output
+        self.nblocks = nblocks
+        self._stop = threading.Event()
+
+    def stop(self):
+        self._stop.set()
+
+    def stopped(self):
+        return self._stop.is_set()
+
+    def run(self):
+        device = cuda.Device(self.device_id)
+        context = device.make_context()
+        module = SourceModule(src.kernel, options=['-I' + src.dir], no_extern_c=True, cache_dir=False)
+        propagate = module.get_function('propagate')
+        init_rng = module.get_function('init_rng')
+        texrefs = self.geometry.load(module)
+        
+
+        daq_module = SourceModule(src.daq, options=['-I' + src.dir],
+                                  no_extern_c=True, cache_dir=False)
+        init_daq_rng = daq_module.get_function('init_daq_rng')
+        reset_earliest_time_int = daq_module.get_function('reset_earliest_time_int')
+        run_daq = daq_module.get_function('run_daq')
+        convert_sortable_int_to_float = daq_module.get_function('convert_sortable_int_to_float')
+
+        earliest_time_gpu = gpuarray.GPUArray(shape=(max(self.geometry.pmtids)+1,), dtype=np.float32)
+        earliest_time_int_gpu = gpuarray.GPUArray(shape=earliest_time_gpu.shape, dtype=np.uint32)
+
+        solid_map_gpu = gpuarray.to_gpu(self.geometry.solid_id.astype(np.int32))
+        
+        init_rng(np.int32(100000), np.int32(self.device_id), np.int32(0), block=(self.nblocks,1,1), grid=(100000//self.nblocks+1,1))
+        init_daq_rng(np.int32(100000), np.int32(10000+self.device_id), np.int32(0), block=(self.nblocks,1,1), grid=(100000//self.nblocks+1,1))
+
+        while not self.stopped():
+            try:
+                job = self.jobs.get(block=False, timeout=0.01)
+            except Queue.Empty:
+                continue
+
+            positions_gpu = cuda.to_device(job.positions)
+            directions_gpu = cuda.to_device(job.directions)
+            polarizations_gpu = cuda.to_device(job.polarizations)
+            wavelengths_gpu = cuda.to_device(job.wavelengths)
+            times_gpu = cuda.to_device(job.times)
+            states_gpu = cuda.to_device(job.states)
+            last_hit_triangles_gpu = cuda.to_device(job.last_hit_triangles)
+            
+            nphotons = len(job.positions)
+
+            t0 = time.time()
+            propagate(np.int32(nphotons), positions_gpu, directions_gpu, wavelengths_gpu, polarizations_gpu, times_gpu, states_gpu, last_hit_triangles_gpu, np.int32(self.geometry.node_map.size-1), np.int32(self.geometry.first_node), np.int32(job.max_steps), block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1))
+
+            reset_earliest_time_int(np.float32(1e9), np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu,
+                                    block=(self.nblocks,1,1), 
+                                    grid=(len(earliest_time_int_gpu)//self.nblocks+1,1))
+            run_daq(np.int32(2), np.float32(1.2e-9), np.int32(nphotons), times_gpu, states_gpu,
+                    last_hit_triangles_gpu, solid_map_gpu,
+                    np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu,
+                    block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1))
+            convert_sortable_int_to_float(np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu,
+                                          earliest_time_gpu,
+                                          block=(self.nblocks,1,1), 
+                                          grid=(len(earliest_time_int_gpu)//self.nblocks+1,1))
+
+            cuda.Context.synchronize()
+            elapsed = time.time() - t0
+            
+            #print 'device %i; elapsed %f sec' % (self.device_id, elapsed)
+            cuda.memcpy_dtoh(job.positions, positions_gpu)
+            cuda.memcpy_dtoh(job.directions, directions_gpu)
+            cuda.memcpy_dtoh(job.wavelengths, wavelengths_gpu)
+            cuda.memcpy_dtoh(job.polarizations, polarizations_gpu)
+            cuda.memcpy_dtoh(job.times, times_gpu)
+            cuda.memcpy_dtoh(job.states, states_gpu)
+            cuda.memcpy_dtoh(job.last_hit_triangles, last_hit_triangles_gpu)
+            job.earliest_times = earliest_time_gpu.get()
+
+            self.output.put(job)
+            self.jobs.task_done()
+
+        context.pop()