1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
import numpy as np
import time
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
from pycuda import gpuarray
import threading
import Queue
import src
class Job(object):
def __init__(self, positions, directions, wavelengths, polarizations,
times, states, last_hit_triangles, max_steps=100):
if positions.dtype is not gpuarray.vec.float3:
if len(positions.shape) != 2 or positions.shape[-1] != 3:
raise Exception('shape mismatch')
self.positions = np.empty(positions.shape[0], gpuarray.vec.float3)
self.positions['x'] = positions[:,0]
self.positions['y'] = positions[:,1]
self.positions['z'] = positions[:,2]
else:
self.positions = positions
if directions.dtype is not gpuarray.vec.float3:
if len(directions.shape) != 2 or directions.shape[-1] != 3:
raise Exception('shape mismatch')
self.directions = \
np.empty(directions.shape[0], gpuarray.vec.float3)
self.directions['x'] = directions[:,0]
self.directions['y'] = directions[:,1]
self.directions['z'] = directions[:,2]
else:
self.directions = directions
if polarizations.dtype is not gpuarray.vec.float3:
if len(polarizations.shape) != 2 or polarizations.shape[-1] != 3:
raise Exception('shape mismatch')
self.polarizations = \
np.empty(polarizations.shape[0], gpuarray.vec.float3)
self.polarizations['x'] = polarizations[:,0]
self.polarizations['y'] = polarizations[:,1]
self.polarizations['z'] = polarizations[:,2]
else:
self.polarizations = polarizations
self.wavelengths = np.asarray(wavelengths, dtype=np.float32)
self.times = np.asarray(times, dtype=np.float32)
self.states = np.asarray(states, dtype=np.int32)
self.last_hit_triangles = \
np.asarray(last_hit_triangles, dtype=np.int32)
self.max_steps = max_steps
class GPUThread(threading.Thread):
def __init__(self, device_id, geometry, jobs, output, nblocks=64):
threading.Thread.__init__(self)
self.device_id = device_id
self.geometry = geometry
self.jobs = jobs
self.output = output
self.nblocks = nblocks
self._stop = threading.Event()
def stop(self):
self._stop.set()
def stopped(self):
return self._stop.is_set()
def run(self):
device = cuda.Device(self.device_id)
context = device.make_context()
module = SourceModule(src.kernel, options=['-I' + src.dir], no_extern_c=True, cache_dir=False)
propagate = module.get_function('propagate')
init_rng = module.get_function('init_rng')
texrefs = self.geometry.load(module)
init_rng(np.int32(100000), np.int32(0), np.int32(0), block=(self.nblocks,1,1), grid=(100000//self.nblocks+1,1))
while not self.stopped():
try:
job = self.jobs.get(block=False, timeout=0.5)
except Queue.Empty:
continue
positions_gpu = cuda.to_device(job.positions)
directions_gpu = cuda.to_device(job.directions)
polarizations_gpu = cuda.to_device(job.polarizations)
wavelengths_gpu = cuda.to_device(job.wavelengths)
times_gpu = cuda.to_device(job.times)
states_gpu = cuda.to_device(job.states)
last_hit_triangles_gpu = cuda.to_device(job.last_hit_triangles)
nphotons = len(job.positions)
t0 = time.time()
propagate(np.int32(nphotons), positions_gpu, directions_gpu, wavelengths_gpu, polarizations_gpu, times_gpu, states_gpu, last_hit_triangles_gpu, np.int32(self.geometry.node_map.size-1), np.int32(self.geometry.first_node), np.int32(job.max_steps), block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1))
cuda.Context.synchronize()
elapsed = time.time() - t0
#print 'device %i; elapsed %f sec' % (self.device_id, elapsed)
cuda.memcpy_dtoh(job.positions, positions_gpu)
cuda.memcpy_dtoh(job.directions, directions_gpu)
cuda.memcpy_dtoh(job.wavelengths, wavelengths_gpu)
cuda.memcpy_dtoh(job.polarizations, polarizations_gpu)
cuda.memcpy_dtoh(job.times, times_gpu)
cuda.memcpy_dtoh(job.states, states_gpu)
cuda.memcpy_dtoh(job.last_hit_triangles, last_hit_triangles_gpu)
self.output.put(job)
self.jobs.task_done()
context.pop()
|