1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
import numpy as np
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import threading
import Queue
import src
class Job(object):
    """Unit of work pairing an ``origins`` value with a ``directions`` value.

    Both arguments are stored unmodified as attributes of the same name.
    """

    def __init__(self, origins, directions):
        self.origins = origins
        self.directions = directions
class GPUThread(threading.Thread):
    """Worker thread that runs the ``propagate`` CUDA kernel for queued jobs.

    The thread creates a context on one CUDA device, compiles ``src.kernel``,
    loads the geometry into the compiled module, and then repeatedly takes
    jobs from ``jobs``, runs the kernel, and puts one per-solid hit count
    array on ``output`` for each job.

    Parameters:
        device_id: index passed to ``cuda.Device``.
        geometry: object providing ``load(module)``, ``node_map``,
            ``first_node``, ``solid_id`` and ``solids``.
        jobs: queue of job objects exposing ``origins`` and ``directions``;
            must support ``get(timeout=...)`` and ``task_done()``.
        output: queue-like object whose ``put()`` receives each result.
        nblocks: used as the x dimension of the kernel's ``block`` argument
            (and as the divisor when sizing the grid).
    """

    # Seconds to wait for a job before re-checking the stop flag, so that
    # stop() takes effect even while the job queue is empty.
    poll_timeout = 0.1

    def __init__(self, device_id, geometry, jobs, output, nblocks=64):
        threading.Thread.__init__(self)
        self.device_id = device_id
        self.geometry = geometry
        self.jobs = jobs
        self.output = output
        self.nblocks = nblocks
        # Deliberately not called ``_stop``: Python 3 versions of
        # threading.Thread define a private ``_stop`` method of their own,
        # and shadowing it with an Event breaks ``join()``.
        self._stop_event = threading.Event()

    def stop(self):
        """Ask the thread to exit once the job in progress (if any) is done."""
        self._stop_event.set()

    def stopped(self):
        """Return True if ``stop()`` has been called."""
        return self._stop_event.is_set()

    def run(self):
        """Process jobs on the GPU until ``stop()`` is called.

        The CUDA context is popped on the way out even when compilation,
        geometry loading or a job raises.
        """
        device = cuda.Device(self.device_id)
        context = device.make_context()
        try:
            module = SourceModule(src.kernel, options=['-I' + src.dir],
                                  no_extern_c=True, cache_dir=False)
            propagate = module.get_function('propagate')
            texrefs = self.geometry.load(module)

            while not self.stopped():
                try:
                    # A bounded wait keeps the loop responsive to stop();
                    # an unbounded get() would never raise Queue.Empty.
                    job = self.jobs.get(timeout=self.poll_timeout)
                except Queue.Empty:
                    continue

                dest = self._propagate_job(propagate, texrefs, job)
                self.output.put(self._count_solid_hits(dest))
                self.jobs.task_done()
        finally:
            context.pop()

    def _propagate_job(self, propagate, texrefs, job):
        """Run the kernel for one job and return its int32 result array."""
        count = job.origins.size
        origins_gpu = cuda.to_device(job.origins)
        directions_gpu = cuda.to_device(job.directions)

        dest = np.empty(count, dtype=np.int32)
        dest_gpu = cuda.to_device(dest)

        propagate(np.int32(count), origins_gpu, directions_gpu,
                  np.int32(self.geometry.node_map.size - 1),
                  np.int32(self.geometry.first_node), dest_gpu,
                  block=(self.nblocks, 1, 1),
                  grid=(count // self.nblocks + 1, 1),
                  texrefs=texrefs)

        cuda.Context.synchronize()
        cuda.memcpy_dtoh(dest, dest_gpu)
        return dest

    def _count_solid_hits(self, dest):
        """Tally kernel results per solid.

        Entries of ``dest`` equal to -1 are dropped; the rest index
        ``geometry.solid_id``. Returns a float array with one slot per entry
        of ``geometry.solids``.
        """
        triangles = dest[dest != -1]
        bincount = np.zeros(len(self.geometry.solids))
        gpu_bincount = np.bincount(self.geometry.solid_id[triangles])
        # np.bincount only extends to the largest id seen; the remaining
        # slots keep their zero count.
        bincount[:gpu_bincount.size] = gpu_bincount
        return bincount
|