summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--gpu.py76
-rw-r--r--src/kernel.cu28
2 files changed, 86 insertions, 18 deletions
diff --git a/gpu.py b/gpu.py
index f0cf19e..c9e6269 100644
--- a/gpu.py
+++ b/gpu.py
@@ -15,6 +15,26 @@ from color import map_to_color
cuda.init()
+def boolean_argsort(condition):
+ '''Returns two arrays of indicies indicating which elements of the
+ boolean array `condition` should be exchanged so that all of the
+ True entries occur before the false entries.
+
+ Returns: tuple (a,b, ntrue) which give the indices for elements
+ to exchange in a and b, and the number of true entries in the array
+ in ntrue. This function in general requires fewer swaps than
+ numpy.argsort.
+ '''
+
+ true_indices = condition.nonzero()[0][::-1] # reverse
+ false_indices = (~condition).nonzero()[0]
+
+ length = min(len(true_indices), len(false_indices))
+
+ cut_index = np.searchsorted((true_indices[:length] - false_indices[:length]) <= 0, True)
+ return (true_indices[:cut_index], false_indices[:cut_index], len(true_indices))
+
+
def chunk_iterator(nelements, nthreads_per_block, max_blocks):
'''Iterator that yields tuples with the values requried to process
a long array in multiple kernel passes on the GPU.
@@ -27,7 +47,7 @@ def chunk_iterator(nelements, nthreads_per_block, max_blocks):
first = 0
while first < nelements:
elements_left = nelements - first
- blocks = elements_left // nthreads_per_block
+ blocks = int(elements_left // nthreads_per_block)
if elements_left % nthreads_per_block != 0:
blocks += 1 #Round up only if needed
blocks = min(max_blocks, blocks)
@@ -80,7 +100,7 @@ class GPU(object):
self.module = SourceModule(src.kernel, options=cuda_options, no_extern_c=True)
self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids'])
- self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate'])
+ self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate', 'swap'])
self.nthread_per_block = 64
self.max_blocks = 1024
self.daq_module = SourceModule(src.daq, options=cuda_options, no_extern_c=True)
@@ -230,7 +250,6 @@ class GPU(object):
self.polarizations_gpu = gpuarray.to_gpu(pol.astype(np.float32).view(gpuarray.vec.float3))
self.wavelengths_gpu = gpuarray.to_gpu(wavelength.astype(np.float32))
self.times_gpu = gpuarray.to_gpu(t0.astype(np.float32))
- self.photon_offsets_gpu = gpuarray.arange(self.nphotons, dtype=np.int32)
if histories is not None:
self.histories_gpu = gpuarray.to_gpu(histories.astype(np.uint32))
@@ -250,18 +269,45 @@ class GPU(object):
May be called repeatedly without reloading photon information if single-stepping
through photon history.
'''
- for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks):
- self.prop_funcs.propagate(np.int32(first_photon),
- np.int32(photons_this_round),
- self.photon_offsets_gpu,
- self.rng_states_gpu, self.positions_gpu,
- self.directions_gpu,
- self.wavelengths_gpu, self.polarizations_gpu,
- self.times_gpu, self.histories_gpu,
- self.last_hit_triangles_gpu,
- np.int32(max_steps),
- block=(self.nthread_per_block,1,1),
- grid=(blocks, 1))
+ nphotons = self.nphotons
+ step = 0
+ input_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32)
+ input_queue[1:] = np.arange(self.nphotons, dtype=np.uint32)
+ input_queue_gpu = gpuarray.to_gpu(input_queue)
+ output_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32)
+ output_queue[0] = 1
+ output_queue_gpu = gpuarray.to_gpu(output_queue)
+
+
+ while step < max_steps:
+ ## Just finish the rest of the steps if the # of photons is low
+ if nphotons < self.nthread_per_block * 16 * 8:
+ nsteps = max_steps - step
+ else:
+ nsteps = 1
+
+ for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthread_per_block, self.max_blocks):
+ self.prop_funcs.propagate(np.int32(first_photon),
+ np.int32(photons_this_round),
+ input_queue_gpu[1:],
+ output_queue_gpu,
+ self.rng_states_gpu, self.positions_gpu,
+ self.directions_gpu,
+ self.wavelengths_gpu, self.polarizations_gpu,
+ self.times_gpu, self.histories_gpu,
+ self.last_hit_triangles_gpu,
+ np.int32(nsteps),
+ block=(self.nthread_per_block,1,1),
+ grid=(blocks, 1))
+
+ step += nsteps
+
+ if step < max_steps:
+ temp = input_queue_gpu
+ input_queue_gpu = output_queue_gpu
+ output_queue_gpu = temp
+ output_queue_gpu[:1].set(np.uint32(1))
+ nphotons = input_queue_gpu[:1].get()[0]
if 'profile' in __builtins__:
self.context.synchronize()
diff --git a/src/kernel.cu b/src/kernel.cu
index a26460a..cc567b8 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -292,7 +292,24 @@ __global__ void ray_trace_alpha(int nthreads, float3 *positions, float3 *directi
} // ray_trace
-__global__ void propagate(int first_photon, int nthreads, int *photon_offsets, curandState *rng_states,
+__global__ void swap(int *values, int nswap, int *offset_a, int *offset_b)
+{
+ int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+ if (id < nswap) {
+ int a = offset_a[id];
+ int b = offset_b[id];
+
+ int tmp = values[a];
+ values[a] = values[b];
+ values[b] = tmp;
+ }
+}
+
+__global__ void propagate(int first_photon, int nthreads,
+ unsigned int *input_queue,
+ unsigned int *output_queue,
+ curandState *rng_states,
float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times,
unsigned int *histories, int *last_hit_triangles, int max_steps)
{
@@ -303,7 +320,7 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c
curandState rng = rng_states[id];
- int photon_id = photon_offsets[first_photon + id];
+ int photon_id = input_queue[first_photon + id];
Photon p;
p.position = positions[photon_id];
@@ -364,7 +381,12 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c
times[photon_id] = p.time;
histories[photon_id] = p.history;
last_hit_triangles[photon_id] = p.last_hit_triangle;
-
+
+ // Not done, put photon in output queue
+ if ( (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) == 0) {
+ int out_idx = atomicAdd(output_queue, 1);
+ output_queue[out_idx] = photon_id;
+ }
} // propagate
} // extern "c"