2 files changed, 86 insertions, 18 deletions
diff --git a/gpu.py b/gpu.py
index f0cf19e..c9e6269 100644
--- a/gpu.py
+++ b/gpu.py
@@ -15,6 +15,26 @@ from color import map_to_color
 
 cuda.init()
 
+def boolean_argsort(condition):
+    '''Returns two arrays of indicies indicating which elements of the
+    boolean array `condition` should be exchanged so that all of the
+    True entries occur before the false entries.
+
+       Returns: tuple (a,b, ntrue) which give the indices for elements
+       to exchange in a and b, and the number of true entries in the array
+       in ntrue.  This function in general requires fewer swaps than 
+       numpy.argsort.
+    '''
+
+    true_indices = condition.nonzero()[0][::-1] # reverse
+    false_indices = (~condition).nonzero()[0]
+    
+    length = min(len(true_indices), len(false_indices))
+    
+    cut_index = np.searchsorted((true_indices[:length] - false_indices[:length]) <= 0, True)
+    return (true_indices[:cut_index], false_indices[:cut_index], len(true_indices))
+
+
 def chunk_iterator(nelements, nthreads_per_block, max_blocks):
     '''Iterator that yields tuples with the values requried to process
     a long array in multiple kernel passes on the GPU.
@@ -27,7 +47,7 @@ def chunk_iterator(nelements, nthreads_per_block, max_blocks):
     first = 0
     while first < nelements:
         elements_left = nelements - first
-        blocks = elements_left // nthreads_per_block
+        blocks = int(elements_left // nthreads_per_block)
         if elements_left % nthreads_per_block != 0:
             blocks += 1 #Round up only if needed
         blocks = min(max_blocks, blocks)
@@ -80,7 +100,7 @@ class GPU(object):
         self.module = SourceModule(src.kernel, options=cuda_options, no_extern_c=True)
         self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids'])
 
-        self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate'])
+        self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate', 'swap'])
         self.nthread_per_block = 64
         self.max_blocks = 1024
         self.daq_module = SourceModule(src.daq, options=cuda_options, no_extern_c=True)
@@ -230,7 +250,6 @@ class GPU(object):
         self.polarizations_gpu = gpuarray.to_gpu(pol.astype(np.float32).view(gpuarray.vec.float3))
         self.wavelengths_gpu = gpuarray.to_gpu(wavelength.astype(np.float32))
         self.times_gpu = gpuarray.to_gpu(t0.astype(np.float32))
-        self.photon_offsets_gpu = gpuarray.arange(self.nphotons, dtype=np.int32)
 
         if histories is not None:
             self.histories_gpu = gpuarray.to_gpu(histories.astype(np.uint32))
@@ -250,18 +269,45 @@ class GPU(object):
         May be called repeatedly without reloading photon information if single-stepping
         through photon history.
         '''
-        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks):
-            self.prop_funcs.propagate(np.int32(first_photon),
-                                      np.int32(photons_this_round),
-                                      self.photon_offsets_gpu,
-                                      self.rng_states_gpu, self.positions_gpu, 
-                                      self.directions_gpu, 
-                                      self.wavelengths_gpu, self.polarizations_gpu, 
-                                      self.times_gpu, self.histories_gpu, 
-                                      self.last_hit_triangles_gpu, 
-                                      np.int32(max_steps), 
-                                      block=(self.nthread_per_block,1,1),
-                                      grid=(blocks, 1))
+        nphotons = self.nphotons
+        step = 0
+        input_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32)
+        input_queue[1:] = np.arange(self.nphotons, dtype=np.uint32)
+        input_queue_gpu = gpuarray.to_gpu(input_queue)
+        output_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32)
+        output_queue[0] = 1
+        output_queue_gpu = gpuarray.to_gpu(output_queue)
+
+
+        while step < max_steps:
+            ## Just finish the rest of the steps if the # of photons is low
+            if nphotons < self.nthread_per_block * 16 * 8:
+                nsteps = max_steps - step
+            else:
+                nsteps = 1
+
+            for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthread_per_block, self.max_blocks):
+                self.prop_funcs.propagate(np.int32(first_photon),
+                                          np.int32(photons_this_round),
+                                          input_queue_gpu[1:],
+                                          output_queue_gpu,
+                                          self.rng_states_gpu, self.positions_gpu, 
+                                          self.directions_gpu, 
+                                          self.wavelengths_gpu, self.polarizations_gpu, 
+                                          self.times_gpu, self.histories_gpu, 
+                                          self.last_hit_triangles_gpu, 
+                                          np.int32(nsteps), 
+                                          block=(self.nthread_per_block,1,1),
+                                          grid=(blocks, 1))
+        
+            step += nsteps
+
+            if step < max_steps:
+                temp = input_queue_gpu
+                input_queue_gpu = output_queue_gpu
+                output_queue_gpu = temp
+                output_queue_gpu[:1].set(np.uint32(1))
+                nphotons = input_queue_gpu[:1].get()[0]
 
         if 'profile' in __builtins__:
             self.context.synchronize()
diff --git a/src/kernel.cu b/src/kernel.cu
index a26460a..cc567b8 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -292,7 +292,24 @@ __global__ void ray_trace_alpha(int nthreads, float3 *positions, float3 *directi
 
 } // ray_trace
 
-__global__ void propagate(int first_photon, int nthreads, int *photon_offsets, curandState *rng_states, 
+__global__ void swap(int *values, int nswap, int *offset_a, int *offset_b)
+{
+  int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+  if (id < nswap) {
+    int a = offset_a[id];
+    int b = offset_b[id];
+
+    int tmp = values[a];
+    values[a] = values[b];
+    values[b] = tmp;
+  }
+}
+
+__global__ void propagate(int first_photon, int nthreads, 
+			  unsigned int *input_queue, 
+			  unsigned int *output_queue,
+			  curandState *rng_states, 
 			  float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, 
 			  unsigned int *histories, int *last_hit_triangles, int max_steps)
 {
@@ -303,7 +320,7 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c
 
 	curandState rng = rng_states[id];
 
-	int photon_id = photon_offsets[first_photon + id];
+	int photon_id = input_queue[first_photon + id];
 
 	Photon p;
 	p.position = positions[photon_id];
@@ -364,7 +381,12 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c
 	times[photon_id] = p.time;
 	histories[photon_id] = p.history;
 	last_hit_triangles[photon_id] = p.last_hit_triangle;
-
+	
+	// Not done, put photon in output queue
+	if ( (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) == 0) {
+	  int out_idx = atomicAdd(output_queue, 1);
+	  output_queue[out_idx] = photon_id;
+	}
 } // propagate
 
 } // extern "c"