From 955bebbc1d6121823f4376115a070112ede7bcbe Mon Sep 17 00:00:00 2001
From: Stan Seibert <stan@mtrr.org>
Date: Fri, 12 Aug 2011 20:10:03 -0400
Subject: Use an input and output photon queue in order to consolidate all the
 photons that didn't die during propagation into the beginning of the list. 
 This speeds up propagation by reducing the number of partially filled CUDA
 warps on the next propagation step.

2.2 million photons/sec on LBNE!
---
 gpu.py        | 76 +++++++++++++++++++++++++++++++++++++++++++++++------------
 src/kernel.cu | 28 +++++++++++++++++++---
 2 files changed, 86 insertions(+), 18 deletions(-)

diff --git a/gpu.py b/gpu.py
index f0cf19e..c9e6269 100644
--- a/gpu.py
+++ b/gpu.py
@@ -15,6 +15,26 @@ from color import map_to_color
 
 cuda.init()
 
+def boolean_argsort(condition):
+    '''Returns two arrays of indicies indicating which elements of the
+    boolean array `condition` should be exchanged so that all of the
+    True entries occur before the false entries.
+
+       Returns: tuple (a,b, ntrue) which give the indices for elements
+       to exchange in a and b, and the number of true entries in the array
+       in ntrue.  This function in general requires fewer swaps than 
+       numpy.argsort.
+    '''
+
+    true_indices = condition.nonzero()[0][::-1] # reverse
+    false_indices = (~condition).nonzero()[0]
+    
+    length = min(len(true_indices), len(false_indices))
+    
+    cut_index = np.searchsorted((true_indices[:length] - false_indices[:length]) <= 0, True)
+    return (true_indices[:cut_index], false_indices[:cut_index], len(true_indices))
+
+
 def chunk_iterator(nelements, nthreads_per_block, max_blocks):
     '''Iterator that yields tuples with the values requried to process
     a long array in multiple kernel passes on the GPU.
@@ -27,7 +47,7 @@ def chunk_iterator(nelements, nthreads_per_block, max_blocks):
     first = 0
     while first < nelements:
         elements_left = nelements - first
-        blocks = elements_left // nthreads_per_block
+        blocks = int(elements_left // nthreads_per_block)
         if elements_left % nthreads_per_block != 0:
             blocks += 1 #Round up only if needed
         blocks = min(max_blocks, blocks)
@@ -80,7 +100,7 @@ class GPU(object):
         self.module = SourceModule(src.kernel, options=cuda_options, no_extern_c=True)
         self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids'])
 
-        self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate'])
+        self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate', 'swap'])
         self.nthread_per_block = 64
         self.max_blocks = 1024
         self.daq_module = SourceModule(src.daq, options=cuda_options, no_extern_c=True)
@@ -230,7 +250,6 @@ class GPU(object):
         self.polarizations_gpu = gpuarray.to_gpu(pol.astype(np.float32).view(gpuarray.vec.float3))
         self.wavelengths_gpu = gpuarray.to_gpu(wavelength.astype(np.float32))
         self.times_gpu = gpuarray.to_gpu(t0.astype(np.float32))
-        self.photon_offsets_gpu = gpuarray.arange(self.nphotons, dtype=np.int32)
 
         if histories is not None:
             self.histories_gpu = gpuarray.to_gpu(histories.astype(np.uint32))
@@ -250,18 +269,45 @@ class GPU(object):
         May be called repeatedly without reloading photon information if single-stepping
         through photon history.
         '''
-        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks):
-            self.prop_funcs.propagate(np.int32(first_photon),
-                                      np.int32(photons_this_round),
-                                      self.photon_offsets_gpu,
-                                      self.rng_states_gpu, self.positions_gpu, 
-                                      self.directions_gpu, 
-                                      self.wavelengths_gpu, self.polarizations_gpu, 
-                                      self.times_gpu, self.histories_gpu, 
-                                      self.last_hit_triangles_gpu, 
-                                      np.int32(max_steps), 
-                                      block=(self.nthread_per_block,1,1),
-                                      grid=(blocks, 1))
+        nphotons = self.nphotons
+        step = 0
+        input_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32)
+        input_queue[1:] = np.arange(self.nphotons, dtype=np.uint32)
+        input_queue_gpu = gpuarray.to_gpu(input_queue)
+        output_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32)
+        output_queue[0] = 1
+        output_queue_gpu = gpuarray.to_gpu(output_queue)
+
+
+        while step < max_steps:
+            ## Just finish the rest of the steps if the # of photons is low
+            if nphotons < self.nthread_per_block * 16 * 8:
+                nsteps = max_steps - step
+            else:
+                nsteps = 1
+
+            for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthread_per_block, self.max_blocks):
+                self.prop_funcs.propagate(np.int32(first_photon),
+                                          np.int32(photons_this_round),
+                                          input_queue_gpu[1:],
+                                          output_queue_gpu,
+                                          self.rng_states_gpu, self.positions_gpu, 
+                                          self.directions_gpu, 
+                                          self.wavelengths_gpu, self.polarizations_gpu, 
+                                          self.times_gpu, self.histories_gpu, 
+                                          self.last_hit_triangles_gpu, 
+                                          np.int32(nsteps), 
+                                          block=(self.nthread_per_block,1,1),
+                                          grid=(blocks, 1))
+        
+            step += nsteps
+
+            if step < max_steps:
+                temp = input_queue_gpu
+                input_queue_gpu = output_queue_gpu
+                output_queue_gpu = temp
+                output_queue_gpu[:1].set(np.uint32(1))
+                nphotons = input_queue_gpu[:1].get()[0]
 
         if 'profile' in __builtins__:
             self.context.synchronize()
diff --git a/src/kernel.cu b/src/kernel.cu
index fe518f6..f60ecb1 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -260,7 +260,24 @@ __global__ void ray_trace(int nthreads, float3 *positions, float3 *directions, i
 
 } // ray_trace
 
-__global__ void propagate(int first_photon, int nthreads, int *photon_offsets, curandState *rng_states, 
+__global__ void swap(int *values, int nswap, int *offset_a, int *offset_b)
+{
+  int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+  if (id < nswap) {
+    int a = offset_a[id];
+    int b = offset_b[id];
+
+    int tmp = values[a];
+    values[a] = values[b];
+    values[b] = tmp;
+  }
+}
+
+__global__ void propagate(int first_photon, int nthreads, 
+			  unsigned int *input_queue, 
+			  unsigned int *output_queue,
+			  curandState *rng_states, 
 			  float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, 
 			  unsigned int *histories, int *last_hit_triangles, int max_steps)
 {
@@ -271,7 +288,7 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c
 
 	curandState rng = rng_states[id];
 
-	int photon_id = photon_offsets[first_photon + id];
+	int photon_id = input_queue[first_photon + id];
 
 	Photon p;
 	p.position = positions[photon_id];
@@ -332,7 +349,12 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c
 	times[photon_id] = p.time;
 	histories[photon_id] = p.history;
 	last_hit_triangles[photon_id] = p.last_hit_triangle;
-
+	
+	// Not done, put photon in output queue
+	if ( (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) == 0) {
+	  int out_idx = atomicAdd(output_queue, 1);
+	  output_queue[out_idx] = photon_id;
+	}
 } // propagate
 
 } // extern "c"
-- 
cgit