Use an input and output photon queue in order to consolidate all the

photons that didn't die during propagation into the beginning of the list. This speeds up propagation by reducing the number of partially filled CUDA warps on the next propagation step. 2.2 million photons/sec on LBNE!
author: Stan Seibert <stan@mtrr.org> 2011-08-12 20:10:03 -0400
committer: Stan Seibert <stan@mtrr.org> 2011-08-12 20:10:03 -0400
commit: 955bebbc1d6121823f4376115a070112ede7bcbe (patch)
tree: 53a1b820cb4f2a7c837ec00b29e8595d8faa33b4
parent: 3e02fe2a94366f2908006d41f7609f0e9555315b (diff)
download: chroma-955bebbc1d6121823f4376115a070112ede7bcbe.tar.gz
chroma-955bebbc1d6121823f4376115a070112ede7bcbe.tar.bz2
chroma-955bebbc1d6121823f4376115a070112ede7bcbe.zip
2 files changed, 86 insertions, 18 deletions
diff --git a/gpu.py b/gpu.py
index f0cf19e..c9e6269 100644
--- a/gpu.py
+++ b/gpu.py
@@ -15,6 +15,26 @@ from color import map_to_color
 
 cuda.init()
 
+def boolean_argsort(condition):
+    '''Returns two arrays of indicies indicating which elements of the
+    boolean array `condition` should be exchanged so that all of the
+    True entries occur before the false entries.
+
+       Returns: tuple (a,b, ntrue) which give the indices for elements
+       to exchange in a and b, and the number of true entries in the array
+       in ntrue.  This function in general requires fewer swaps than 
+       numpy.argsort.
+    '''
+
+    true_indices = condition.nonzero()[0][::-1] # reverse
+    false_indices = (~condition).nonzero()[0]
+    
+    length = min(len(true_indices), len(false_indices))
+    
+    cut_index = np.searchsorted((true_indices[:length] - false_indices[:length]) <= 0, True)
+    return (true_indices[:cut_index], false_indices[:cut_index], len(true_indices))
+
+
 def chunk_iterator(nelements, nthreads_per_block, max_blocks):
     '''Iterator that yields tuples with the values requried to process
     a long array in multiple kernel passes on the GPU.
@@ -27,7 +47,7 @@ def chunk_iterator(nelements, nthreads_per_block, max_blocks):
     first = 0
     while first < nelements:
         elements_left = nelements - first
-        blocks = elements_left // nthreads_per_block
+        blocks = int(elements_left // nthreads_per_block)
         if elements_left % nthreads_per_block != 0:
             blocks += 1 #Round up only if needed
         blocks = min(max_blocks, blocks)
@@ -80,7 +100,7 @@ class GPU(object):
         self.module = SourceModule(src.kernel, options=cuda_options, no_extern_c=True)
         self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids'])
 
-        self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate'])
+        self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate', 'swap'])
         self.nthread_per_block = 64
         self.max_blocks = 1024
         self.daq_module = SourceModule(src.daq, options=cuda_options, no_extern_c=True)
@@ -230,7 +250,6 @@ class GPU(object):
         self.polarizations_gpu = gpuarray.to_gpu(pol.astype(np.float32).view(gpuarray.vec.float3))
         self.wavelengths_gpu = gpuarray.to_gpu(wavelength.astype(np.float32))
         self.times_gpu = gpuarray.to_gpu(t0.astype(np.float32))
-        self.photon_offsets_gpu = gpuarray.arange(self.nphotons, dtype=np.int32)
 
         if histories is not None:
             self.histories_gpu = gpuarray.to_gpu(histories.astype(np.uint32))
@@ -250,18 +269,45 @@ class GPU(object):
         May be called repeatedly without reloading photon information if single-stepping
         through photon history.
         '''
-        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks):
-            self.prop_funcs.propagate(np.int32(first_photon),
-                                      np.int32(photons_this_round),
-                                      self.photon_offsets_gpu,
-                                      self.rng_states_gpu, self.positions_gpu, 
-                                      self.directions_gpu, 
-                                      self.wavelengths_gpu, self.polarizations_gpu, 
-                                      self.times_gpu, self.histories_gpu, 
-                                      self.last_hit_triangles_gpu, 
-                                      np.int32(max_steps), 
-                                      block=(self.nthread_per_block,1,1),
-                                      grid=(blocks, 1))
+        nphotons = self.nphotons
+        step = 0
+        input_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32)
+        input_queue[1:] = np.arange(self.nphotons, dtype=np.uint32)
+        input_queue_gpu = gpuarray.to_gpu(input_queue)
+        output_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32)
+        output_queue[0] = 1
+        output_queue_gpu = gpuarray.to_gpu(output_queue)
+
+
+        while step < max_steps:
+            ## Just finish the rest of the steps if the # of photons is low
+            if nphotons < self.nthread_per_block * 16 * 8:
+                nsteps = max_steps - step
+            else:
+                nsteps = 1
+
+            for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthread_per_block, self.max_blocks):
+                self.prop_funcs.propagate(np.int32(first_photon),
+                                          np.int32(photons_this_round),
+                                          input_queue_gpu[1:],
+                                          output_queue_gpu,
+                                          self.rng_states_gpu, self.positions_gpu, 
+                                          self.directions_gpu, 
+                                          self.wavelengths_gpu, self.polarizations_gpu, 
+                                          self.times_gpu, self.histories_gpu, 
+                                          self.last_hit_triangles_gpu, 
+                                          np.int32(nsteps), 
+                                          block=(self.nthread_per_block,1,1),
+                                          grid=(blocks, 1))
+        
+            step += nsteps
+
+            if step < max_steps:
+                temp = input_queue_gpu
+                input_queue_gpu = output_queue_gpu
+                output_queue_gpu = temp
+                output_queue_gpu[:1].set(np.uint32(1))
+                nphotons = input_queue_gpu[:1].get()[0]
 
         if 'profile' in __builtins__:
             self.context.synchronize()
diff --git a/src/kernel.cu b/src/kernel.cu
index fe518f6..f60ecb1 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -260,7 +260,24 @@ __global__ void ray_trace(int nthreads, float3 *positions, float3 *directions, i
 
 } // ray_trace
 
-__global__ void propagate(int first_photon, int nthreads, int *photon_offsets, curandState *rng_states, 
+__global__ void swap(int *values, int nswap, int *offset_a, int *offset_b)
+{
+  int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+  if (id < nswap) {
+    int a = offset_a[id];
+    int b = offset_b[id];
+
+    int tmp = values[a];
+    values[a] = values[b];
+    values[b] = tmp;
+  }
+}
+
+__global__ void propagate(int first_photon, int nthreads, 
+			  unsigned int *input_queue, 
+			  unsigned int *output_queue,
+			  curandState *rng_states, 
 			  float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, 
 			  unsigned int *histories, int *last_hit_triangles, int max_steps)
 {
@@ -271,7 +288,7 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c
 
 	curandState rng = rng_states[id];
 
-	int photon_id = photon_offsets[first_photon + id];
+	int photon_id = input_queue[first_photon + id];
 
 	Photon p;
 	p.position = positions[photon_id];
@@ -332,7 +349,12 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c
 	times[photon_id] = p.time;
 	histories[photon_id] = p.history;
 	last_hit_triangles[photon_id] = p.last_hit_triangle;
-
+	
+	// Not done, put photon in output queue
+	if ( (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) == 0) {
+	  int out_idx = atomicAdd(output_queue, 1);
+	  output_queue[out_idx] = photon_id;
+	}
 } // propagate
 
 } // extern "c"
author	Stan Seibert <stan@mtrr.org>	2011-08-12 20:10:03 -0400
committer	Stan Seibert <stan@mtrr.org>	2011-08-12 20:10:03 -0400
commit	955bebbc1d6121823f4376115a070112ede7bcbe (patch)
tree	53a1b820cb4f2a7c837ec00b29e8595d8faa33b4
parent	3e02fe2a94366f2908006d41f7609f0e9555315b (diff)
download	chroma-955bebbc1d6121823f4376115a070112ede7bcbe.tar.gz chroma-955bebbc1d6121823f4376115a070112ede7bcbe.tar.bz2 chroma-955bebbc1d6121823f4376115a070112ede7bcbe.zip