Use an input and output photon queue in order to consolidate all the photons that didn't die during propagation into the beginning of the list.

This speeds up propagation by reducing the number of partially filled CUDA warps on the next propagation step. 2.2 million photons/sec on LBNE!
author: Stan Seibert <stan@mtrr.org> 2011-08-12 20:10:03 -0400
committer: Stan Seibert <stan@mtrr.org> 2011-08-12 20:10:03 -0400
commit: 955bebbc1d6121823f4376115a070112ede7bcbe (patch)
tree: 53a1b820cb4f2a7c837ec00b29e8595d8faa33b4 /src/kernel.cu
parent: 3e02fe2a94366f2908006d41f7609f0e9555315b (diff)
download: chroma-955bebbc1d6121823f4376115a070112ede7bcbe.tar.gz
chroma-955bebbc1d6121823f4376115a070112ede7bcbe.tar.bz2
chroma-955bebbc1d6121823f4376115a070112ede7bcbe.zip
1 file changed, 25 insertions, 3 deletions
diff --git a/src/kernel.cu b/src/kernel.cu
index fe518f6..f60ecb1 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -260,7 +260,24 @@ __global__ void ray_trace(int nthreads, float3 *positions, float3 *directions, i
 
 } // ray_trace
 
-__global__ void propagate(int first_photon, int nthreads, int *photon_offsets, curandState *rng_states, 
+__global__ void swap(int *values, int nswap, int *offset_a, int *offset_b) // For each i < nswap, exchange values[offset_a[i]] with values[offset_b[i]].
+{
+  int id = blockIdx.x*blockDim.x + threadIdx.x; // flat thread index; only the x dimension of the launch is used
+
+  if (id < nswap) { // threads with id >= nswap do nothing
+    int a = offset_a[id]; // first index into values for this thread's pair
+    int b = offset_b[id]; // second index into values for this thread's pair
+
+    int tmp = values[a]; // three-step exchange through a temporary
+    values[a] = values[b];
+    values[b] = tmp; // NOTE(review): no atomics or barriers here; presumably callers pass index pairs that do not overlap between threads - confirm
+  }
+}
+
+__global__ void propagate(int first_photon, int nthreads, 
+			  unsigned int *input_queue, 
+			  unsigned int *output_queue,
+			  curandState *rng_states, 
 			  float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, 
 			  unsigned int *histories, int *last_hit_triangles, int max_steps)
 {
@@ -271,7 +288,7 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c
 
 	curandState rng = rng_states[id];
 
-	int photon_id = photon_offsets[first_photon + id];
+	int photon_id = input_queue[first_photon + id];
 
 	Photon p;
 	p.position = positions[photon_id];
@@ -332,7 +349,12 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c
 	times[photon_id] = p.time;
 	histories[photon_id] = p.history;
 	last_hit_triangles[photon_id] = p.last_hit_triangle;
-
+	
+	// Not done, put photon in output queue
+	if ( (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) == 0) {
+	  int out_idx = atomicAdd(output_queue, 1);
+	  output_queue[out_idx] = photon_id;
+	}
 } // propagate
 
 } // extern "c"
author	Stan Seibert <stan@mtrr.org>	2011-08-12 20:10:03 -0400
committer	Stan Seibert <stan@mtrr.org>	2011-08-12 20:10:03 -0400
commit	955bebbc1d6121823f4376115a070112ede7bcbe (patch)
tree	53a1b820cb4f2a7c837ec00b29e8595d8faa33b4 /src/kernel.cu
parent	3e02fe2a94366f2908006d41f7609f0e9555315b (diff)
download	chroma-955bebbc1d6121823f4376115a070112ede7bcbe.tar.gz chroma-955bebbc1d6121823f4376115a070112ede7bcbe.tar.bz2 chroma-955bebbc1d6121823f4376115a070112ede7bcbe.zip