summaryrefslogtreecommitdiff
path: root/src/kernel.cu
diff options
context:
space:
mode:
author: Stan Seibert <stan@mtrr.org> 2011-08-12 20:10:03 -0400
committer: Stan Seibert <stan@mtrr.org> 2011-08-12 20:10:03 -0400
commit: 955bebbc1d6121823f4376115a070112ede7bcbe (patch)
tree: 53a1b820cb4f2a7c837ec00b29e8595d8faa33b4 /src/kernel.cu
parent: 3e02fe2a94366f2908006d41f7609f0e9555315b (diff)
download: chroma-955bebbc1d6121823f4376115a070112ede7bcbe.tar.gz
chroma-955bebbc1d6121823f4376115a070112ede7bcbe.tar.bz2
chroma-955bebbc1d6121823f4376115a070112ede7bcbe.zip
Use an input and output photon queue in order to consolidate all the
photons that didn't die during propagation into the beginning of the list. This speeds up propagation by reducing the number of partially filled CUDA warps on the next propagation step. 2.2 million photons/sec on LBNE!
Diffstat (limited to 'src/kernel.cu')
-rw-r--r-- src/kernel.cu 28
1 files changed, 25 insertions, 3 deletions
diff --git a/src/kernel.cu b/src/kernel.cu
index fe518f6..f60ecb1 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -260,7 +260,24 @@ __global__ void ray_trace(int nthreads, float3 *positions, float3 *directions, i
} // ray_trace
-__global__ void propagate(int first_photon, int nthreads, int *photon_offsets, curandState *rng_states,
+__global__ void swap(int *values, int nswap, int *offset_a, int *offset_b) // Exchanges values[offset_a[i]] with values[offset_b[i]] for every i in [0, nswap); each thread handles one pair.
+{
+ int id = blockIdx.x*blockDim.x + threadIdx.x; // flat thread index built from the x components of the launch only
+
+ if (id < nswap) { // threads beyond the number of requested swaps do nothing
+ int a = offset_a[id]; // index into values of the first element of this thread's pair
+ int b = offset_b[id]; // index into values of the second element of this thread's pair
+
+ int tmp = values[a]; // plain three-step exchange through a temporary; no atomics or barriers are used
+ values[a] = values[b]; // NOTE(review): presumably no index appears in more than one pair, otherwise threads would race here - confirm with caller
+ values[b] = tmp;
+ }
+}
+
+__global__ void propagate(int first_photon, int nthreads,
+ unsigned int *input_queue,
+ unsigned int *output_queue,
+ curandState *rng_states,
float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times,
unsigned int *histories, int *last_hit_triangles, int max_steps)
{
@@ -271,7 +288,7 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c
curandState rng = rng_states[id];
- int photon_id = photon_offsets[first_photon + id];
+ int photon_id = input_queue[first_photon + id];
Photon p;
p.position = positions[photon_id];
@@ -332,7 +349,12 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c
times[photon_id] = p.time;
histories[photon_id] = p.history;
last_hit_triangles[photon_id] = p.last_hit_triangle;
-
+
+ // Not done, put photon in output queue
+ if ( (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) == 0) {
+ int out_idx = atomicAdd(output_queue, 1);
+ output_queue[out_idx] = photon_id;
+ }
} // propagate
} // extern "c"