From 955bebbc1d6121823f4376115a070112ede7bcbe Mon Sep 17 00:00:00 2001
From: Stan Seibert
Date: Fri, 12 Aug 2011 20:10:03 -0400
Subject: Use an input and output photon queue in order to consolidate all the photons that didn't die during propagation into the beginning of the list.

This speeds up propagation by reducing the number of partially filled CUDA
warps on the next propagation step.

2.2 million photons/sec on LBNE!
---
 src/kernel.cu | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/kernel.cu b/src/kernel.cu
index fe518f6..f60ecb1 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -260,7 +260,24 @@ __global__ void ray_trace(int nthreads, float3 *positions, float3 *directions, i
 
 } // ray_trace
 
-__global__ void propagate(int first_photon, int nthreads, int *photon_offsets, curandState *rng_states,
+__global__ void swap(int *values, int nswap, int *offset_a, int *offset_b)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id < nswap) {
+        int a = offset_a[id];
+        int b = offset_b[id];
+
+        int tmp = values[a];
+        values[a] = values[b];
+        values[b] = tmp;
+    }
+}
+
+__global__ void propagate(int first_photon, int nthreads,
+                          unsigned int *input_queue,
+                          unsigned int *output_queue,
+                          curandState *rng_states,
                           float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations,
                           float *times, unsigned int *histories, int *last_hit_triangles, int max_steps)
 {
@@ -271,7 +288,7 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c
 
     curandState rng = rng_states[id];
 
-    int photon_id = photon_offsets[first_photon + id];
+    int photon_id = input_queue[first_photon + id];
 
     Photon p;
     p.position = positions[photon_id];
@@ -332,7 +349,12 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c
     times[photon_id] = p.time;
     histories[photon_id] = p.history;
     last_hit_triangles[photon_id] = p.last_hit_triangle;
-    
+
+    // Not done, put photon in output queue
+    if ( (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) == 0) {
+        int out_idx = atomicAdd(output_queue, 1);
+        output_queue[out_idx] = photon_id;
+    }
 } // propagate
 
 } // extern "c"
-- 
cgit