3 files changed, 37 insertions, 5 deletions
diff --git a/chroma/cuda/photon.h b/chroma/cuda/photon.h
index 79a87f0..371c122 100644
--- a/chroma/cuda/photon.h
+++ b/chroma/cuda/photon.h
@@ -180,7 +180,7 @@ rayleigh_scatter(Photon &p, curandState &rng)
 } // scatter
 
 __device__ int propagate_to_boundary(Photon &p, State &s, curandState &rng,
-                                     bool use_weights=false)
+                                     bool use_weights=false, int scatter_first=0)
 {
     float absorption_distance = -s.absorption_length*logf(curand_uniform(&rng));
     float scattering_distance = -s.scattering_length*logf(curand_uniform(&rng));
@@ -190,6 +190,35 @@ __device__ int propagate_to_boundary(Photon &p, State &s, curandState &rng,
     else
 	use_weights = false;
 
+    if (scatter_first == 1) {
+	// Force scatter
+	float scatter_prob = 1.0f - expf(-s.distance_to_boundary/s.scattering_length);
+
+	if (scatter_prob > WEIGHT_LOWER_THRESHOLD) {
+	    int i=0;
+	    const int max_i = 1000;
+	    while (i < max_i && scattering_distance > s.distance_to_boundary) {
+		scattering_distance = -s.scattering_length*logf(curand_uniform(&rng));
+		i++;
+	    }
+	    p.weight *= scatter_prob;
+	}
+
+    } else if (scatter_first == -1) {
+	// Prevent scatter
+	float no_scatter_prob = expf(-s.distance_to_boundary/s.scattering_length);
+
+	if (no_scatter_prob > WEIGHT_LOWER_THRESHOLD) {
+	    int i=0;
+	    const int max_i = 1000;
+	    while (i < max_i && scattering_distance <= s.distance_to_boundary) {
+		scattering_distance = -s.scattering_length*logf(curand_uniform(&rng));
+		i++;
+	    }
+	    p.weight *= no_scatter_prob;
+	}
+    }
+
     if (absorption_distance <= scattering_distance) {
 	if (absorption_distance <= s.distance_to_boundary) {
 	    p.time += absorption_distance/(SPEED_OF_LIGHT/s.refractive_index1);
diff --git a/chroma/cuda/propagate.cu b/chroma/cuda/propagate.cu
index 29f8aea..9520cf8 100644
--- a/chroma/cuda/propagate.cu
+++ b/chroma/cuda/propagate.cu
@@ -116,7 +116,7 @@ propagate(int first_photon, int nthreads, unsigned int *input_queue,
 	  float *wavelengths, float3 *polarizations,
 	  float *times, unsigned int *histories,
 	  int *last_hit_triangles, float *weights,
-	  int max_steps, int use_weights,
+	  int max_steps, int use_weights, int scatter_first,
 	  Geometry *g)
 {
     __shared__ Geometry sg;
@@ -171,7 +171,8 @@ propagate(int first_photon, int nthreads, unsigned int *input_queue,
 	if (p.last_hit_triangle == -1)
 	    break;
 
-	command = propagate_to_boundary(p, s, rng, use_weights);
+	command = propagate_to_boundary(p, s, rng, use_weights, scatter_first);
+	scatter_first = 0; // Only use the scatter_first value once
 
 	if (command == BREAK)
 	    break;
diff --git a/chroma/gpu/photon.py b/chroma/gpu/photon.py
index 83dfcd4..fbb064b 100644
--- a/chroma/gpu/photon.py
+++ b/chroma/gpu/photon.py
@@ -95,7 +95,8 @@ class GPUPhotons(object):
 
     @profile_if_possible
     def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
-                  max_blocks=1024, max_steps=10, use_weights=False):
+                  max_blocks=1024, max_steps=10, use_weights=False,
+                  scatter_first=0):
         """Propagate photons on GPU to termination or max_steps, whichever
         comes first.
 
@@ -127,9 +128,10 @@ class GPUPhotons(object):
 
             for first_photon, photons_this_round, blocks in \
                     chunk_iterator(nphotons, nthreads_per_block, max_blocks):
-                self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights, np.int32(nsteps), np.int32(use_weights), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1))
+                self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights, np.int32(nsteps), np.int32(use_weights), np.int32(scatter_first), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1))
 
             step += nsteps
+            scatter_first = 0 # Only allow non-zero in first pass
 
             if step < max_steps:
                 temp = input_queue_gpu