3 files changed, 83 insertions, 49 deletions
diff --git a/gpu.py b/gpu.py
index 8de0f90..a21bc38 100644
--- a/gpu.py
+++ b/gpu.py
@@ -15,6 +15,28 @@ from color import map_to_color
 
 cuda.init()
 
+def chunk_iterator(nelements, nthreads_per_block, max_blocks):
+    '''Iterator that yields tuples with the values requried to process
+    a long array in multiple kernel passes on the GPU.
+
+       Each yielded value is (first_index, elements_this_iteration, nblocks_this_iteration)
+
+    >>> list(chunk_iterator(300, 32, 2))
+    [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)]
+    '''
+    first = 0
+    while first < nelements:
+        elements_left = nelements - first
+        blocks = elements_left // nthreads_per_block
+        if elements_left % nthreads_per_block != 0:
+            blocks += 1 #Round up only if needed
+        blocks = min(max_blocks, blocks)
+        elements_this_round = min(elements_left, blocks * nthreads_per_block)
+
+        yield (first, elements_this_round, blocks)
+        first += elements_this_round
+    
+
 def format_size(size):
     if size < 1e3:
         return '%.1f%s' % (size, ' ')
@@ -54,8 +76,8 @@ class GPU(object):
         self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids'])
 
         self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate'])
-        self.nblocks = 64
-        self.max_nthreads = 200000
+        self.nthread_per_block = 64
+        self.max_blocks = 1024
         self.daq_module = SourceModule(src.daq, options=['-I' + src.dir], no_extern_c=True, cache_dir=False)
         self.daq_funcs = CUDAFuncs(self.daq_module, 
                               ['reset_earliest_time_int', 'run_daq',
@@ -166,15 +188,16 @@ class GPU(object):
         solid_ids_gpu = gpuarray.to_gpu(np.array(solid_ids, dtype=np.int32))
         solid_colors_gpu = gpuarray.to_gpu(np.array(colors, dtype=np.uint32))
 
-        self.geo_funcs.color_solids(np.int32(solid_ids_gpu.size), np.uint32(self.triangles_gpu.size), self.solid_id_map_gpu, solid_ids_gpu, solid_colors_gpu, block=(self.nblocks,1,1), grid=(solid_ids_gpu.size//self.nblocks+1,1))
+        self.geo_funcs.color_solids(np.int32(solid_ids_gpu.size), np.uint32(self.triangles_gpu.size), self.solid_id_map_gpu, solid_ids_gpu, solid_colors_gpu, block=(self.nblock,1,1), grid=(solid_ids_gpu.size//self.nthread_per_block+1,1))
         self.context.synchronize()
 
     def setup_propagate(self, seed=1):
-        self.rng_states_gpu = cuda.mem_alloc(self.max_nthreads*sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
-        self.prop_funcs.init_rng(np.int32(self.max_nthreads), self.rng_states_gpu, 
+        self.rng_states_gpu = cuda.mem_alloc(self.nthread_per_block * self.max_blocks
+                                             * sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
+        self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthread_per_block), self.rng_states_gpu, 
                                  np.uint64(seed), np.uint64(0), 
-                                 block=(self.nblocks,1,1), 
-                                 grid=(self.max_nthreads//self.nblocks+1,1))
+                                 block=(self.nthread_per_block,1,1), 
+                                 grid=(self.max_blocks,1))
         #self.context.synchronize()
 
     def load_photons(self, pos, dir, pol, wavelength, t0, 
@@ -193,7 +216,6 @@ class GPU(object):
                                if any.  -1 if no triangle was hit in the last step
         '''
         self.nphotons = len(pos)
-        assert self.nphotons < self.max_nthreads
         assert len(dir) == self.nphotons
         assert len(pol) == self.nphotons
         assert len(wavelength) == self.nphotons
@@ -204,6 +226,7 @@ class GPU(object):
         self.polarizations_gpu = gpuarray.to_gpu(pol.astype(np.float32).view(gpuarray.vec.float3))
         self.wavelengths_gpu = gpuarray.to_gpu(wavelength.astype(np.float32))
         self.times_gpu = gpuarray.to_gpu(t0.astype(np.float32))
+        self.photon_offsets_gpu = gpuarray.arange(self.nphotons, dtype=np.int32)
 
         if histories is not None:
             self.histories_gpu = gpuarray.to_gpu(histories.astype(np.uint32))
@@ -216,8 +239,6 @@ class GPU(object):
             self.last_hit_triangles_gpu = gpuarray.empty(self.nphotons, dtype=np.int32)
             self.last_hit_triangles_gpu.fill(-1)
 
-        self.per_photon_kernel_config = {'block': (self.nblocks,1,1), 
-                                         'grid': (self.nphotons//self.nblocks+1,1)}
 
     def propagate(self, max_steps=10):
         '''Propagate photons on GPU to termination or max_steps, whichever comes first.
@@ -225,13 +246,19 @@ class GPU(object):
         May be called repeatedly without reloading photon information if single-stepping
         through photon history.
         '''
-        self.prop_funcs.propagate(np.int32(self.nphotons), 
-                                 self.rng_states_gpu, self.positions_gpu, 
-                                 self.directions_gpu, 
-                                 self.wavelengths_gpu, self.polarizations_gpu, 
-                                 self.times_gpu, self.histories_gpu, 
-                                 self.last_hit_triangles_gpu, 
-                                 np.int32(max_steps), **self.per_photon_kernel_config)
+        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks):
+            self.prop_funcs.propagate(np.int32(first_photon),
+                                      np.int32(photons_this_round),
+                                      self.photon_offsets_gpu,
+                                      self.rng_states_gpu, self.positions_gpu, 
+                                      self.directions_gpu, 
+                                      self.wavelengths_gpu, self.polarizations_gpu, 
+                                      self.times_gpu, self.histories_gpu, 
+                                      self.last_hit_triangles_gpu, 
+                                      np.int32(max_steps), 
+                                      block=(self.nthread_per_block,1,1),
+                                      grid=(blocks, 1))
+
         #self.context.synchronize()
 
     def get_photons(self):
@@ -257,24 +284,26 @@ class GPU(object):
         self.daq_funcs.reset_earliest_time_int(np.float32(1e9), 
                                                np.int32(len(self.earliest_time_int_gpu)), 
                                                self.earliest_time_int_gpu, 
-                                               block=(self.nblocks,1,1), 
-                                               grid=(len(self.earliest_time_int_gpu)//self.nblocks+1,1))
+                                               block=(self.nthread_per_block,1,1), 
+                                               grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1))
         #self.context.synchronize()
-        self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2), 
-                               np.float32(self.daq_pmt_rms), 
-                               np.int32(len(self.positions_gpu)), 
-                               self.times_gpu, self.histories_gpu,
-                               self.last_hit_triangles_gpu, 
-                               self.solid_id_map_gpu, 
-                               np.int32(len(self.earliest_time_int_gpu)), 
-                               self.earliest_time_int_gpu, 
-                               block=(self.nblocks,1,1), grid=(self.nphotons//self.nblocks+1,1))
+        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks):
+            self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2), 
+                                   np.float32(self.daq_pmt_rms), 
+                                   np.int32(first_photon),
+                                   np.int32(photons_this_round), 
+                                   self.times_gpu, self.histories_gpu,
+                                   self.last_hit_triangles_gpu, 
+                                   self.solid_id_map_gpu, 
+                                   np.int32(len(self.earliest_time_int_gpu)), 
+                                   self.earliest_time_int_gpu, 
+                                   block=(self.nthread_per_block,1,1), grid=(blocks,1))
         #self.context.synchronize()
         self.daq_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), 
                                                      self.earliest_time_int_gpu, 
                                                      self.earliest_time_gpu, 
-                                                     block=(self.nblocks,1,1), 
-                                                     grid=(len(self.earliest_time_int_gpu)//self.nblocks+1,1))
+                                                     block=(self.nthread_per_block,1,1), 
+                                                     grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1))
         #self.context.synchronize()
         
 
diff --git a/src/daq.cu b/src/daq.cu
index 2b5f9b4..7c5e6a5 100644
--- a/src/daq.cu
+++ b/src/daq.cu
@@ -30,6 +30,7 @@ __global__ void reset_earliest_time_int(float maxtime, int ntime_ints, unsigned
 
 __global__ void run_daq(curandState *s, unsigned int detection_state,
 			float time_rms,
+			int first_photon,
 			int nphotons, float *photon_times,
 			unsigned int *photon_histories,
 			int *last_hit_triangles, int *solid_map,
@@ -41,17 +42,17 @@ __global__ void run_daq(curandState *s, unsigned int detection_state,
 	if (id < nphotons)
 	{
 		curandState rng = s[id];
-
-		int triangle_id = last_hit_triangles[id];
+		int photon_id = id + first_photon;
+		int triangle_id = last_hit_triangles[photon_id];
 		
 		if (triangle_id > -1)
 		{
 			int solid_id = solid_map[triangle_id];
-			int history = photon_histories[id];
+			int history = photon_histories[photon_id];
 
 			if (solid_id < nsolids && (history & detection_state)) 
 			{
-			  float time = photon_times[id] + curand_normal(&rng) * time_rms;
+			  float time = photon_times[photon_id] + curand_normal(&rng) * time_rms;
 			  unsigned int time_int = float_to_sortable_int(time);
 			  
 			  atomicMin(earliest_time_int + solid_id, time_int);
diff --git a/src/kernel.cu b/src/kernel.cu
index 82efe88..17b829c 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -260,7 +260,9 @@ __global__ void ray_trace(int nthreads, float3 *positions, float3 *directions, i
 
 } // ray_trace
 
-__global__ void propagate(int nthreads, curandState *rng_states, float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, unsigned int *histories, int *last_hit_triangles, int max_steps)
+__global__ void propagate(int first_photon, int nthreads, int *photon_offsets, curandState *rng_states, 
+			  float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, 
+			  unsigned int *histories, int *last_hit_triangles, int max_steps)
 {
 	int id = blockIdx.x*blockDim.x + threadIdx.x;
 
@@ -269,16 +271,18 @@ __global__ void propagate(int nthreads, curandState *rng_states, float3 *positio
 
 	curandState rng = rng_states[id];
 
+	int photon_id = photon_offsets[first_photon + id];
+
 	Photon p;
-	p.position = positions[id];
-	p.direction = directions[id];
+	p.position = positions[photon_id];
+	p.direction = directions[photon_id];
 	p.direction /= norm(p.direction);
-	p.polarization = polarizations[id];
+	p.polarization = polarizations[photon_id];
 	p.polarization /= norm(p.polarization);
-	p.wavelength = wavelengths[id];
-	p.time = times[id];
-	p.last_hit_triangle = last_hit_triangles[id];
-	p.history = histories[id];
+	p.wavelength = wavelengths[photon_id];
+	p.time = times[photon_id];
+	p.last_hit_triangle = last_hit_triangles[photon_id];
+	p.history = histories[photon_id];
 
 	if (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB))
 		return;
@@ -321,13 +325,13 @@ __global__ void propagate(int nthreads, curandState *rng_states, float3 *positio
 	} // while (steps < max_steps)
 
 	rng_states[id] = rng;
-	positions[id] = p.position;
-	directions[id] = p.direction;
-	polarizations[id] = p.polarization;
-	wavelengths[id] = p.wavelength;
-	times[id] = p.time;
-	histories[id] = p.history;
-	last_hit_triangles[id] = p.last_hit_triangle;
+	positions[photon_id] = p.position;
+	directions[photon_id] = p.direction;
+	polarizations[photon_id] = p.polarization;
+	wavelengths[photon_id] = p.wavelength;
+	times[photon_id] = p.time;
+	histories[photon_id] = p.history;
+	last_hit_triangles[photon_id] = p.last_hit_triangle;
 
 } // propagate