propagate() takes an array of photon offsets and a range of

offsets to load. Now events with more photons than RNG states can be propagated through multiple kernel calls. Also lays the groundwork for consolidating photons between steps to reduce the amount of propagation work required.
author: Stan Seibert <stan@mtrr.org> 2011-08-08 20:16:33 -0400
committer: Stan Seibert <stan@mtrr.org> 2011-08-08 20:16:33 -0400
commit: edd78c209c88a652691cc5602b37d79085fee795 (patch)
tree: dfd12bb15893dbbea5d5c907d12c9d01f6fee9a0
parent: 2e80fdc7e7dbb84da0f37bb159e08ed618fe15f5 (diff)
download: chroma-edd78c209c88a652691cc5602b37d79085fee795.tar.gz
chroma-edd78c209c88a652691cc5602b37d79085fee795.tar.bz2
chroma-edd78c209c88a652691cc5602b37d79085fee795.zip
3 files changed, 83 insertions, 49 deletions
diff --git a/gpu.py b/gpu.py
index 8de0f90..a21bc38 100644
--- a/gpu.py
+++ b/gpu.py
@@ -15,6 +15,28 @@ from color import map_to_color
 
 cuda.init()
 
+def chunk_iterator(nelements, nthreads_per_block, max_blocks):
+    '''Iterator that yields tuples with the values requried to process
+    a long array in multiple kernel passes on the GPU.
+
+       Each yielded value is (first_index, elements_this_iteration, nblocks_this_iteration)
+
+    >>> list(chunk_iterator(300, 32, 2))
+    [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)]
+    '''
+    first = 0
+    while first < nelements:
+        elements_left = nelements - first
+        blocks = elements_left // nthreads_per_block
+        if elements_left % nthreads_per_block != 0:
+            blocks += 1 #Round up only if needed
+        blocks = min(max_blocks, blocks)
+        elements_this_round = min(elements_left, blocks * nthreads_per_block)
+
+        yield (first, elements_this_round, blocks)
+        first += elements_this_round
+    
+
 def format_size(size):
     if size < 1e3:
         return '%.1f%s' % (size, ' ')
@@ -54,8 +76,8 @@ class GPU(object):
         self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids'])
 
         self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate'])
-        self.nblocks = 64
-        self.max_nthreads = 200000
+        self.nthread_per_block = 64
+        self.max_blocks = 1024
         self.daq_module = SourceModule(src.daq, options=['-I' + src.dir], no_extern_c=True, cache_dir=False)
         self.daq_funcs = CUDAFuncs(self.daq_module, 
                               ['reset_earliest_time_int', 'run_daq',
@@ -166,15 +188,16 @@ class GPU(object):
         solid_ids_gpu = gpuarray.to_gpu(np.array(solid_ids, dtype=np.int32))
         solid_colors_gpu = gpuarray.to_gpu(np.array(colors, dtype=np.uint32))
 
-        self.geo_funcs.color_solids(np.int32(solid_ids_gpu.size), np.uint32(self.triangles_gpu.size), self.solid_id_map_gpu, solid_ids_gpu, solid_colors_gpu, block=(self.nblocks,1,1), grid=(solid_ids_gpu.size//self.nblocks+1,1))
+        self.geo_funcs.color_solids(np.int32(solid_ids_gpu.size), np.uint32(self.triangles_gpu.size), self.solid_id_map_gpu, solid_ids_gpu, solid_colors_gpu, block=(self.nblock,1,1), grid=(solid_ids_gpu.size//self.nthread_per_block+1,1))
         self.context.synchronize()
 
     def setup_propagate(self, seed=1):
-        self.rng_states_gpu = cuda.mem_alloc(self.max_nthreads*sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
-        self.prop_funcs.init_rng(np.int32(self.max_nthreads), self.rng_states_gpu, 
+        self.rng_states_gpu = cuda.mem_alloc(self.nthread_per_block * self.max_blocks
+                                             * sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
+        self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthread_per_block), self.rng_states_gpu, 
                                  np.uint64(seed), np.uint64(0), 
-                                 block=(self.nblocks,1,1), 
-                                 grid=(self.max_nthreads//self.nblocks+1,1))
+                                 block=(self.nthread_per_block,1,1), 
+                                 grid=(self.max_blocks,1))
         #self.context.synchronize()
 
     def load_photons(self, pos, dir, pol, wavelength, t0, 
@@ -193,7 +216,6 @@ class GPU(object):
                                if any.  -1 if no triangle was hit in the last step
         '''
         self.nphotons = len(pos)
-        assert self.nphotons < self.max_nthreads
         assert len(dir) == self.nphotons
         assert len(pol) == self.nphotons
         assert len(wavelength) == self.nphotons
@@ -204,6 +226,7 @@ class GPU(object):
         self.polarizations_gpu = gpuarray.to_gpu(pol.astype(np.float32).view(gpuarray.vec.float3))
         self.wavelengths_gpu = gpuarray.to_gpu(wavelength.astype(np.float32))
         self.times_gpu = gpuarray.to_gpu(t0.astype(np.float32))
+        self.photon_offsets_gpu = gpuarray.arange(self.nphotons, dtype=np.int32)
 
         if histories is not None:
             self.histories_gpu = gpuarray.to_gpu(histories.astype(np.uint32))
@@ -216,8 +239,6 @@ class GPU(object):
             self.last_hit_triangles_gpu = gpuarray.empty(self.nphotons, dtype=np.int32)
             self.last_hit_triangles_gpu.fill(-1)
 
-        self.per_photon_kernel_config = {'block': (self.nblocks,1,1), 
-                                         'grid': (self.nphotons//self.nblocks+1,1)}
 
     def propagate(self, max_steps=10):
         '''Propagate photons on GPU to termination or max_steps, whichever comes first.
@@ -225,13 +246,19 @@ class GPU(object):
         May be called repeatedly without reloading photon information if single-stepping
         through photon history.
         '''
-        self.prop_funcs.propagate(np.int32(self.nphotons), 
-                                 self.rng_states_gpu, self.positions_gpu, 
-                                 self.directions_gpu, 
-                                 self.wavelengths_gpu, self.polarizations_gpu, 
-                                 self.times_gpu, self.histories_gpu, 
-                                 self.last_hit_triangles_gpu, 
-                                 np.int32(max_steps), **self.per_photon_kernel_config)
+        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks):
+            self.prop_funcs.propagate(np.int32(first_photon),
+                                      np.int32(photons_this_round),
+                                      self.photon_offsets_gpu,
+                                      self.rng_states_gpu, self.positions_gpu, 
+                                      self.directions_gpu, 
+                                      self.wavelengths_gpu, self.polarizations_gpu, 
+                                      self.times_gpu, self.histories_gpu, 
+                                      self.last_hit_triangles_gpu, 
+                                      np.int32(max_steps), 
+                                      block=(self.nthread_per_block,1,1),
+                                      grid=(blocks, 1))
+
         #self.context.synchronize()
 
     def get_photons(self):
@@ -257,24 +284,26 @@ class GPU(object):
         self.daq_funcs.reset_earliest_time_int(np.float32(1e9), 
                                                np.int32(len(self.earliest_time_int_gpu)), 
                                                self.earliest_time_int_gpu, 
-                                               block=(self.nblocks,1,1), 
-                                               grid=(len(self.earliest_time_int_gpu)//self.nblocks+1,1))
+                                               block=(self.nthread_per_block,1,1), 
+                                               grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1))
         #self.context.synchronize()
-        self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2), 
-                               np.float32(self.daq_pmt_rms), 
-                               np.int32(len(self.positions_gpu)), 
-                               self.times_gpu, self.histories_gpu,
-                               self.last_hit_triangles_gpu, 
-                               self.solid_id_map_gpu, 
-                               np.int32(len(self.earliest_time_int_gpu)), 
-                               self.earliest_time_int_gpu, 
-                               block=(self.nblocks,1,1), grid=(self.nphotons//self.nblocks+1,1))
+        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks):
+            self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2), 
+                                   np.float32(self.daq_pmt_rms), 
+                                   np.int32(first_photon),
+                                   np.int32(photons_this_round), 
+                                   self.times_gpu, self.histories_gpu,
+                                   self.last_hit_triangles_gpu, 
+                                   self.solid_id_map_gpu, 
+                                   np.int32(len(self.earliest_time_int_gpu)), 
+                                   self.earliest_time_int_gpu, 
+                                   block=(self.nthread_per_block,1,1), grid=(blocks,1))
         #self.context.synchronize()
         self.daq_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), 
                                                      self.earliest_time_int_gpu, 
                                                      self.earliest_time_gpu, 
-                                                     block=(self.nblocks,1,1), 
-                                                     grid=(len(self.earliest_time_int_gpu)//self.nblocks+1,1))
+                                                     block=(self.nthread_per_block,1,1), 
+                                                     grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1))
         #self.context.synchronize()
         
 
diff --git a/src/daq.cu b/src/daq.cu
index 2b5f9b4..7c5e6a5 100644
--- a/src/daq.cu
+++ b/src/daq.cu
@@ -30,6 +30,7 @@ __global__ void reset_earliest_time_int(float maxtime, int ntime_ints, unsigned
 
 __global__ void run_daq(curandState *s, unsigned int detection_state,
 			float time_rms,
+			int first_photon,
 			int nphotons, float *photon_times,
 			unsigned int *photon_histories,
 			int *last_hit_triangles, int *solid_map,
@@ -41,17 +42,17 @@ __global__ void run_daq(curandState *s, unsigned int detection_state,
 	if (id < nphotons)
 	{
 		curandState rng = s[id];
-
-		int triangle_id = last_hit_triangles[id];
+		int photon_id = id + first_photon;
+		int triangle_id = last_hit_triangles[photon_id];
 		
 		if (triangle_id > -1)
 		{
 			int solid_id = solid_map[triangle_id];
-			int history = photon_histories[id];
+			int history = photon_histories[photon_id];
 
 			if (solid_id < nsolids && (history & detection_state)) 
 			{
-			  float time = photon_times[id] + curand_normal(&rng) * time_rms;
+			  float time = photon_times[photon_id] + curand_normal(&rng) * time_rms;
 			  unsigned int time_int = float_to_sortable_int(time);
 			  
 			  atomicMin(earliest_time_int + solid_id, time_int);
diff --git a/src/kernel.cu b/src/kernel.cu
index 82efe88..17b829c 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -260,7 +260,9 @@ __global__ void ray_trace(int nthreads, float3 *positions, float3 *directions, i
 
 } // ray_trace
 
-__global__ void propagate(int nthreads, curandState *rng_states, float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, unsigned int *histories, int *last_hit_triangles, int max_steps)
+__global__ void propagate(int first_photon, int nthreads, int *photon_offsets, curandState *rng_states, 
+			  float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, 
+			  unsigned int *histories, int *last_hit_triangles, int max_steps)
 {
 	int id = blockIdx.x*blockDim.x + threadIdx.x;
 
@@ -269,16 +271,18 @@ __global__ void propagate(int nthreads, curandState *rng_states, float3 *positio
 
 	curandState rng = rng_states[id];
 
+	int photon_id = photon_offsets[first_photon + id];
+
 	Photon p;
-	p.position = positions[id];
-	p.direction = directions[id];
+	p.position = positions[photon_id];
+	p.direction = directions[photon_id];
 	p.direction /= norm(p.direction);
-	p.polarization = polarizations[id];
+	p.polarization = polarizations[photon_id];
 	p.polarization /= norm(p.polarization);
-	p.wavelength = wavelengths[id];
-	p.time = times[id];
-	p.last_hit_triangle = last_hit_triangles[id];
-	p.history = histories[id];
+	p.wavelength = wavelengths[photon_id];
+	p.time = times[photon_id];
+	p.last_hit_triangle = last_hit_triangles[photon_id];
+	p.history = histories[photon_id];
 
 	if (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB))
 		return;
@@ -321,13 +325,13 @@ __global__ void propagate(int nthreads, curandState *rng_states, float3 *positio
 	} // while (steps < max_steps)
 
 	rng_states[id] = rng;
-	positions[id] = p.position;
-	directions[id] = p.direction;
-	polarizations[id] = p.polarization;
-	wavelengths[id] = p.wavelength;
-	times[id] = p.time;
-	histories[id] = p.history;
-	last_hit_triangles[id] = p.last_hit_triangle;
+	positions[photon_id] = p.position;
+	directions[photon_id] = p.direction;
+	polarizations[photon_id] = p.polarization;
+	wavelengths[photon_id] = p.wavelength;
+	times[photon_id] = p.time;
+	histories[photon_id] = p.history;
+	last_hit_triangles[photon_id] = p.last_hit_triangle;
 
 } // propagate
author	Stan Seibert <stan@mtrr.org>	2011-08-08 20:16:33 -0400
committer	Stan Seibert <stan@mtrr.org>	2011-08-08 20:16:33 -0400
commit	edd78c209c88a652691cc5602b37d79085fee795 (patch)
tree	dfd12bb15893dbbea5d5c907d12c9d01f6fee9a0
parent	2e80fdc7e7dbb84da0f37bb159e08ed618fe15f5 (diff)
download	chroma-edd78c209c88a652691cc5602b37d79085fee795.tar.gz chroma-edd78c209c88a652691cc5602b37d79085fee795.tar.bz2 chroma-edd78c209c88a652691cc5602b37d79085fee795.zip