diff options
author | Stan Seibert <stan@mtrr.org> | 2011-08-08 20:16:33 -0400 |
---|---|---|
committer | Stan Seibert <stan@mtrr.org> | 2011-08-08 20:16:33 -0400 |
commit | edd78c209c88a652691cc5602b37d79085fee795 (patch) | |
tree | dfd12bb15893dbbea5d5c907d12c9d01f6fee9a0 | |
parent | 2e80fdc7e7dbb84da0f37bb159e08ed618fe15f5 (diff) | |
download | chroma-edd78c209c88a652691cc5602b37d79085fee795.tar.gz chroma-edd78c209c88a652691cc5602b37d79085fee795.tar.bz2 chroma-edd78c209c88a652691cc5602b37d79085fee795.zip |
propagate() takes an array of photon offsets and a range of
offsets to load. Now events with more photons than RNG states
can be propagated through multiple kernel calls.
Also lays the groundwork for consolidating photons between steps
to reduce the amount of propagation work required.
-rw-r--r-- | gpu.py | 89 | ||||
-rw-r--r-- | src/daq.cu | 9 | ||||
-rw-r--r-- | src/kernel.cu | 34 |
3 files changed, 83 insertions, 49 deletions
@@ -15,6 +15,28 @@ from color import map_to_color cuda.init() +def chunk_iterator(nelements, nthreads_per_block, max_blocks): + '''Iterator that yields tuples with the values requried to process + a long array in multiple kernel passes on the GPU. + + Each yielded value is (first_index, elements_this_iteration, nblocks_this_iteration) + + >>> list(chunk_iterator(300, 32, 2)) + [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)] + ''' + first = 0 + while first < nelements: + elements_left = nelements - first + blocks = elements_left // nthreads_per_block + if elements_left % nthreads_per_block != 0: + blocks += 1 #Round up only if needed + blocks = min(max_blocks, blocks) + elements_this_round = min(elements_left, blocks * nthreads_per_block) + + yield (first, elements_this_round, blocks) + first += elements_this_round + + def format_size(size): if size < 1e3: return '%.1f%s' % (size, ' ') @@ -54,8 +76,8 @@ class GPU(object): self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids']) self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate']) - self.nblocks = 64 - self.max_nthreads = 200000 + self.nthread_per_block = 64 + self.max_blocks = 1024 self.daq_module = SourceModule(src.daq, options=['-I' + src.dir], no_extern_c=True, cache_dir=False) self.daq_funcs = CUDAFuncs(self.daq_module, ['reset_earliest_time_int', 'run_daq', @@ -166,15 +188,16 @@ class GPU(object): solid_ids_gpu = gpuarray.to_gpu(np.array(solid_ids, dtype=np.int32)) solid_colors_gpu = gpuarray.to_gpu(np.array(colors, dtype=np.uint32)) - self.geo_funcs.color_solids(np.int32(solid_ids_gpu.size), np.uint32(self.triangles_gpu.size), self.solid_id_map_gpu, solid_ids_gpu, solid_colors_gpu, block=(self.nblocks,1,1), grid=(solid_ids_gpu.size//self.nblocks+1,1)) + self.geo_funcs.color_solids(np.int32(solid_ids_gpu.size), np.uint32(self.triangles_gpu.size), self.solid_id_map_gpu, solid_ids_gpu, solid_colors_gpu, block=(self.nblock,1,1), grid=(solid_ids_gpu.size//self.nthread_per_block+1,1)) self.context.synchronize() def setup_propagate(self, seed=1): - self.rng_states_gpu = cuda.mem_alloc(self.max_nthreads*sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) - self.prop_funcs.init_rng(np.int32(self.max_nthreads), self.rng_states_gpu, + self.rng_states_gpu = cuda.mem_alloc(self.nthread_per_block * self.max_blocks + * sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) + self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthread_per_block), self.rng_states_gpu, np.uint64(seed), np.uint64(0), - block=(self.nblocks,1,1), - grid=(self.max_nthreads//self.nblocks+1,1)) + block=(self.nthread_per_block,1,1), + grid=(self.max_blocks,1)) #self.context.synchronize() def load_photons(self, pos, dir, pol, wavelength, t0, @@ -193,7 +216,6 @@ class GPU(object): if any. -1 if no triangle was hit in the last step ''' self.nphotons = len(pos) - assert self.nphotons < self.max_nthreads assert len(dir) == self.nphotons assert len(pol) == self.nphotons assert len(wavelength) == self.nphotons @@ -204,6 +226,7 @@ class GPU(object): self.polarizations_gpu = gpuarray.to_gpu(pol.astype(np.float32).view(gpuarray.vec.float3)) self.wavelengths_gpu = gpuarray.to_gpu(wavelength.astype(np.float32)) self.times_gpu = gpuarray.to_gpu(t0.astype(np.float32)) + self.photon_offsets_gpu = gpuarray.arange(self.nphotons, dtype=np.int32) if histories is not None: self.histories_gpu = gpuarray.to_gpu(histories.astype(np.uint32)) @@ -216,8 +239,6 @@ class GPU(object): self.last_hit_triangles_gpu = gpuarray.empty(self.nphotons, dtype=np.int32) self.last_hit_triangles_gpu.fill(-1) - self.per_photon_kernel_config = {'block': (self.nblocks,1,1), - 'grid': (self.nphotons//self.nblocks+1,1)} def propagate(self, max_steps=10): '''Propagate photons on GPU to termination or max_steps, whichever comes first. @@ -225,13 +246,19 @@ class GPU(object): May be called repeatedly without reloading photon information if single-stepping through photon history. ''' - self.prop_funcs.propagate(np.int32(self.nphotons), - self.rng_states_gpu, self.positions_gpu, - self.directions_gpu, - self.wavelengths_gpu, self.polarizations_gpu, - self.times_gpu, self.histories_gpu, - self.last_hit_triangles_gpu, - np.int32(max_steps), **self.per_photon_kernel_config) + for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks): + self.prop_funcs.propagate(np.int32(first_photon), + np.int32(photons_this_round), + self.photon_offsets_gpu, + self.rng_states_gpu, self.positions_gpu, + self.directions_gpu, + self.wavelengths_gpu, self.polarizations_gpu, + self.times_gpu, self.histories_gpu, + self.last_hit_triangles_gpu, + np.int32(max_steps), + block=(self.nthread_per_block,1,1), + grid=(blocks, 1)) + #self.context.synchronize() def get_photons(self): @@ -257,24 +284,26 @@ class GPU(object): self.daq_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, - block=(self.nblocks,1,1), - grid=(len(self.earliest_time_int_gpu)//self.nblocks+1,1)) + block=(self.nthread_per_block,1,1), + grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1)) #self.context.synchronize() - self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2), - np.float32(self.daq_pmt_rms), - np.int32(len(self.positions_gpu)), - self.times_gpu, self.histories_gpu, - self.last_hit_triangles_gpu, - self.solid_id_map_gpu, - np.int32(len(self.earliest_time_int_gpu)), - self.earliest_time_int_gpu, - block=(self.nblocks,1,1), grid=(self.nphotons//self.nblocks+1,1)) + for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks): + self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2), + np.float32(self.daq_pmt_rms), + np.int32(first_photon), + np.int32(photons_this_round), + self.times_gpu, self.histories_gpu, + self.last_hit_triangles_gpu, + self.solid_id_map_gpu, + np.int32(len(self.earliest_time_int_gpu)), + self.earliest_time_int_gpu, + block=(self.nthread_per_block,1,1), grid=(blocks,1)) #self.context.synchronize() self.daq_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, - block=(self.nblocks,1,1), - grid=(len(self.earliest_time_int_gpu)//self.nblocks+1,1)) + block=(self.nthread_per_block,1,1), + grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1)) #self.context.synchronize() @@ -30,6 +30,7 @@ __global__ void reset_earliest_time_int(float maxtime, int ntime_ints, unsigned __global__ void run_daq(curandState *s, unsigned int detection_state, float time_rms, + int first_photon, int nphotons, float *photon_times, unsigned int *photon_histories, int *last_hit_triangles, int *solid_map, @@ -41,17 +42,17 @@ __global__ void run_daq(curandState *s, unsigned int detection_state, if (id < nphotons) { curandState rng = s[id]; - - int triangle_id = last_hit_triangles[id]; + int photon_id = id + first_photon; + int triangle_id = last_hit_triangles[photon_id]; if (triangle_id > -1) { int solid_id = solid_map[triangle_id]; - int history = photon_histories[id]; + int history = photon_histories[photon_id]; if (solid_id < nsolids && (history & detection_state)) { - float time = photon_times[id] + curand_normal(&rng) * time_rms; + float time = photon_times[photon_id] + curand_normal(&rng) * time_rms; unsigned int time_int = float_to_sortable_int(time); atomicMin(earliest_time_int + solid_id, time_int); diff --git a/src/kernel.cu b/src/kernel.cu index 82efe88..17b829c 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -260,7 +260,9 @@ __global__ void ray_trace(int nthreads, float3 *positions, float3 *directions, i } // ray_trace -__global__ void propagate(int nthreads, curandState *rng_states, float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, unsigned int *histories, int *last_hit_triangles, int max_steps) +__global__ void propagate(int first_photon, int nthreads, int *photon_offsets, curandState *rng_states, + float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, + unsigned int *histories, int *last_hit_triangles, int max_steps) { int id = blockIdx.x*blockDim.x + threadIdx.x; @@ -269,16 +271,18 @@ __global__ void propagate(int nthreads, curandState *rng_states, float3 *positio curandState rng = rng_states[id]; + int photon_id = photon_offsets[first_photon + id]; + Photon p; - p.position = positions[id]; - p.direction = directions[id]; + p.position = positions[photon_id]; + p.direction = directions[photon_id]; p.direction /= norm(p.direction); - p.polarization = polarizations[id]; + p.polarization = polarizations[photon_id]; p.polarization /= norm(p.polarization); - p.wavelength = wavelengths[id]; - p.time = times[id]; - p.last_hit_triangle = last_hit_triangles[id]; - p.history = histories[id]; + p.wavelength = wavelengths[photon_id]; + p.time = times[photon_id]; + p.last_hit_triangle = last_hit_triangles[photon_id]; + p.history = histories[photon_id]; if (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) return; @@ -321,13 +325,13 @@ __global__ void propagate(int nthreads, curandState *rng_states, float3 *positio } // while (steps < max_steps) rng_states[id] = rng; - positions[id] = p.position; - directions[id] = p.direction; - polarizations[id] = p.polarization; - wavelengths[id] = p.wavelength; - times[id] = p.time; - histories[id] = p.history; - last_hit_triangles[id] = p.last_hit_triangle; + positions[photon_id] = p.position; + directions[photon_id] = p.direction; + polarizations[photon_id] = p.polarization; + wavelengths[photon_id] = p.wavelength; + times[photon_id] = p.time; + histories[photon_id] = p.history; + last_hit_triangles[photon_id] = p.last_hit_triangle; } // propagate |