diff options
-rw-r--r-- | gpu.py | 89 | ||||
-rw-r--r-- | src/daq.cu | 9 | ||||
-rw-r--r-- | src/kernel.cu | 34 |
3 files changed, 83 insertions, 49 deletions
@@ -15,6 +15,28 @@ from color import map_to_color cuda.init() +def chunk_iterator(nelements, nthreads_per_block, max_blocks): + '''Iterator that yields tuples with the values requried to process + a long array in multiple kernel passes on the GPU. + + Each yielded value is (first_index, elements_this_iteration, nblocks_this_iteration) + + >>> list(chunk_iterator(300, 32, 2)) + [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 9, 1)] + ''' + first = 0 + while first < nelements: + elements_left = nelements - first + blocks = elements_left // nthreads_per_block + if elements_left % nthreads_per_block != 0: + blocks += 1 #Round up only if needed + blocks = min(max_blocks, blocks) + elements_this_round = min(elements_left, blocks * nthreads_per_block) + + yield (first, elements_this_round, blocks) + first += elements_this_round + + def format_size(size): if size < 1e3: return '%.1f%s' % (size, ' ') @@ -54,8 +76,8 @@ class GPU(object): self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids']) self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate']) - self.nblocks = 64 - self.max_nthreads = 200000 + self.nthread_per_block = 64 + self.max_blocks = 1024 self.daq_module = SourceModule(src.daq, options=['-I' + src.dir], no_extern_c=True, cache_dir=False) self.daq_funcs = CUDAFuncs(self.daq_module, ['reset_earliest_time_int', 'run_daq', @@ -166,15 +188,16 @@ class GPU(object): solid_ids_gpu = gpuarray.to_gpu(np.array(solid_ids, dtype=np.int32)) solid_colors_gpu = gpuarray.to_gpu(np.array(colors, dtype=np.uint32)) - self.geo_funcs.color_solids(np.int32(solid_ids_gpu.size), np.uint32(self.triangles_gpu.size), self.solid_id_map_gpu, solid_ids_gpu, solid_colors_gpu, block=(self.nblocks,1,1), grid=(solid_ids_gpu.size//self.nblocks+1,1)) + self.geo_funcs.color_solids(np.int32(solid_ids_gpu.size), np.uint32(self.triangles_gpu.size), self.solid_id_map_gpu, solid_ids_gpu, solid_colors_gpu, block=(self.nblock,1,1), grid=(solid_ids_gpu.size//self.nthread_per_block+1,1)) self.context.synchronize() def setup_propagate(self, seed=1): - self.rng_states_gpu = cuda.mem_alloc(self.max_nthreads*sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) - self.prop_funcs.init_rng(np.int32(self.max_nthreads), self.rng_states_gpu, + self.rng_states_gpu = cuda.mem_alloc(self.nthread_per_block * self.max_blocks + * sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) + self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthread_per_block), self.rng_states_gpu, np.uint64(seed), np.uint64(0), - block=(self.nblocks,1,1), - grid=(self.max_nthreads//self.nblocks+1,1)) + block=(self.nthread_per_block,1,1), + grid=(self.max_blocks,1)) #self.context.synchronize() def load_photons(self, pos, dir, pol, wavelength, t0, @@ -193,7 +216,6 @@ class GPU(object): if any. -1 if no triangle was hit in the last step ''' self.nphotons = len(pos) - assert self.nphotons < self.max_nthreads assert len(dir) == self.nphotons assert len(pol) == self.nphotons assert len(wavelength) == self.nphotons @@ -204,6 +226,7 @@ class GPU(object): self.polarizations_gpu = gpuarray.to_gpu(pol.astype(np.float32).view(gpuarray.vec.float3)) self.wavelengths_gpu = gpuarray.to_gpu(wavelength.astype(np.float32)) self.times_gpu = gpuarray.to_gpu(t0.astype(np.float32)) + self.photon_offsets_gpu = gpuarray.arange(self.nphotons, dtype=np.int32) if histories is not None: self.histories_gpu = gpuarray.to_gpu(histories.astype(np.uint32)) @@ -216,8 +239,6 @@ class GPU(object): self.last_hit_triangles_gpu = gpuarray.empty(self.nphotons, dtype=np.int32) self.last_hit_triangles_gpu.fill(-1) - self.per_photon_kernel_config = {'block': (self.nblocks,1,1), - 'grid': (self.nphotons//self.nblocks+1,1)} def propagate(self, max_steps=10): '''Propagate photons on GPU to termination or max_steps, whichever comes first. @@ -225,13 +246,19 @@ class GPU(object): May be called repeatedly without reloading photon information if single-stepping through photon history. ''' - self.prop_funcs.propagate(np.int32(self.nphotons), - self.rng_states_gpu, self.positions_gpu, - self.directions_gpu, - self.wavelengths_gpu, self.polarizations_gpu, - self.times_gpu, self.histories_gpu, - self.last_hit_triangles_gpu, - np.int32(max_steps), **self.per_photon_kernel_config) + for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks): + self.prop_funcs.propagate(np.int32(first_photon), + np.int32(photons_this_round), + self.photon_offsets_gpu, + self.rng_states_gpu, self.positions_gpu, + self.directions_gpu, + self.wavelengths_gpu, self.polarizations_gpu, + self.times_gpu, self.histories_gpu, + self.last_hit_triangles_gpu, + np.int32(max_steps), + block=(self.nthread_per_block,1,1), + grid=(blocks, 1)) + #self.context.synchronize() def get_photons(self): @@ -257,24 +284,26 @@ class GPU(object): self.daq_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, - block=(self.nblocks,1,1), - grid=(len(self.earliest_time_int_gpu)//self.nblocks+1,1)) + block=(self.nthread_per_block,1,1), + grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1)) #self.context.synchronize() - self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2), - np.float32(self.daq_pmt_rms), - np.int32(len(self.positions_gpu)), - self.times_gpu, self.histories_gpu, - self.last_hit_triangles_gpu, - self.solid_id_map_gpu, - np.int32(len(self.earliest_time_int_gpu)), - self.earliest_time_int_gpu, - block=(self.nblocks,1,1), grid=(self.nphotons//self.nblocks+1,1)) + for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks): + self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2), + np.float32(self.daq_pmt_rms), + np.int32(first_photon), + np.int32(photons_this_round), + self.times_gpu, self.histories_gpu, + self.last_hit_triangles_gpu, + self.solid_id_map_gpu, + np.int32(len(self.earliest_time_int_gpu)), + self.earliest_time_int_gpu, + block=(self.nthread_per_block,1,1), grid=(blocks,1)) #self.context.synchronize() self.daq_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, - block=(self.nblocks,1,1), - grid=(len(self.earliest_time_int_gpu)//self.nblocks+1,1)) + block=(self.nthread_per_block,1,1), + grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1)) #self.context.synchronize() @@ -30,6 +30,7 @@ __global__ void reset_earliest_time_int(float maxtime, int ntime_ints, unsigned __global__ void run_daq(curandState *s, unsigned int detection_state, float time_rms, + int first_photon, int nphotons, float *photon_times, unsigned int *photon_histories, int *last_hit_triangles, int *solid_map, @@ -41,17 +42,17 @@ __global__ void run_daq(curandState *s, unsigned int detection_state, if (id < nphotons) { curandState rng = s[id]; - - int triangle_id = last_hit_triangles[id]; + int photon_id = id + first_photon; + int triangle_id = last_hit_triangles[photon_id]; if (triangle_id > -1) { int solid_id = solid_map[triangle_id]; - int history = photon_histories[id]; + int history = photon_histories[photon_id]; if (solid_id < nsolids && (history & detection_state)) { - float time = photon_times[id] + curand_normal(&rng) * time_rms; + float time = photon_times[photon_id] + curand_normal(&rng) * time_rms; unsigned int time_int = float_to_sortable_int(time); atomicMin(earliest_time_int + solid_id, time_int); diff --git a/src/kernel.cu b/src/kernel.cu index 82efe88..17b829c 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -260,7 +260,9 @@ __global__ void ray_trace(int nthreads, float3 *positions, float3 *directions, i } // ray_trace -__global__ void propagate(int nthreads, curandState *rng_states, float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, unsigned int *histories, int *last_hit_triangles, int max_steps) +__global__ void propagate(int first_photon, int nthreads, int *photon_offsets, curandState *rng_states, + float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, + unsigned int *histories, int *last_hit_triangles, int max_steps) { int id = blockIdx.x*blockDim.x + threadIdx.x; @@ -269,16 +271,18 @@ __global__ void propagate(int nthreads, curandState *rng_states, float3 *positio curandState rng = rng_states[id]; + int photon_id = photon_offsets[first_photon + id]; + Photon p; - p.position = positions[id]; - p.direction = directions[id]; + p.position = positions[photon_id]; + p.direction = directions[photon_id]; p.direction /= norm(p.direction); - p.polarization = polarizations[id]; + p.polarization = polarizations[photon_id]; p.polarization /= norm(p.polarization); - p.wavelength = wavelengths[id]; - p.time = times[id]; - p.last_hit_triangle = last_hit_triangles[id]; - p.history = histories[id]; + p.wavelength = wavelengths[photon_id]; + p.time = times[photon_id]; + p.last_hit_triangle = last_hit_triangles[photon_id]; + p.history = histories[photon_id]; if (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) return; @@ -321,13 +325,13 @@ __global__ void propagate(int nthreads, curandState *rng_states, float3 *positio } // while (steps < max_steps) rng_states[id] = rng; - positions[id] = p.position; - directions[id] = p.direction; - polarizations[id] = p.polarization; - wavelengths[id] = p.wavelength; - times[id] = p.time; - histories[id] = p.history; - last_hit_triangles[id] = p.last_hit_triangle; + positions[photon_id] = p.position; + directions[photon_id] = p.direction; + polarizations[photon_id] = p.polarization; + wavelengths[photon_id] = p.wavelength; + times[photon_id] = p.time; + histories[photon_id] = p.history; + last_hit_triangles[photon_id] = p.last_hit_triangle; } // propagate |