diff options
Diffstat (limited to 'gpu.py')
-rw-r--r-- | gpu.py | 76 |
1 files changed, 61 insertions, 15 deletions
@@ -15,6 +15,26 @@ from color import map_to_color cuda.init() +def boolean_argsort(condition): + '''Returns two arrays of indicies indicating which elements of the + boolean array `condition` should be exchanged so that all of the + True entries occur before the false entries. + + Returns: tuple (a,b, ntrue) which give the indices for elements + to exchange in a and b, and the number of true entries in the array + in ntrue. This function in general requires fewer swaps than + numpy.argsort. + ''' + + true_indices = condition.nonzero()[0][::-1] # reverse + false_indices = (~condition).nonzero()[0] + + length = min(len(true_indices), len(false_indices)) + + cut_index = np.searchsorted((true_indices[:length] - false_indices[:length]) <= 0, True) + return (true_indices[:cut_index], false_indices[:cut_index], len(true_indices)) + + def chunk_iterator(nelements, nthreads_per_block, max_blocks): '''Iterator that yields tuples with the values requried to process a long array in multiple kernel passes on the GPU. @@ -27,7 +47,7 @@ def chunk_iterator(nelements, nthreads_per_block, max_blocks): first = 0 while first < nelements: elements_left = nelements - first - blocks = elements_left // nthreads_per_block + blocks = int(elements_left // nthreads_per_block) if elements_left % nthreads_per_block != 0: blocks += 1 #Round up only if needed blocks = min(max_blocks, blocks) @@ -80,7 +100,7 @@ class GPU(object): self.module = SourceModule(src.kernel, options=cuda_options, no_extern_c=True) self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids']) - self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate']) + self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate', 'swap']) self.nthread_per_block = 64 self.max_blocks = 1024 self.daq_module = SourceModule(src.daq, options=cuda_options, no_extern_c=True) @@ -230,7 +250,6 @@ class GPU(object): self.polarizations_gpu = gpuarray.to_gpu(pol.astype(np.float32).view(gpuarray.vec.float3)) self.wavelengths_gpu = gpuarray.to_gpu(wavelength.astype(np.float32)) self.times_gpu = gpuarray.to_gpu(t0.astype(np.float32)) - self.photon_offsets_gpu = gpuarray.arange(self.nphotons, dtype=np.int32) if histories is not None: self.histories_gpu = gpuarray.to_gpu(histories.astype(np.uint32)) @@ -250,18 +269,45 @@ class GPU(object): May be called repeatedly without reloading photon information if single-stepping through photon history. ''' - for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks): - self.prop_funcs.propagate(np.int32(first_photon), - np.int32(photons_this_round), - self.photon_offsets_gpu, - self.rng_states_gpu, self.positions_gpu, - self.directions_gpu, - self.wavelengths_gpu, self.polarizations_gpu, - self.times_gpu, self.histories_gpu, - self.last_hit_triangles_gpu, - np.int32(max_steps), - block=(self.nthread_per_block,1,1), - grid=(blocks, 1)) + nphotons = self.nphotons + step = 0 + input_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32) + input_queue[1:] = np.arange(self.nphotons, dtype=np.uint32) + input_queue_gpu = gpuarray.to_gpu(input_queue) + output_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32) + output_queue[0] = 1 + output_queue_gpu = gpuarray.to_gpu(output_queue) + + + while step < max_steps: + ## Just finish the rest of the steps if the # of photons is low + if nphotons < self.nthread_per_block * 16 * 8: + nsteps = max_steps - step + else: + nsteps = 1 + + for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthread_per_block, self.max_blocks): + self.prop_funcs.propagate(np.int32(first_photon), + np.int32(photons_this_round), + input_queue_gpu[1:], + output_queue_gpu, + self.rng_states_gpu, self.positions_gpu, + self.directions_gpu, + self.wavelengths_gpu, self.polarizations_gpu, + self.times_gpu, self.histories_gpu, + self.last_hit_triangles_gpu, + np.int32(nsteps), + block=(self.nthread_per_block,1,1), + grid=(blocks, 1)) + + step += nsteps + + if step < max_steps: + temp = input_queue_gpu + input_queue_gpu = output_queue_gpu + output_queue_gpu = temp + output_queue_gpu[:1].set(np.uint32(1)) + nphotons = input_queue_gpu[:1].get()[0] if 'profile' in __builtins__: self.context.synchronize() |