diff options
author | Anthony LaTorre <tlatorre9@gmail.com> | 2011-08-12 20:55:21 -0400 |
---|---|---|
committer | Anthony LaTorre <tlatorre9@gmail.com> | 2011-08-12 20:55:21 -0400 |
commit | 2610b77628c6c31feb167b40b6018fdfdb1846c3 (patch) | |
tree | a5c59c0cf324c41363913ccce4b543e65124d72c | |
parent | 272c555f6b893a05f6d6a6439d519036e9379075 (diff) | |
parent | 47738282858a7f30acef633ed0de2a6e933e9c7f (diff) | |
download | chroma-2610b77628c6c31feb167b40b6018fdfdb1846c3.tar.gz chroma-2610b77628c6c31feb167b40b6018fdfdb1846c3.tar.bz2 chroma-2610b77628c6c31feb167b40b6018fdfdb1846c3.zip |
merge
-rw-r--r-- | gpu.py | 76 | ||||
-rw-r--r-- | src/kernel.cu | 28 |
2 files changed, 86 insertions, 18 deletions
@@ -15,6 +15,26 @@ from color import map_to_color cuda.init() +def boolean_argsort(condition): + '''Returns two arrays of indicies indicating which elements of the + boolean array `condition` should be exchanged so that all of the + True entries occur before the false entries. + + Returns: tuple (a,b, ntrue) which give the indices for elements + to exchange in a and b, and the number of true entries in the array + in ntrue. This function in general requires fewer swaps than + numpy.argsort. + ''' + + true_indices = condition.nonzero()[0][::-1] # reverse + false_indices = (~condition).nonzero()[0] + + length = min(len(true_indices), len(false_indices)) + + cut_index = np.searchsorted((true_indices[:length] - false_indices[:length]) <= 0, True) + return (true_indices[:cut_index], false_indices[:cut_index], len(true_indices)) + + def chunk_iterator(nelements, nthreads_per_block, max_blocks): '''Iterator that yields tuples with the values requried to process a long array in multiple kernel passes on the GPU. @@ -27,7 +47,7 @@ def chunk_iterator(nelements, nthreads_per_block, max_blocks): first = 0 while first < nelements: elements_left = nelements - first - blocks = elements_left // nthreads_per_block + blocks = int(elements_left // nthreads_per_block) if elements_left % nthreads_per_block != 0: blocks += 1 #Round up only if needed blocks = min(max_blocks, blocks) @@ -80,7 +100,7 @@ class GPU(object): self.module = SourceModule(src.kernel, options=cuda_options, no_extern_c=True) self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids']) - self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate']) + self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate', 'swap']) self.nthread_per_block = 64 self.max_blocks = 1024 self.daq_module = SourceModule(src.daq, options=cuda_options, no_extern_c=True) @@ -230,7 +250,6 @@ class GPU(object): self.polarizations_gpu = gpuarray.to_gpu(pol.astype(np.float32).view(gpuarray.vec.float3)) self.wavelengths_gpu = gpuarray.to_gpu(wavelength.astype(np.float32)) self.times_gpu = gpuarray.to_gpu(t0.astype(np.float32)) - self.photon_offsets_gpu = gpuarray.arange(self.nphotons, dtype=np.int32) if histories is not None: self.histories_gpu = gpuarray.to_gpu(histories.astype(np.uint32)) @@ -250,18 +269,45 @@ class GPU(object): May be called repeatedly without reloading photon information if single-stepping through photon history. ''' - for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks): - self.prop_funcs.propagate(np.int32(first_photon), - np.int32(photons_this_round), - self.photon_offsets_gpu, - self.rng_states_gpu, self.positions_gpu, - self.directions_gpu, - self.wavelengths_gpu, self.polarizations_gpu, - self.times_gpu, self.histories_gpu, - self.last_hit_triangles_gpu, - np.int32(max_steps), - block=(self.nthread_per_block,1,1), - grid=(blocks, 1)) + nphotons = self.nphotons + step = 0 + input_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32) + input_queue[1:] = np.arange(self.nphotons, dtype=np.uint32) + input_queue_gpu = gpuarray.to_gpu(input_queue) + output_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32) + output_queue[0] = 1 + output_queue_gpu = gpuarray.to_gpu(output_queue) + + + while step < max_steps: + ## Just finish the rest of the steps if the # of photons is low + if nphotons < self.nthread_per_block * 16 * 8: + nsteps = max_steps - step + else: + nsteps = 1 + + for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthread_per_block, self.max_blocks): + self.prop_funcs.propagate(np.int32(first_photon), + np.int32(photons_this_round), + input_queue_gpu[1:], + output_queue_gpu, + self.rng_states_gpu, self.positions_gpu, + self.directions_gpu, + self.wavelengths_gpu, self.polarizations_gpu, + self.times_gpu, self.histories_gpu, + self.last_hit_triangles_gpu, + np.int32(nsteps), + block=(self.nthread_per_block,1,1), + grid=(blocks, 1)) + + step += nsteps + + if step < max_steps: + temp = input_queue_gpu + input_queue_gpu = output_queue_gpu + output_queue_gpu = temp + output_queue_gpu[:1].set(np.uint32(1)) + nphotons = input_queue_gpu[:1].get()[0] if 'profile' in __builtins__: self.context.synchronize() diff --git a/src/kernel.cu b/src/kernel.cu index a26460a..cc567b8 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -292,7 +292,24 @@ __global__ void ray_trace_alpha(int nthreads, float3 *positions, float3 *directi } // ray_trace -__global__ void propagate(int first_photon, int nthreads, int *photon_offsets, curandState *rng_states, +__global__ void swap(int *values, int nswap, int *offset_a, int *offset_b) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x; + + if (id < nswap) { + int a = offset_a[id]; + int b = offset_b[id]; + + int tmp = values[a]; + values[a] = values[b]; + values[b] = tmp; + } +} + +__global__ void propagate(int first_photon, int nthreads, + unsigned int *input_queue, + unsigned int *output_queue, + curandState *rng_states, float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, unsigned int *histories, int *last_hit_triangles, int max_steps) { @@ -303,7 +320,7 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c curandState rng = rng_states[id]; - int photon_id = photon_offsets[first_photon + id]; + int photon_id = input_queue[first_photon + id]; Photon p; p.position = positions[photon_id]; @@ -364,7 +381,12 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c times[photon_id] = p.time; histories[photon_id] = p.history; last_hit_triangles[photon_id] = p.last_hit_triangle; - + + // Not done, put photon in output queue + if ( (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) == 0) { + int out_idx = atomicAdd(output_queue, 1); + output_queue[out_idx] = photon_id; + } } // propagate } // extern "c" |