diff options
-rw-r--r-- | chroma/cuda/propagate.cu | 62 | ||||
-rw-r--r-- | chroma/gpu/photon.py | 56 |
2 files changed, 117 insertions, 1 deletions
diff --git a/chroma/cuda/propagate.cu b/chroma/cuda/propagate.cu index fdcf532..87ba9f5 100644 --- a/chroma/cuda/propagate.cu +++ b/chroma/cuda/propagate.cu @@ -45,7 +45,67 @@ photon_duplicate(int first_photon, int nthreads, histories[target_photon_id] = p.history; } } - + +__global__ void +count_photons(int first_photon, int nthreads, unsigned int target_flag, + unsigned int *index_counter, + unsigned int *histories) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x; + __shared__ unsigned int counter; + + if (threadIdx.x == 0) + counter = 0; + __syncthreads(); + + if (id < nthreads) { + int photon_id = first_photon + id; + + if (histories[photon_id] & target_flag) { + atomicAdd(&counter, 1); + } + + } + + __syncthreads(); + + if (threadIdx.x == 0) + atomicAdd(index_counter, counter); +} + +__global__ void +copy_photons(int first_photon, int nthreads, unsigned int target_flag, + unsigned int *index_counter, + float3 *positions, float3 *directions, + float *wavelengths, float3 *polarizations, + float *times, unsigned int *histories, + int *last_hit_triangles, + float3 *new_positions, float3 *new_directions, + float *new_wavelengths, float3 *new_polarizations, + float *new_times, unsigned int *new_histories, + int *new_last_hit_triangles) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x; + + if (id >= nthreads) + return; + + int photon_id = first_photon + id; + + if (histories[photon_id] & target_flag) { + int offset = atomicAdd(index_counter, 1); + + new_positions[offset] = positions[photon_id]; + new_directions[offset] = directions[photon_id]; + new_polarizations[offset] = polarizations[photon_id]; + new_wavelengths[offset] = wavelengths[photon_id]; + new_times[offset] = times[photon_id]; + new_histories[offset] = histories[photon_id]; + new_last_hit_triangles[offset] = last_hit_triangles[photon_id]; + } +} + + __global__ void propagate(int first_photon, int nthreads, unsigned int *input_queue, unsigned int *output_queue, curandState *rng_states, diff --git a/chroma/gpu/photon.py b/chroma/gpu/photon.py index 2a9a1ff..f55b8e8 100644 --- a/chroma/gpu/photon.py +++ b/chroma/gpu/photon.py @@ -2,6 +2,7 @@ import numpy as np import sys import gc from pycuda import gpuarray as ga +import pycuda.driver as cuda from chroma.tools import profile_if_possible from chroma import event @@ -135,6 +136,57 @@ class GPUPhotons(object): if ga.max(self.flags).get() & (1 << 31): print >>sys.stderr, "WARNING: ABORTED PHOTONS" + cuda.Context.get_current().synchronize() + + + @profile_if_possible + def select(self, target_flag, nthreads_per_block=64, max_blocks=1024, + start_photon=None, nphotons=None): + '''Return a new GPUPhoton object containing only photons that + have a particular bit set in their history word.''' + cuda.Context.get_current().synchronize() + index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32) + cuda.Context.get_current().synchronize() + if start_photon is None: + start_photon = 0 + if nphotons is None: + nphotons = self.pos.size - start_photon + + # First count how much space we need + for first_photon, photons_this_round, blocks in \ + chunk_iterator(nphotons, nthreads_per_block, max_blocks): + self.gpu_funcs.count_photons(np.int32(start_photon+first_photon), + np.int32(photons_this_round), + np.uint32(target_flag), + index_counter_gpu, self.flags, + block=(nthreads_per_block,1,1), + grid=(blocks, 1)) + cuda.Context.get_current().synchronize() + reduced_nphotons = int(index_counter_gpu.get()[0]) + # Then allocate new storage space + pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3) + dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3) + pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3) + wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32) + t = ga.empty(shape=reduced_nphotons, dtype=np.float32) + last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32) + flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32) + + # And finaly copy photons, if there are any + if reduced_nphotons > 0: + index_counter_gpu.fill(0) + for first_photon, photons_this_round, blocks in \ + chunk_iterator(nphotons, nthreads_per_block, max_blocks): + self.gpu_funcs.copy_photons(np.int32(start_photon+first_photon), + np.int32(photons_this_round), + np.uint32(target_flag), + index_counter_gpu, + self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, + pos, dir, wavelengths, pol, t, flags, last_hit_triangles, + block=(nthreads_per_block,1,1), + grid=(blocks, 1)) + assert index_counter_gpu.get()[0] == reduced_nphotons + return GPUPhotonsSlice(pos, dir, pol, wavelengths, t, last_hit_triangles, flags) def __del__(self): del self.pos @@ -147,6 +199,10 @@ class GPUPhotons(object): # Free up GPU memory quickly if now available gc.collect() + + def __len__(self): + return self.pos.size + class GPUPhotonsSlice(GPUPhotons): '''A `slice`-like view of a subrange of another GPU photons array. Works exactly like an instance of GPUPhotons, but the GPU storage |