diff options
| -rw-r--r-- | gpu.py | 76 | ||||
| -rw-r--r-- | src/kernel.cu | 28 | 
2 files changed, 86 insertions, 18 deletions
@@ -15,6 +15,26 @@ from color import map_to_color  cuda.init() +def boolean_argsort(condition): +    '''Returns two arrays of indicies indicating which elements of the +    boolean array `condition` should be exchanged so that all of the +    True entries occur before the false entries. + +       Returns: tuple (a,b, ntrue) which give the indices for elements +       to exchange in a and b, and the number of true entries in the array +       in ntrue.  This function in general requires fewer swaps than  +       numpy.argsort. +    ''' + +    true_indices = condition.nonzero()[0][::-1] # reverse +    false_indices = (~condition).nonzero()[0] +     +    length = min(len(true_indices), len(false_indices)) +     +    cut_index = np.searchsorted((true_indices[:length] - false_indices[:length]) <= 0, True) +    return (true_indices[:cut_index], false_indices[:cut_index], len(true_indices)) + +  def chunk_iterator(nelements, nthreads_per_block, max_blocks):      '''Iterator that yields tuples with the values requried to process      a long array in multiple kernel passes on the GPU. @@ -27,7 +47,7 @@ def chunk_iterator(nelements, nthreads_per_block, max_blocks):      first = 0      while first < nelements:          elements_left = nelements - first -        blocks = elements_left // nthreads_per_block +        blocks = int(elements_left // nthreads_per_block)          if elements_left % nthreads_per_block != 0:              blocks += 1 #Round up only if needed          blocks = min(max_blocks, blocks) @@ -80,7 +100,7 @@ class GPU(object):          self.module = SourceModule(src.kernel, options=cuda_options, no_extern_c=True)          self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids']) -        self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate']) +        self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate', 'swap'])          self.nthread_per_block = 64          self.max_blocks = 1024          self.daq_module = SourceModule(src.daq, options=cuda_options, no_extern_c=True) @@ -230,7 +250,6 @@ class GPU(object):          self.polarizations_gpu = gpuarray.to_gpu(pol.astype(np.float32).view(gpuarray.vec.float3))          self.wavelengths_gpu = gpuarray.to_gpu(wavelength.astype(np.float32))          self.times_gpu = gpuarray.to_gpu(t0.astype(np.float32)) -        self.photon_offsets_gpu = gpuarray.arange(self.nphotons, dtype=np.int32)          if histories is not None:              self.histories_gpu = gpuarray.to_gpu(histories.astype(np.uint32)) @@ -250,18 +269,45 @@ class GPU(object):          May be called repeatedly without reloading photon information if single-stepping          through photon history.          ''' -        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks): -            self.prop_funcs.propagate(np.int32(first_photon), -                                      np.int32(photons_this_round), -                                      self.photon_offsets_gpu, -                                      self.rng_states_gpu, self.positions_gpu,  -                                      self.directions_gpu,  -                                      self.wavelengths_gpu, self.polarizations_gpu,  -                                      self.times_gpu, self.histories_gpu,  -                                      self.last_hit_triangles_gpu,  -                                      np.int32(max_steps),  -                                      block=(self.nthread_per_block,1,1), -                                      grid=(blocks, 1)) +        nphotons = self.nphotons +        step = 0 +        input_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32) +        input_queue[1:] = np.arange(self.nphotons, dtype=np.uint32) +        input_queue_gpu = gpuarray.to_gpu(input_queue) +        output_queue = np.zeros(shape=self.nphotons+1, dtype=np.uint32) +        output_queue[0] = 1 +        output_queue_gpu = gpuarray.to_gpu(output_queue) + + +        while step < max_steps: +            ## Just finish the rest of the steps if the # of photons is low +            if nphotons < self.nthread_per_block * 16 * 8: +                nsteps = max_steps - step +            else: +                nsteps = 1 + +            for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthread_per_block, self.max_blocks): +                self.prop_funcs.propagate(np.int32(first_photon), +                                          np.int32(photons_this_round), +                                          input_queue_gpu[1:], +                                          output_queue_gpu, +                                          self.rng_states_gpu, self.positions_gpu,  +                                          self.directions_gpu,  +                                          self.wavelengths_gpu, self.polarizations_gpu,  +                                          self.times_gpu, self.histories_gpu,  +                                          self.last_hit_triangles_gpu,  +                                          np.int32(nsteps),  +                                          block=(self.nthread_per_block,1,1), +                                          grid=(blocks, 1)) +         +            step += nsteps + +            if step < max_steps: +                temp = input_queue_gpu +                input_queue_gpu = output_queue_gpu +                output_queue_gpu = temp +                output_queue_gpu[:1].set(np.uint32(1)) +                nphotons = input_queue_gpu[:1].get()[0]          if 'profile' in __builtins__:              self.context.synchronize() diff --git a/src/kernel.cu b/src/kernel.cu index a26460a..cc567b8 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -292,7 +292,24 @@ __global__ void ray_trace_alpha(int nthreads, float3 *positions, float3 *directi  } // ray_trace -__global__ void propagate(int first_photon, int nthreads, int *photon_offsets, curandState *rng_states,  +__global__ void swap(int *values, int nswap, int *offset_a, int *offset_b) +{ +  int id = blockIdx.x*blockDim.x + threadIdx.x; + +  if (id < nswap) { +    int a = offset_a[id]; +    int b = offset_b[id]; + +    int tmp = values[a]; +    values[a] = values[b]; +    values[b] = tmp; +  } +} + +__global__ void propagate(int first_photon, int nthreads,  +			  unsigned int *input_queue,  +			  unsigned int *output_queue, +			  curandState *rng_states,   			  float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times,   			  unsigned int *histories, int *last_hit_triangles, int max_steps)  { @@ -303,7 +320,7 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c  	curandState rng = rng_states[id]; -	int photon_id = photon_offsets[first_photon + id]; +	int photon_id = input_queue[first_photon + id];  	Photon p;  	p.position = positions[photon_id]; @@ -364,7 +381,12 @@ __global__ void propagate(int first_photon, int nthreads, int *photon_offsets, c  	times[photon_id] = p.time;  	histories[photon_id] = p.history;  	last_hit_triangles[photon_id] = p.last_hit_triangle; - +	 +	// Not done, put photon in output queue +	if ( (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB)) == 0) { +	  int out_idx = atomicAdd(output_queue, 1); +	  output_queue[out_idx] = photon_id; +	}  } // propagate  } // extern "c"  | 
