author     Stan Seibert <stan@mtrr.org>   2011-09-10 15:05:49 -0400
committer  Stan Seibert <stan@mtrr.org>   2011-09-10 15:05:49 -0400
commit     4c554d621d9b66c595397a0667d212bc4ea57be1 (patch)
tree       dc8b255220e05029799a2e6fc80a907fad7ee381
parent     655c354c9fc0dce41c158570ee7f1ec7df337922 (diff)
download   chroma-4c554d621d9b66c595397a0667d212bc4ea57be1.tar.gz
           chroma-4c554d621d9b66c595397a0667d212bc4ea57be1.tar.bz2
           chroma-4c554d621d9b66c595397a0667d212bc4ea57be1.zip
Add the ability to propagate the same photons multiple times on the
GPU, and to run the DAQ multiple times on the same photons in a
likelihood calculation.

Propagating copies of the same photon next to each other in a warp
speeds up propagation by a factor of 3 (and we could do even better
if we wanted), and it improves the statistics in a likelihood
evaluation quite a bit.  Running the DAQ multiple times is also an
inexpensive way to improve the quality of the PDF estimates.
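In outline, the new PDF-filling loop works like this (a condensed sketch of
the benchmark.py hunk below, not standalone code: ev, nreps, ndaq,
rng_states, the gpu_* helper objects, and the block-size settings are
assumed to be set up as in that file):

    from chroma import gpu

    # Load nreps copies of the event's photons onto the GPU at once and
    # propagate them all in a single pass.
    gpu_photons = gpu.GPUPhotons(ev.photons_beg, ncopies=nreps)
    gpu_photons.propagate(gpu_geometry, rng_states,
                          nthreads_per_block, max_blocks)

    # Each propagated copy is exposed as a GPUPhotonsSlice view, and the
    # DAQ simulation is rerun ndaq times on every copy.
    for gpu_photon_slice in gpu_photons.iterate_copies():
        for idaq in xrange(ndaq):
            gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states,
                                           nthreads_per_block, max_blocks)
            gpu_pdf.add_hits_to_pdf(gpu_channels, nthreads_per_block)

Each event thus contributes nreps*ndaq entries to the PDF for a single
Geant4 generation and a single propagation pass.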
-rwxr-xr-x  benchmark.py        23
-rw-r--r--  gpu.py             121
-rw-r--r--  likelihood.py       10
-rwxr-xr-x  sim.py              13
-rw-r--r--  src/propagate.cu    38

5 files changed, 174 insertions(+), 31 deletions(-)
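The statistics gain shows up in the likelihood.py hunk below: each channel's
hit probability is now estimated from nevals*nreps*ndaq samples instead of
nevals*nreps.  A toy check of the new normalization (the hit counts are made
up for illustration):

    import numpy as np

    nevals, nreps, ndaq = 10, 16, 4
    ntotal = nevals * nreps * ndaq          # 640 samples per channel

    hitcount = np.array([320, 64, 0])       # hypothetical per-channel tallies
    hit_prob = hitcount.astype(np.float32) / ntotal
    # The floor keeps log(hit_prob) finite for channels that were never hit
    hit_prob = np.maximum(hit_prob, 0.5 / ntotal)
    print hit_prob                          # 0.5, 0.1, 0.00078125 (the floor)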
diff --git a/benchmark.py b/benchmark.py
index 329986c..9f90c7b 100755
--- a/benchmark.py
+++ b/benchmark.py
@@ -91,7 +91,7 @@ def propagate(gpu_geometry, number=10, nphotons=500000, nthreads_per_block=64,
     return nphotons/ufloat((np.mean(run_times),np.std(run_times)))
 
 @tools.profile_if_possible
-def pdf(gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1,
+def pdf(gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=16, ndaq=1,
         nthreads_per_block=64, max_blocks=1024):
     """
     Returns the average number of 100 MeV events per second that can be
@@ -108,6 +108,9 @@ def pdf(gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1,
         The number of 100 MeV events to generate for each PDF.
       - nreps, int
         The number of times to propagate each event and add to PDF
+      - ndaq, int
+        The number of times to run the DAQ simulation on the propagated
+        event and add it to the PDF.
     """
     rng_states = gpu.get_rng_states(nthreads_per_block*max_blocks)
@@ -125,13 +128,15 @@ def pdf(gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1,
         vertex_iter = itertools.islice(vertex_gen, nevents)
 
         for ev in g4generator.generate_events(vertex_iter):
-            for j in xrange(nreps):
-                gpu_photons = gpu.GPUPhotons(ev.photons_beg)
-                gpu_photons.propagate(gpu_geometry, rng_states,
-                                      nthreads_per_block, max_blocks)
-                gpu_channels = gpu_daq.acquire(gpu_photons, rng_states,
-                                               nthreads_per_block, max_blocks)
-                gpu_pdf.add_hits_to_pdf(gpu_channels, nthreads_per_block)
+            gpu_photons = gpu.GPUPhotons(ev.photons_beg, ncopies=nreps)
+
+            gpu_photons.propagate(gpu_geometry, rng_states,
+                                  nthreads_per_block, max_blocks)
+            for gpu_photon_slice in gpu_photons.iterate_copies():
+                for idaq in xrange(ndaq):
+                    gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states,
+                                                   nthreads_per_block, max_blocks)
+                    gpu_pdf.add_hits_to_pdf(gpu_channels, nthreads_per_block)
 
         hitcount, pdf = gpu_pdf.get_pdfs()
@@ -141,7 +146,7 @@ def pdf(gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1,
             # first kernel call incurs some driver overhead
             run_times.append(elapsed)
 
-    return nevents*nreps/ufloat((np.mean(run_times),np.std(run_times)))
+    return nevents*nreps*ndaq/ufloat((np.mean(run_times),np.std(run_times)))
 
 if __name__ == '__main__':
     from chroma import detectors
diff --git a/gpu.py b/gpu.py
--- a/gpu.py
+++ b/gpu.py
@@ -4,6 +4,7 @@ from copy import copy
 from itertools import izip
 import os
 import sys
+import gc
 
 import pytools
 import pycuda.tools
@@ -13,7 +14,7 @@ import pycuda.driver as cuda
 from pycuda import gpuarray as ga
 
 import chroma.src
-from chroma.tools import timeit
+from chroma.tools import timeit, profile_if_possible
 from chroma.geometry import standard_wavelengths
 from chroma.color import map_to_color
 from chroma import event
@@ -131,20 +132,63 @@ def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024):
         first += elements_this_round
 
 class GPUPhotons(object):
-    def __init__(self, photons):
-        self.pos = ga.to_gpu(to_float3(photons.pos))
-        self.dir = ga.to_gpu(to_float3(photons.dir))
-        self.pol = ga.to_gpu(to_float3(photons.pol))
-        self.wavelengths = ga.to_gpu(photons.wavelengths.astype(np.float32))
-        self.t = ga.to_gpu(photons.t.astype(np.float32))
-        self.last_hit_triangles = ga.to_gpu(photons.last_hit_triangles.astype(np.int32))
-        self.flags = ga.to_gpu(photons.flags.astype(np.uint32))
+    def __init__(self, photons, ncopies=1):
+        """Load ``photons`` onto the GPU, replicating as requested.
+
+        Args:
+            - photons: chroma.Event.Photons
+                Photon state information to load onto GPU
+            - ncopies: int, *optional*
+                Number of times to replicate the photons
+                on the GPU.  This is used if you want
+                to propagate the same event many times,
+                for example in a likelihood calculation.
+
+                The amount of GPU storage will be proportionally
+                larger if ncopies > 1, so be careful.
+        """
+        nphotons = len(photons)
+        self.pos = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
+        self.dir = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
+        self.pol = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
+        self.wavelengths = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
+        self.t = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
+        self.last_hit_triangles = ga.empty(shape=nphotons*ncopies, dtype=np.int32)
+        self.flags = ga.empty(shape=nphotons*ncopies, dtype=np.uint32)
+
+        # Assign the provided photons to the beginning (possibly
+        # the entire array if ncopies is 1
+        self.pos[:nphotons].set(to_float3(photons.pos))
+        self.dir[:nphotons].set(to_float3(photons.dir))
+        self.pol[:nphotons].set(to_float3(photons.pol))
+        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
+        self.t[:nphotons].set(photons.t.astype(np.float32))
+        self.last_hit_triangles[:nphotons].set(photons.last_hit_triangles.astype(np.int32))
+        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
 
         #cuda_options = ('--use_fast_math', '-w')#, '--ptxas-options=-v']
 
         module = get_cu_module('propagate.cu', options=cuda_options)
         self.gpu_funcs = GPUFuncs(module)
 
+        # Replicate the photons to the rest of the slots if needed
+        if ncopies > 1:
+            max_blocks = 1024
+            nthreads_per_block = 64
+            for first_photon, photons_this_round, blocks in \
+                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
+                self.gpu_funcs.photon_duplicate(np.int32(first_photon), np.int32(photons_this_round),
+                                                self.pos, self.dir, self.wavelengths, self.pol, self.t,
+                                                self.flags, self.last_hit_triangles,
+                                                np.int32(ncopies-1),
+                                                np.int32(nphotons),
+                                                block=(nthreads_per_block,1,1), grid=(blocks, 1))
+
+
+        # Save the duplication information for the iterate_copies() method
+        self.true_nphotons = nphotons
+        self.ncopies = ncopies
+
     def get(self):
         pos = self.pos.get().view(np.float32).reshape((len(self.pos),3))
         dir = self.dir.get().view(np.float32).reshape((len(self.dir),3))
@@ -155,6 +199,20 @@ class GPUPhotons(object):
         flags = self.flags.get()
         return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags)
 
+    def iterate_copies(self):
+        '''Returns an iterator that yields GPUPhotonsSlice objects
+        corresponding to the event copies stored in ``self``.'''
+        for i in xrange(self.ncopies):
+            window = slice(self.true_nphotons*i, self.true_nphotons*(i+1))
+            yield GPUPhotonsSlice(pos=self.pos[window],
+                                  dir=self.dir[window],
+                                  pol=self.pol[window],
+                                  wavelengths=self.wavelengths[window],
+                                  t=self.t[window],
+                                  last_hit_triangles=self.last_hit_triangles[window],
+                                  flags=self.flags[window])
+
+    @profile_if_possible
     def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
                   max_blocks=1024, max_steps=10):
         """Propagate photons on GPU to termination or max_steps, whichever
@@ -169,8 +227,11 @@ class GPUPhotons(object):
         """
         nphotons = self.pos.size
         step = 0
-        input_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
-        input_queue[1:] = np.arange(nphotons, dtype=np.uint32)
+        input_queue = np.empty(shape=nphotons+1, dtype=np.uint32)
+        input_queue[0] = 0
+        # Order photons initially in the queue to put the clones next to each other
+        for copy in xrange(self.ncopies):
+            input_queue[1+copy::self.ncopies] = np.arange(self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons
         input_queue_gpu = ga.to_gpu(input_queue)
         output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
         output_queue[0] = 1
@@ -199,6 +260,44 @@ class GPUPhotons(object):
         if ga.max(self.flags).get() & (1 << 31):
             print >>sys.stderr, "WARNING: ABORTED PHOTONS"
 
+    def __del__(self):
+        del self.pos
+        del self.dir
+        del self.pol
+        del self.wavelengths
+        del self.t
+        del self.flags
+        del self.last_hit_triangles
+        # Free up GPU memory quickly if now available
+        gc.collect()
+
+class GPUPhotonsSlice(GPUPhotons):
+    '''A `slice`-like view of a subrange of another GPU photons array.
+    Works exactly like an instance of GPUPhotons, but the GPU storage
+    is taken from another GPUPhotons instance.
+
+    Returned by the GPUPhotons.iterate_copies() iterator.'''
+    def __init__(self, pos, dir, pol, wavelengths, t, last_hit_triangles,
+                 flags):
+        '''Create new object using slices of GPUArrays from an instance
+        of GPUPhotons.  NOTE THESE ARE NOT CPU ARRAYS!'''
+        self.pos = pos
+        self.dir = dir
+        self.pol = pol
+        self.wavelengths = wavelengths
+        self.t = t
+        self.last_hit_triangles = last_hit_triangles
+        self.flags = flags
+
+        module = get_cu_module('propagate.cu', options=cuda_options)
+        self.gpu_funcs = GPUFuncs(module)
+
+        self.true_nphotons = len(pos)
+        self.ncopies = 1
+
+    def __del__(self):
+        pass # Do nothing, because we don't own any of our GPU memory
+
 class GPUChannels(object):
     def __init__(self, t, q, flags):
         self.t = t
diff --git a/likelihood.py b/likelihood.py
index 51d7ef9..08b090f 100644
--- a/likelihood.py
+++ b/likelihood.py
@@ -43,13 +43,14 @@ class Likelihood(object):
         self.event = event
 
     @profile_if_possible
-    def eval(self, vertex_generator, nevals, nreps=1):
+    def eval(self, vertex_generator, nevals, nreps=1, ndaq=1):
         """
         Return the negative log likelihood that the detector event set in the
         constructor or by set_event() was the result of a particle generated
         by `vertex_generator`.  If `nreps` set to > 1, each set of photon
         vertices will be propagated `nreps` times.
         """
+        ntotal = nevals * nreps * ndaq
 
         vertex_generator = islice(vertex_generator, nevals)
@@ -59,11 +60,12 @@ class Likelihood(object):
                                                  2.0e-9, self.trange,
                                                  1, self.qrange,
                                                  nreps=nreps,
+                                                 ndaq=ndaq,
                                                  time_only=self.time_only)
 
         # Normalize probabilities and put a floor to keep the log finite
-        hit_prob = hitcount.astype(np.float32) / (nreps * nevals)
-        hit_prob = np.maximum(hit_prob, 0.5 / (nreps*nevals))
+        hit_prob = hitcount.astype(np.float32) / ntotal
+        hit_prob = np.maximum(hit_prob, 0.5 / ntotal)
 
         # Set all zero or nan probabilities to limiting PDF value
         bad_value = (pdf_prob <= 0.0) | np.isnan(pdf_prob)
@@ -79,7 +81,7 @@ class Likelihood(object):
         # NLL calculation: note that negation is at the end
         # Start with the probabilties of hitting (or not) the channels
         hit_channel_prob = np.log(hit_prob[self.event.channels.hit]).sum() + np.log(1.0-hit_prob[~self.event.channels.hit]).sum()
-        hit_channel_prob_uncert = ( (nevals * nreps * hit_prob * (1.0 - hit_prob)) / hit_prob**2 ).sum()**0.5
+        hit_channel_prob_uncert = ( (ntotal * hit_prob * (1.0 - hit_prob)) / hit_prob**2 ).sum()**0.5
         log_likelihood = ufloat((hit_channel_prob, 0.0))
 
         # Then include the probability densities of the observed
diff --git a/sim.py b/sim.py
--- a/sim.py
+++ b/sim.py
@@ -119,7 +119,7 @@ class Simulation(object):
 
         return self.gpu_pdf.get_pdfs()
 
-    def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=20, nreps=1, time_only=True):
+    def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=20, nreps=1, ndaq=1, time_only=True):
         """Returns tuple: 1D array of channel hit counts, 1D array of PDF
         probability densities."""
         self.gpu_pdf.setup_pdf_eval(event_channels.hit,
@@ -137,16 +137,15 @@ class Simulation(object):
         if isinstance(first_element, event.Event):
             iterable = self.photon_generator.generate_events(iterable)
 
-        if nreps > 1:
-            iterable = repeating_iterator(iterable, nreps)
-
         for ev in iterable:
-            gpu_photons = gpu.GPUPhotons(ev.photons_beg)
+            gpu_photons = gpu.GPUPhotons(ev.photons_beg, ncopies=nreps)
             gpu_photons.propagate(self.gpu_geometry, self.rng_states,
                                   nthreads_per_block=self.nthreads_per_block,
                                   max_blocks=self.max_blocks)
-            gpu_channels = self.gpu_daq.acquire(gpu_photons, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
-            self.gpu_pdf.accumulate_pdf_eval(gpu_channels)
+            for gpu_photon_slice in gpu_photons.iterate_copies():
+                for idaq in xrange(ndaq):
+                    gpu_channels = self.gpu_daq.acquire(gpu_photon_slice, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
+                    self.gpu_pdf.accumulate_pdf_eval(gpu_channels)
 
         return self.gpu_pdf.get_pdf_eval()
diff --git a/src/propagate.cu b/src/propagate.cu
index 2d5183e..ecc32c0 100644
--- a/src/propagate.cu
+++ b/src/propagate.cu
@@ -9,6 +9,44 @@ extern "C"
 {
 
 __global__ void
+photon_duplicate(int first_photon, int nthreads,
+                 float3 *positions, float3 *directions,
+                 float *wavelengths, float3 *polarizations,
+                 float *times, unsigned int *histories,
+                 int *last_hit_triangles,
+                 int copies, int stride)
+{
+    int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (id >= nthreads)
+        return;
+
+    int photon_id = first_photon + id;
+
+    Photon p;
+    p.position = positions[photon_id];
+    p.direction = directions[photon_id];
+    p.polarization = polarizations[photon_id];
+    p.wavelength = wavelengths[photon_id];
+    p.time = times[photon_id];
+    p.last_hit_triangle = last_hit_triangles[photon_id];
+    p.history = histories[photon_id];
+
+    for (int i=1; i <= copies; i++) {
+        int target_photon_id = photon_id + stride * i;
+
+        positions[target_photon_id] = p.position;
+        directions[target_photon_id] = p.direction;
+        polarizations[target_photon_id] = p.polarization;
+        wavelengths[target_photon_id] = p.wavelength;
+        times[target_photon_id] = p.time;
+        last_hit_triangles[target_photon_id] = p.last_hit_triangle;
+        histories[target_photon_id] = p.history;
+    }
+}
+
+
+__global__ void
 propagate(int first_photon, int nthreads, unsigned int *input_queue,
           unsigned int *output_queue, curandState *rng_states,
           float3 *positions, float3 *directions,
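The factor-of-3 speedup quoted in the commit message comes from the clone
ordering set up in GPUPhotons.propagate() above: the input queue interleaves
the copies so that clones of the same photon are adjacent, and the threads of
a warp therefore trace identical photon histories.  A toy illustration of
that queue layout, with made-up sizes (4 photons, 3 copies):

    import numpy as np

    true_nphotons, ncopies = 4, 3           # made-up sizes for illustration
    nphotons = true_nphotons * ncopies

    input_queue = np.empty(shape=nphotons+1, dtype=np.uint32)
    input_queue[0] = 0
    for copy in xrange(ncopies):
        input_queue[1+copy::ncopies] = \
            np.arange(true_nphotons, dtype=np.uint32) + copy * true_nphotons

    print input_queue[1:]
    # [ 0  4  8  1  5  9  2  6 10  3  7 11]
    # photon_duplicate() stores copy i of photon p at index p + i*stride
    # (stride = true_nphotons), so entries (0, 4, 8) are the three clones
    # of photon 0, adjacent in the queue and hence in the same warp.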