Diffstat:
 -rwxr-xr-x  benchmark.py      |  23
 -rw-r--r--  gpu.py            | 121
 -rw-r--r--  likelihood.py     |  10
 -rwxr-xr-x  sim.py            |  13
 -rw-r--r--  src/propagate.cu  |  38
 5 files changed, 174 insertions(+), 31 deletions(-)
diff --git a/benchmark.py b/benchmark.py
index 329986c..9f90c7b 100755
--- a/benchmark.py
+++ b/benchmark.py
@@ -91,7 +91,7 @@ def propagate(gpu_geometry, number=10, nphotons=500000, nthreads_per_block=64,
return nphotons/ufloat((np.mean(run_times),np.std(run_times)))
@tools.profile_if_possible
-def pdf(gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1,
+def pdf(gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=16, ndaq=1,
nthreads_per_block=64, max_blocks=1024):
"""
Returns the average number of 100 MeV events per second that can be
@@ -108,6 +108,9 @@ def pdf(gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1,
The number of 100 MeV events to generate for each PDF.
- nreps, int
The number of times to propagate each event and add to PDF
+ - ndaq, int
+ The number of times to run the DAQ simulation on the propagated
+ event and add it to the PDF.
"""
rng_states = gpu.get_rng_states(nthreads_per_block*max_blocks)
@@ -125,13 +128,15 @@ def pdf(gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1,
vertex_iter = itertools.islice(vertex_gen, nevents)
for ev in g4generator.generate_events(vertex_iter):
- for j in xrange(nreps):
- gpu_photons = gpu.GPUPhotons(ev.photons_beg)
- gpu_photons.propagate(gpu_geometry, rng_states,
- nthreads_per_block, max_blocks)
- gpu_channels = gpu_daq.acquire(gpu_photons, rng_states,
- nthreads_per_block, max_blocks)
- gpu_pdf.add_hits_to_pdf(gpu_channels, nthreads_per_block)
+ gpu_photons = gpu.GPUPhotons(ev.photons_beg, ncopies=nreps)
+
+ gpu_photons.propagate(gpu_geometry, rng_states,
+ nthreads_per_block, max_blocks)
+ for gpu_photon_slice in gpu_photons.iterate_copies():
+ for idaq in xrange(ndaq):
+ gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states,
+ nthreads_per_block, max_blocks)
+ gpu_pdf.add_hits_to_pdf(gpu_channels, nthreads_per_block)
hitcount, pdf = gpu_pdf.get_pdfs()
@@ -141,7 +146,7 @@ def pdf(gpu_geometry, max_pmt_id, npdfs=10, nevents=100, nreps=1,
# first kernel call incurs some driver overhead
run_times.append(elapsed)
- return nevents*nreps/ufloat((np.mean(run_times),np.std(run_times)))
+ return nevents*nreps*ndaq/ufloat((np.mean(run_times),np.std(run_times)))
if __name__ == '__main__':
from chroma import detectors
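
The figure returned by pdf() now counts every DAQ pass: each of the nevents events is propagated nreps times, and each propagated copy goes through the DAQ ndaq times. A minimal sketch of that accounting in plain NumPy, with made-up timings (the real code wraps the mean and spread of run_times in ufloat):

    import numpy as np

    nevents, nreps, ndaq = 100, 16, 32            # illustrative values only
    run_times = np.array([1.8, 1.9, 2.0])         # hypothetical seconds per pass over nevents
    events_per_second = nevents * nreps * ndaq / np.mean(run_times)
    print(events_per_second)                      # mean-only estimate; ufloat also carries np.std(run_times)
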
diff --git a/gpu.py b/gpu.py
index 671c872..a82453e 100644
--- a/gpu.py
+++ b/gpu.py
@@ -4,6 +4,7 @@ from copy import copy
from itertools import izip
import os
import sys
+import gc
import pytools
import pycuda.tools
@@ -13,7 +14,7 @@ import pycuda.driver as cuda
from pycuda import gpuarray as ga
import chroma.src
-from chroma.tools import timeit
+from chroma.tools import timeit, profile_if_possible
from chroma.geometry import standard_wavelengths
from chroma.color import map_to_color
from chroma import event
@@ -131,20 +132,63 @@ def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024):
first += elements_this_round
class GPUPhotons(object):
- def __init__(self, photons):
- self.pos = ga.to_gpu(to_float3(photons.pos))
- self.dir = ga.to_gpu(to_float3(photons.dir))
- self.pol = ga.to_gpu(to_float3(photons.pol))
- self.wavelengths = ga.to_gpu(photons.wavelengths.astype(np.float32))
- self.t = ga.to_gpu(photons.t.astype(np.float32))
- self.last_hit_triangles = ga.to_gpu(photons.last_hit_triangles.astype(np.int32))
- self.flags = ga.to_gpu(photons.flags.astype(np.uint32))
+ def __init__(self, photons, ncopies=1):
+ """Load ``photons`` onto the GPU, replicating as requested.
+
+ Args:
+ - photons: chroma.Event.Photons
+ Photon state information to load onto GPU
+ - ncopies: int, *optional*
+ Number of times to replicate the photons
+ on the GPU. This is used if you want
+ to propagate the same event many times,
+ for example in a likelihood calculation.
+
+ The amount of GPU storage will be proportionally
+ larger if ncopies > 1, so be careful.
+ """
+ nphotons = len(photons)
+ self.pos = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
+ self.dir = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
+ self.pol = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
+ self.wavelengths = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
+ self.t = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
+ self.last_hit_triangles = ga.empty(shape=nphotons*ncopies, dtype=np.int32)
+ self.flags = ga.empty(shape=nphotons*ncopies, dtype=np.uint32)
+
+        # Assign the provided photons to the beginning (possibly
+        # the entire array, if ncopies is 1)
+ self.pos[:nphotons].set(to_float3(photons.pos))
+ self.dir[:nphotons].set(to_float3(photons.dir))
+ self.pol[:nphotons].set(to_float3(photons.pol))
+ self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
+ self.t[:nphotons].set(photons.t.astype(np.float32))
+ self.last_hit_triangles[:nphotons].set(photons.last_hit_triangles.astype(np.int32))
+ self.flags[:nphotons].set(photons.flags.astype(np.uint32))
#cuda_options = ('--use_fast_math', '-w')#, '--ptxas-options=-v']
module = get_cu_module('propagate.cu', options=cuda_options)
self.gpu_funcs = GPUFuncs(module)
+ # Replicate the photons to the rest of the slots if needed
+ if ncopies > 1:
+ max_blocks = 1024
+ nthreads_per_block = 64
+ for first_photon, photons_this_round, blocks in \
+ chunk_iterator(nphotons, nthreads_per_block, max_blocks):
+ self.gpu_funcs.photon_duplicate(np.int32(first_photon), np.int32(photons_this_round),
+ self.pos, self.dir, self.wavelengths, self.pol, self.t,
+ self.flags, self.last_hit_triangles,
+ np.int32(ncopies-1),
+ np.int32(nphotons),
+ block=(nthreads_per_block,1,1), grid=(blocks, 1))
+
+
+ # Save the duplication information for the iterate_copies() method
+ self.true_nphotons = nphotons
+ self.ncopies = ncopies
+
def get(self):
pos = self.pos.get().view(np.float32).reshape((len(self.pos),3))
dir = self.dir.get().view(np.float32).reshape((len(self.dir),3))
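
The constructor lays the copies out end to end: the original photons occupy the first nphotons slots and copy i starts at offset i*nphotons, which is the stride handed to photon_duplicate. A hedged NumPy sketch of that layout for a single field (the kernel does the same for all seven photon arrays on the GPU):

    import numpy as np

    nphotons, ncopies = 4, 3
    t = np.empty(nphotons * ncopies, dtype=np.float32)
    t[:nphotons] = [0.0, 1.0, 2.0, 3.0]              # original photon times
    for i in range(1, ncopies):                      # mirrors the "ncopies - 1" copies made by the kernel
        t[i*nphotons:(i+1)*nphotons] = t[:nphotons]  # copy i lives at offset i*nphotons
    print(t.reshape(ncopies, nphotons))              # each row is one replica of the event
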
@@ -155,6 +199,20 @@ class GPUPhotons(object):
flags = self.flags.get()
return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags)
+ def iterate_copies(self):
+ '''Returns an iterator that yields GPUPhotonsSlice objects
+ corresponding to the event copies stored in ``self``.'''
+ for i in xrange(self.ncopies):
+ window = slice(self.true_nphotons*i, self.true_nphotons*(i+1))
+ yield GPUPhotonsSlice(pos=self.pos[window],
+ dir=self.dir[window],
+ pol=self.pol[window],
+ wavelengths=self.wavelengths[window],
+ t=self.t[window],
+ last_hit_triangles=self.last_hit_triangles[window],
+ flags=self.flags[window])
+
+ @profile_if_possible
def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
max_blocks=1024, max_steps=10):
"""Propagate photons on GPU to termination or max_steps, whichever
@@ -169,8 +227,11 @@ class GPUPhotons(object):
"""
nphotons = self.pos.size
step = 0
- input_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
- input_queue[1:] = np.arange(nphotons, dtype=np.uint32)
+ input_queue = np.empty(shape=nphotons+1, dtype=np.uint32)
+ input_queue[0] = 0
+ # Order photons initially in the queue to put the clones next to each other
+ for copy in xrange(self.ncopies):
+ input_queue[1+copy::self.ncopies] = np.arange(self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons
input_queue_gpu = ga.to_gpu(input_queue)
output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
output_queue[0] = 1
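
The loop above seeds the work queue so that all clones of one original photon sit next to each other (photon i and photon i + true_nphotons start from the same initial state). A short NumPy sketch of the resulting ordering, outside the diff, just to make the interleaving visible:

    import numpy as np

    true_nphotons, ncopies = 4, 2
    nphotons = true_nphotons * ncopies
    input_queue = np.empty(nphotons + 1, dtype=np.uint32)
    input_queue[0] = 0
    for copy in range(ncopies):
        input_queue[1+copy::ncopies] = np.arange(true_nphotons, dtype=np.uint32) + copy * true_nphotons
    print(input_queue[1:])   # [0 4 1 5 2 6 3 7] -- each photon is followed by its clone
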
@@ -199,6 +260,44 @@ class GPUPhotons(object):
if ga.max(self.flags).get() & (1 << 31):
print >>sys.stderr, "WARNING: ABORTED PHOTONS"
+ def __del__(self):
+ del self.pos
+ del self.dir
+ del self.pol
+ del self.wavelengths
+ del self.t
+ del self.flags
+ del self.last_hit_triangles
+        # Run the garbage collector so freed GPU memory becomes available again promptly
+ gc.collect()
+
+class GPUPhotonsSlice(GPUPhotons):
+ '''A `slice`-like view of a subrange of another GPU photons array.
+ Works exactly like an instance of GPUPhotons, but the GPU storage
+ is taken from another GPUPhotons instance.
+
+ Returned by the GPUPhotons.iterate_copies() iterator.'''
+ def __init__(self, pos, dir, pol, wavelengths, t, last_hit_triangles,
+ flags):
+ '''Create new object using slices of GPUArrays from an instance
+ of GPUPhotons. NOTE THESE ARE NOT CPU ARRAYS!'''
+ self.pos = pos
+ self.dir = dir
+ self.pol = pol
+ self.wavelengths = wavelengths
+ self.t = t
+ self.last_hit_triangles = last_hit_triangles
+ self.flags = flags
+
+ module = get_cu_module('propagate.cu', options=cuda_options)
+ self.gpu_funcs = GPUFuncs(module)
+
+ self.true_nphotons = len(pos)
+ self.ncopies = 1
+
+ def __del__(self):
+ pass # Do nothing, because we don't own any of our GPU memory
+
class GPUChannels(object):
def __init__(self, t, q, flags):
self.t = t
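
GPUPhotonsSlice leaves __del__ empty because its arrays are slices of the parent GPUPhotons' GPU storage; only the parent frees that memory (and then runs gc.collect() so it comes back quickly). The sharing behaviour is the same as for NumPy views, shown here only as a CPU analogy for the GPUArray slices:

    import numpy as np

    parent = np.arange(10, dtype=np.float32)
    view = parent[0:5]          # a view: no copy, same underlying buffer
    view[0] = 42.0
    assert parent[0] == 42.0    # writes through the view are visible to the parent
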
diff --git a/likelihood.py b/likelihood.py
index 51d7ef9..08b090f 100644
--- a/likelihood.py
+++ b/likelihood.py
@@ -43,13 +43,14 @@ class Likelihood(object):
self.event = event
@profile_if_possible
- def eval(self, vertex_generator, nevals, nreps=1):
+ def eval(self, vertex_generator, nevals, nreps=1, ndaq=1):
"""
Return the negative log likelihood that the detector event set in the
constructor or by set_event() was the result of a particle generated
by `vertex_generator`. If `nreps` is set to > 1, each set of photon
vertices will be propagated `nreps` times.
"""
+ ntotal = nevals * nreps * ndaq
vertex_generator = islice(vertex_generator, nevals)
@@ -59,11 +60,12 @@ class Likelihood(object):
2.0e-9, self.trange,
1, self.qrange,
nreps=nreps,
+ ndaq=ndaq,
time_only=self.time_only)
# Normalize probabilities and put a floor to keep the log finite
- hit_prob = hitcount.astype(np.float32) / (nreps * nevals)
- hit_prob = np.maximum(hit_prob, 0.5 / (nreps*nevals))
+ hit_prob = hitcount.astype(np.float32) / ntotal
+ hit_prob = np.maximum(hit_prob, 0.5 / ntotal)
# Set all zero or nan probabilities to limiting PDF value
bad_value = (pdf_prob <= 0.0) | np.isnan(pdf_prob)
@@ -79,7 +81,7 @@ class Likelihood(object):
# NLL calculation: note that negation is at the end
# Start with the probabilties of hitting (or not) the channels
hit_channel_prob = np.log(hit_prob[self.event.channels.hit]).sum() + np.log(1.0-hit_prob[~self.event.channels.hit]).sum()
- hit_channel_prob_uncert = ( (nevals * nreps * hit_prob * (1.0 - hit_prob)) / hit_prob**2 ).sum()**0.5
+ hit_channel_prob_uncert = ( (ntotal * hit_prob * (1.0 - hit_prob)) / hit_prob**2 ).sum()**0.5
log_likelihood = ufloat((hit_channel_prob, 0.0))
# Then include the probability densities of the observed
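
With the extra DAQ passes, every channel gets nevals * nreps * ndaq chances to be hit, so that product is the normalization for the hit probabilities and for the floor that keeps the logarithm finite. A short NumPy sketch with hypothetical hit counts:

    import numpy as np

    nevals, nreps, ndaq = 10, 16, 4
    ntotal = nevals * nreps * ndaq                   # 640 chances per channel
    hitcount = np.array([0, 3, 512])                 # hypothetical per-channel hit counts
    hit_prob = hitcount.astype(np.float32) / ntotal
    hit_prob = np.maximum(hit_prob, 0.5 / ntotal)    # floor keeps log(hit_prob) finite
    print(hit_prob)                                  # [0.00078125 0.0046875  0.8       ]
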
diff --git a/sim.py b/sim.py
index 7ba6472..7c9720a 100755
--- a/sim.py
+++ b/sim.py
@@ -119,7 +119,7 @@ class Simulation(object):
return self.gpu_pdf.get_pdfs()
- def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=20, nreps=1, time_only=True):
+ def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=20, nreps=1, ndaq=1, time_only=True):
"""Returns tuple: 1D array of channel hit counts, 1D array of PDF
probability densities."""
self.gpu_pdf.setup_pdf_eval(event_channels.hit,
@@ -137,16 +137,15 @@ class Simulation(object):
if isinstance(first_element, event.Event):
iterable = self.photon_generator.generate_events(iterable)
- if nreps > 1:
- iterable = repeating_iterator(iterable, nreps)
-
for ev in iterable:
- gpu_photons = gpu.GPUPhotons(ev.photons_beg)
+ gpu_photons = gpu.GPUPhotons(ev.photons_beg, ncopies=nreps)
gpu_photons.propagate(self.gpu_geometry, self.rng_states,
nthreads_per_block=self.nthreads_per_block,
max_blocks=self.max_blocks)
- gpu_channels = self.gpu_daq.acquire(gpu_photons, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
- self.gpu_pdf.accumulate_pdf_eval(gpu_channels)
+ for gpu_photon_slice in gpu_photons.iterate_copies():
+ for idaq in xrange(ndaq):
+ gpu_channels = self.gpu_daq.acquire(gpu_photon_slice, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
+ self.gpu_pdf.accumulate_pdf_eval(gpu_channels)
return self.gpu_pdf.get_pdf_eval()
diff --git a/src/propagate.cu b/src/propagate.cu
index 2d5183e..ecc32c0 100644
--- a/src/propagate.cu
+++ b/src/propagate.cu
@@ -9,6 +9,44 @@ extern "C"
{
__global__ void
+photon_duplicate(int first_photon, int nthreads,
+ float3 *positions, float3 *directions,
+ float *wavelengths, float3 *polarizations,
+ float *times, unsigned int *histories,
+ int *last_hit_triangles,
+ int copies, int stride)
+{
+ int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+ if (id >= nthreads)
+ return;
+
+ int photon_id = first_photon + id;
+
+ Photon p;
+ p.position = positions[photon_id];
+ p.direction = directions[photon_id];
+ p.polarization = polarizations[photon_id];
+ p.wavelength = wavelengths[photon_id];
+ p.time = times[photon_id];
+ p.last_hit_triangle = last_hit_triangles[photon_id];
+ p.history = histories[photon_id];
+
+ for (int i=1; i <= copies; i++) {
+ int target_photon_id = photon_id + stride * i;
+
+ positions[target_photon_id] = p.position;
+ directions[target_photon_id] = p.direction;
+ polarizations[target_photon_id] = p.polarization;
+ wavelengths[target_photon_id] = p.wavelength;
+ times[target_photon_id] = p.time;
+ last_hit_triangles[target_photon_id] = p.last_hit_triangle;
+ histories[target_photon_id] = p.history;
+ }
+}
+
+
+__global__ void
propagate(int first_photon, int nthreads, unsigned int *input_queue,
unsigned int *output_queue, curandState *rng_states,
float3 *positions, float3 *directions,
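
For reference, a hedged Python rendering of what one photon_duplicate thread does for a single field (the kernel repeats this for all seven photon arrays); this is only a CPU model for checking the stride arithmetic, not part of the change:

    def photon_duplicate_cpu(first_photon, nthreads, times, copies, stride):
        # One loop iteration stands in for one CUDA thread.
        for thread_id in range(nthreads):
            photon_id = first_photon + thread_id
            t = times[photon_id]                    # load the source photon's state
            for i in range(1, copies + 1):          # fill the extra copies
                times[photon_id + stride * i] = t   # same offset rule as the kernel

    times = list(range(4)) + [None] * 8             # 4 originals plus room for 2 copies
    photon_duplicate_cpu(0, 4, times, copies=2, stride=4)
    print(times)                                    # [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]
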