7 files changed, 349 insertions, 122 deletions
diff --git a/chroma/benchmark.py b/chroma/benchmark.py
index 44cf44b..06a8eba 100755
--- a/chroma/benchmark.py
+++ b/chroma/benchmark.py
@@ -146,6 +146,7 @@ def pdf(gpu_detector, npdfs=10, nevents=100, nreps=16, ndaq=1,
 
     return nevents*nreps*ndaq/ufloat((np.mean(run_times),np.std(run_times)))
 
+@tools.profile_if_possible
 def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128,
              nthreads_per_block=64, max_blocks=1024):
     """
@@ -179,6 +180,7 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128,
     data_ev_channels = gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks).get()
     
     # Setup PDF evaluation
+    gpu_daq = gpu.GPUDaq(gpu_detector, ndaq=ndaq)
     gpu_pdf = gpu.GPUPDF()
     gpu_pdf.setup_pdf_eval(data_ev_channels.hit,
                            data_ev_channels.t,
@@ -205,10 +207,10 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128,
             gpu_photons.propagate(gpu_detector, rng_states,
                                   nthreads_per_block, max_blocks)
             for gpu_photon_slice in gpu_photons.iterate_copies():
-                for idaq in xrange(ndaq):
-                    gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states,
-                                                   nthreads_per_block, max_blocks)
-                    gpu_pdf.accumulate_pdf_eval(gpu_channels, nthreads_per_block)
+                gpu_photon_slice = gpu_photon_slice.select(event.SURFACE_DETECT)
+                gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states,
+                                               nthreads_per_block, max_blocks)
+                gpu_pdf.accumulate_pdf_eval(gpu_channels, nthreads_per_block)
 
         cuda.Context.get_current().synchronize()        
         elapsed = time.time() - t0
@@ -224,6 +226,8 @@ if __name__ == '__main__':
     from chroma import demo
     import gc
 
+    tools.enable_debug_on_crash()
+
     # Default to run all tests
     tests = ['ray', 'load', 'propagate', 'pdf', 'pdf_eval']
     if len(sys.argv) > 1:
diff --git a/chroma/cuda/daq.cu b/chroma/cuda/daq.cu
index b995bfb..3ed402d 100644
--- a/chroma/cuda/daq.cu
+++ b/chroma/cuda/daq.cu
@@ -79,6 +79,65 @@ run_daq(curandState *s, unsigned int detection_state,
 }
 
 __global__ void
+run_daq_many(curandState *s, unsigned int detection_state,
+	     int first_photon, int nphotons, float *photon_times,
+	     unsigned int *photon_histories, int *last_hit_triangles,
+	     int *solid_map,
+	     Detector *detector,
+	     unsigned int *earliest_time_int,
+	     unsigned int *channel_q_int, unsigned int *channel_histories,
+	     int ndaq, int channel_stride)
+{
+    __shared__ int photon_id;
+    __shared__ int triangle_id;
+    __shared__ int solid_id;
+    __shared__ int channel_index;
+    __shared__ unsigned int history;
+    __shared__ float photon_time;
+    
+
+    if (threadIdx.x == 0) {
+	photon_id = first_photon + blockIdx.x;
+	triangle_id = last_hit_triangles[photon_id];
+	
+	if (triangle_id > -1) {
+	    solid_id = solid_map[triangle_id];
+	    history = photon_histories[photon_id];
+	    channel_index = detector->solid_id_to_channel_index[solid_id];
+	    photon_time = photon_times[photon_id];
+	}
+    }
+
+    __syncthreads();
+
+    if (triangle_id == -1 || channel_index < 0 || !(history & detection_state))
+      return;
+
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+    curandState rng = s[id];
+
+    for (int i = threadIdx.x; i < ndaq; i += blockDim.x) {
+	int channel_offset = channel_index + i * channel_stride;
+
+	float time = photon_time + curand_normal(&rng) * 1.2f;// + 
+	//sample_cdf(&rng, detector->time_cdf_len,
+	//	       detector->time_cdf_x, detector->time_cdf_y);
+	unsigned int time_int = float_to_sortable_int(time);
+	
+	float charge = 1.0f; //sample_cdf(&rng, detector->charge_cdf_len,
+	//detector->charge_cdf_x,
+	//detector->charge_cdf_y);
+	unsigned int charge_int = roundf(charge / detector->charge_unit);
+	
+	atomicMin(earliest_time_int + channel_offset, time_int);
+	atomicAdd(channel_q_int + channel_offset, charge_int);
+	atomicOr(channel_histories + channel_offset, history);
+    }
+
+    s[id] = rng;
+}
+
+__global__ void
 convert_sortable_int_to_float(int n, unsigned int *sortable_ints,
 			      float *float_output)
 {
diff --git a/chroma/cuda/pdf.cu b/chroma/cuda/pdf.cu
index bf42742..a9a9339 100644
--- a/chroma/cuda/pdf.cu
+++ b/chroma/cuda/pdf.cu
@@ -1,6 +1,8 @@
 // -*-c++-*-
 #include <curand_kernel.h>
 
+#include "sorting.h"
+
 extern "C"
 {
 
@@ -28,101 +30,196 @@ bin_hits(int nchannels, float *channel_q, float *channel_time,
 	pdf[bin] += 1;
     }
 }
-	
+
 __global__ void
-accumulate_pdf_eval(int time_only, int nchannels, unsigned int *event_hit,
-		    float *event_time, float *event_charge, float *mc_time,
-		    float *mc_charge, // quirk of DAQ!
+accumulate_bincount(int nchannels,
+		    int ndaq,
+		    unsigned int *event_hit,
+		    float *event_time, 
+		    float *mc_time,
 		    unsigned int *hitcount, unsigned int *bincount,
 		    float min_twidth, float tmin, float tmax,
-		    float min_qwidth, float qmin, float qmax,
-		    float *nearest_mc, int min_bin_content)
+		    int min_bin_content,
+		    unsigned int *map_channel_id_to_hit_offset,
+		    unsigned int *work_queues)
 {
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
+    int channel_id = threadIdx.x + blockDim.x * blockIdx.x;
 	
-    if (id >= nchannels)
+    if (channel_id >= nchannels)
 	return;
+
+    float channel_hitcount = hitcount[channel_id];
+    float channel_bincount = bincount[channel_id];
+    float channel_event_time = event_time[channel_id];
+    int channel_event_hit = event_hit[channel_id];
+    
+    unsigned int *work_queue = work_queues + map_channel_id_to_hit_offset[channel_id] * (ndaq + 1);
+    unsigned int next_slot = work_queue[0];
 	
-    // Was this channel hit in the Monte Carlo?
-    float channel_mc_time = mc_time[id];
-    if (channel_mc_time >= 1e8)
-	return; // Nothing else to do
+    for (int i=0; i < ndaq; i++) {
+	int read_offset = nchannels * i + channel_id;
+
+	// Was this channel hit in the Monte Carlo?
+	float channel_mc_time = mc_time[read_offset];
+	if (channel_mc_time >= 1e8)
+	    continue; // Nothing else to do
 	
-    // Is this channel inside the range of the PDF?
-    float distance;
-    int channel_bincount = bincount[id];
-    if (time_only) {
+	// Is this channel inside the range of the PDF?
+	float distance;
 	if (channel_mc_time < tmin || channel_mc_time > tmax)
-	    return;  // Nothing else to do
+	    continue;  // Nothing else to do
 		
-	// This MC information is contained in the PDF
-	hitcount[id] += 1;
-	  
+	channel_hitcount += 1;
+
 	// Was this channel hit in the event-of-interest?
-	int channel_event_hit = event_hit[id];
 	if (!channel_event_hit)
-	    return; // No need to update PDF value for unhit channel
-		
+	    continue; // No need to update PDF value for unhit channel
+    
 	// Are we inside the minimum size bin?
-	float channel_event_time = event_time[id];
 	distance = fabsf(channel_mc_time - channel_event_time);
-	if (distance < min_twidth/2.0)
-	    bincount[id] = channel_bincount + 1;
+	if (distance < min_twidth/2.0) {
+	    channel_bincount += 1;
+	}
 
+	// Add this hit to the work queue if we also need to sort it into the 
+	// nearest_mc_list
+	if (channel_bincount < min_bin_content) {
+	    work_queue[next_slot] = read_offset;
+	    next_slot++;
+	}
     }
-    else { // time and charge PDF
-	float channel_mc_charge = mc_charge[id]; // int->float conversion because DAQ just returns an integer
+
+    hitcount[channel_id] = channel_hitcount;
+    bincount[channel_id] = channel_bincount;
+    if (channel_event_hit)
+	work_queue[0] = next_slot;
+}
 	
-	if (channel_mc_time < tmin || channel_mc_time > tmax ||
-	    channel_mc_charge < qmin || channel_mc_charge > qmax)
-	    return;  // Nothing else to do
-		
-	// This MC information is contained in the PDF
-	hitcount[id] += 1;
-	  
-	// Was this channel hit in the event-of-interest?
-	int channel_event_hit = event_hit[id];
-	if (!channel_event_hit)
-	    return; // No need to update PDF value for unhit channel
-		
-	// Are we inside the minimum size bin?
-	float channel_event_time = event_time[id];
-	float channel_event_charge = event_charge[id];
-	float normalized_time_distance = fabsf(channel_event_time - channel_mc_time)/min_twidth/2.0;
-	float normalized_charge_distance = fabsf(channel_event_charge - channel_mc_charge)/min_qwidth/2.0;
-	distance = sqrt(normalized_time_distance*normalized_time_distance + normalized_charge_distance*normalized_charge_distance);
+__global__ void
+accumulate_nearest_neighbor(int nhit,
+			    int ndaq,
+			    unsigned int *map_hit_offset_to_channel_id,
+                            unsigned int *work_queues,
+			    float *event_time,
+			    float *mc_time,
+			    float *nearest_mc, int min_bin_content)
+{
+    int hit_id = threadIdx.x + blockDim.x * blockIdx.x;
+    
+    if (hit_id >= nhit)
+	return;
+
+    unsigned int *work_queue = work_queues + hit_id * (ndaq + 1);
+    int queue_items = work_queue[0] - 1;
+
+    int channel_id = map_hit_offset_to_channel_id[hit_id];
+    float channel_event_time = event_time[channel_id];
+    
+    float distance_table[1000];
+    int distance_table_len = 0;
+
+    // Load existing distance table
+    int offset = min_bin_content * hit_id;    
+    for (int i=0; i < min_bin_content; i++) {
+      float d = nearest_mc[offset + i];
+      if (d > 1e8)
+	break;
 
-	if (distance < 1.0f)
-	    bincount[id] = channel_bincount + 1;
+      distance_table[distance_table_len] = d;
+      distance_table_len++;
     }
+    
+    // append new entries
+    for (int i=0; i < queue_items; i++) {
+	unsigned int read_offset = work_queue[i+1];
+	float channel_mc_time = mc_time[read_offset];
+	float distance = fabsf(channel_mc_time - channel_event_time);
 
-    // Do we need to keep updating the nearest_mc list?
-    if (channel_bincount >= min_bin_content)
-      return; // No need to perform insertion into nearest_mc because bincount is a better estimate of the PDF value
+	distance_table[distance_table_len] = distance;
+	distance_table_len++;
+    }
 
-    // insertion sort the distance into the array of nearest MC points
-    int offset = min_bin_content * id;
-    int entry = min_bin_content - 1;
-	
-    // If last entry less than new entry, nothing to update
-    if (distance > nearest_mc[offset + entry])
-	return;
+    // Sort table
+    piksrt(distance_table_len, distance_table);
+    
+    // Copy first section of table back out to global memory
+    distance_table_len = min(distance_table_len, min_bin_content);
+    for (int i=0; i < distance_table_len; i++) {
+      nearest_mc[offset + i] = distance_table[i];
+    }
+}
+
+__global__ void
+accumulate_nearest_neighbor_block(int nhit,
+			    int ndaq,
+			    unsigned int *map_hit_offset_to_channel_id,
+                            unsigned int *work_queues,
+			    float *event_time,
+			    float *mc_time,
+			    float *nearest_mc, int min_bin_content)
+{
+    int hit_id = blockIdx.x;
+    
+    __shared__ float distance_table[1000];
+    __shared__ unsigned int *work_queue;
+    __shared__ int queue_items;
+    __shared__ int channel_id;
+    __shared__ float channel_event_time;
+    __shared__ int distance_table_len;
+    __shared__ int offset;
+
+    if (threadIdx.x == 0) {
+      work_queue = work_queues + hit_id * (ndaq + 1);
+      queue_items = work_queue[0] - 1;
+
+      channel_id = map_hit_offset_to_channel_id[hit_id];
+      channel_event_time = event_time[channel_id];
+      distance_table_len = min_bin_content;
+      offset = min_bin_content * hit_id;    
+    }
+
+    __syncthreads();
 
-    // Find where to insert the new entry while sliding the rest
-    // to the right
-    entry--;
-    while (entry >= 0) {
-	if (nearest_mc[offset+entry] >= distance)
-	    nearest_mc[offset+entry+1] = nearest_mc[offset+entry];
-	else
-	    break;
-	entry--;
+    // Load existing distance table
+    for (int i=threadIdx.x; i < min_bin_content; i += blockDim.x) {
+      float d = nearest_mc[offset + i];
+      if (d > 1e8) {
+	atomicMin(&distance_table_len, i);
+	break;
+      }
+      distance_table[i] = d;
     }
+    
+    __syncthreads();
 
-    nearest_mc[offset+entry+1] = distance;
+    // append new entries
+    for (int i=threadIdx.x; i < queue_items; i += blockDim.x) {
+	unsigned int read_offset = work_queue[i+1];
+	float channel_mc_time = mc_time[read_offset];
+	float distance = fabsf(channel_mc_time - channel_event_time);
+
+	distance_table[distance_table_len + i] = distance;
+    }
+
+    __syncthreads();
+
+    if (threadIdx.x == 0) {
+      distance_table_len += queue_items;
+      // Sort table
+      piksrt(distance_table_len, distance_table);
+      // Copy first section of table back out to global memory
+      distance_table_len = min(distance_table_len, min_bin_content);
+    }
+    
+    __syncthreads();
+
+    for (int i=threadIdx.x; i < distance_table_len; i += blockDim.x) {
+      nearest_mc[offset + i] = distance_table[i];
+    }
 }
 
 
+
 __global__ void
 accumulate_moments(int time_only, int nchannels,
 		   float *mc_time,
diff --git a/chroma/gpu/daq.py b/chroma/gpu/daq.py
index 571feba..ec5add4 100644
--- a/chroma/gpu/daq.py
+++ b/chroma/gpu/daq.py
@@ -1,15 +1,27 @@
 import numpy as np
 from pycuda import gpuarray as ga
+from pycuda import driver as cuda
 
 from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, \
     chunk_iterator
 from chroma import event
 
 class GPUChannels(object):
-    def __init__(self, t, q, flags):
+    def __init__(self, t, q, flags, ndaq=1, stride=None):
         self.t = t
         self.q = q
         self.flags = flags
+        self.ndaq = ndaq
+        if stride is None:
+            self.stride = len(t)
+        else:
+            self.stride = stride
+
+    def iterate_copies(self):
+        for i in xrange(self.ndaq):
+            yield GPUChannels(self.t[i*self.stride:(i+1)*self.stride],
+                              self.q[i*self.stride:(i+1)*self.stride],
+                              self.flags[i*self.stride:(i+1)*self.stride])
 
     def get(self):
         t = self.t.get()
@@ -19,10 +31,13 @@ class GPUChannels(object):
         # enough hit time were hit.
         return event.Channels(t<1e8, t, q, self.flags.get())
 
+    def __len__(self):
+        return self.t.size
+
 class GPUDaq(object):
-    def __init__(self, gpu_detector):
-        self.earliest_time_gpu = ga.empty(gpu_detector.nchannels, dtype=np.float32)
-        self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels, dtype=np.uint32)
+    def __init__(self, gpu_detector, ndaq=1):
+        self.earliest_time_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.float32)
+        self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.uint32)
         self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
         self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu)
         self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu), dtype=np.float32)
@@ -33,29 +48,50 @@ class GPUDaq(object):
         self.module = get_cu_module('daq.cu', options=cuda_options, 
                                     include_source_directory=True)
         self.gpu_funcs = GPUFuncs(self.module)
+        self.ndaq = ndaq
+        self.stride = gpu_detector.nchannels
 
-    def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024):
-        self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
+    def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None):
+        self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(64,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
         self.channel_q_int_gpu.fill(0)
         self.channel_q_gpu.fill(0)
         self.channel_history_gpu.fill(0)
 
-        n = len(gpuphotons.pos)
-
-        for first_photon, photons_this_round, blocks in \
-                chunk_iterator(n, nthreads_per_block, max_blocks):
-            self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), 
-                                   np.int32(first_photon), np.int32(photons_this_round), gpuphotons.t, 
-                                   gpuphotons.flags, gpuphotons.last_hit_triangles, 
-                                   self.solid_id_map_gpu,
-                                   self.detector_gpu,
-                                   self.earliest_time_int_gpu, 
-                                   self.channel_q_int_gpu, self.channel_history_gpu, 
-                                   block=(nthreads_per_block,1,1), grid=(blocks,1))
+        if start_photon is None:
+            start_photon = 0
+        if nphotons is None:
+            nphotons = len(gpuphotons.pos) - start_photon
 
+        if self.ndaq == 1:
+            for first_photon, photons_this_round, blocks in \
+                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
+                self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), 
+                                       np.int32(start_photon+first_photon), np.int32(photons_this_round), gpuphotons.t, 
+                                       gpuphotons.flags, gpuphotons.last_hit_triangles, 
+                                       self.solid_id_map_gpu,
+                                       self.detector_gpu,
+                                       self.earliest_time_int_gpu, 
+                                       self.channel_q_int_gpu, self.channel_history_gpu, 
+                                       block=(nthreads_per_block,1,1), grid=(blocks,1))
+        else:
+            for first_photon, photons_this_round, blocks in \
+                    chunk_iterator(nphotons, 1, max_blocks):
+                self.gpu_funcs.run_daq_many(rng_states, np.uint32(0x1 << 2), 
+                                            np.int32(start_photon), np.int32(photons_this_round), gpuphotons.t, 
+                                            gpuphotons.flags, gpuphotons.last_hit_triangles, 
+                                            self.solid_id_map_gpu,
+                                            self.detector_gpu,
+                                            self.earliest_time_int_gpu, 
+                                            self.channel_q_int_gpu, self.channel_history_gpu, 
+                                            np.int32(self.ndaq), np.int32(self.stride),
+                                            block=(nthreads_per_block,1,1), grid=(blocks,1))
+        cuda.Context.get_current().synchronize()
+            
         self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
 
         self.gpu_funcs.convert_charge_int_to_float(self.detector_gpu, self.channel_q_int_gpu, self.channel_q_gpu, block=(nthreads_per_block,1,1), grid=(len(self.channel_q_int_gpu)//nthreads_per_block+1,1))
 
-        return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu)
+        cuda.Context.get_current().synchronize()
+
+        return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu, self.ndaq, self.stride)
 
diff --git a/chroma/gpu/pdf.py b/chroma/gpu/pdf.py
index bd3c448..1b6340f 100644
--- a/chroma/gpu/pdf.py
+++ b/chroma/gpu/pdf.py
@@ -1,12 +1,13 @@
 import numpy as np
 from pycuda import gpuarray as ga
 import pycuda.driver as cuda
-from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs
+from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, chunk_iterator
+from chroma.tools import profile_if_possible
 
 class GPUKernelPDF(object):
     def __init__(self):
         self.module = get_cu_module('pdf.cu', options=cuda_options,
-                                    include_source_directory=False)
+                                    include_source_directory=True)
         self.gpu_funcs = GPUFuncs(self.module)
 
     def setup_moments(self, nchannels, trange, qrange, time_only=True):
@@ -56,7 +57,6 @@ class GPUKernelPDF(object):
                                           self.qmom2_gpu,
                                           block=(nthreads_per_block,1,1), 
                                           grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
-        cuda
         
     def compute_bandwidth(self, event_hit, event_time, event_charge, 
                           scale_factor=1.0):
@@ -177,7 +177,7 @@ class GPUKernelPDF(object):
 class GPUPDF(object):
     def __init__(self):
         self.module = get_cu_module('pdf.cu', options=cuda_options,
-                                    include_source_directory=False)
+                                    include_source_directory=True)
         self.gpu_funcs = GPUFuncs(self.module)
 
     def setup_pdf(self, nchannels, tbins, trange, qbins, qrange):
@@ -259,21 +259,31 @@ class GPUPDF(object):
             time_only: bool
               If True, only the time observable will be used in the PDF.
         """
+        self.event_nhit = np.count_nonzero(event_hit)
+        
+        # Define a mapping from an array of len(event_hit) to an array of length event_nhit
+        self.map_hit_offset_to_channel_id = np.where(event_hit)[0].astype(np.uint32)
+        self.map_hit_offset_to_channel_id_gpu = ga.to_gpu(self.map_hit_offset_to_channel_id)
+        self.map_channel_id_to_hit_offset = np.maximum(0, event_hit.cumsum() - 1).astype(np.uint32)
+        self.map_channel_id_to_hit_offset_gpu = ga.to_gpu(self.map_channel_id_to_hit_offset)
+
         self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
         self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
         self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))
 
         self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
         self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
-        self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content, 
+        self.nearest_mc_gpu = ga.empty(shape=self.event_nhit * min_bin_content, 
                                              dtype=np.float32)
         self.nearest_mc_gpu.fill(1e9)
-
+        
         self.min_twidth = min_twidth
         self.trange = trange
         self.min_qwidth = min_qwidth
         self.qrange = qrange
         self.min_bin_content = min_bin_content
+
+        assert time_only # Only support time right now
         self.time_only = time_only
 
     def clear_pdf_eval(self):
@@ -282,27 +292,38 @@ class GPUPDF(object):
         self.eval_bincount_gpu.fill(0)
         self.nearest_mc_gpu.fill(1e9)
 
-    def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64):
+    @profile_if_possible
+    def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64, max_blocks=10000):
         "Add the most recent results of run_daq() to the PDF evaluation."
-        self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only),
-                                           np.int32(len(self.event_hit_gpu)),
+        self.work_queues = ga.empty(shape=self.event_nhit * (gpuchannels.ndaq+1), dtype=np.uint32)
+        self.work_queues.fill(1)
+
+        self.gpu_funcs.accumulate_bincount(np.int32(self.event_hit_gpu.size),
+                                           np.int32(gpuchannels.ndaq),
                                            self.event_hit_gpu,
                                            self.event_time_gpu,
-                                           self.event_charge_gpu,
                                            gpuchannels.t,
-                                           gpuchannels.q,
                                            self.eval_hitcount_gpu,
                                            self.eval_bincount_gpu,
                                            np.float32(self.min_twidth),
                                            np.float32(self.trange[0]),
                                            np.float32(self.trange[1]),
-                                           np.float32(self.min_qwidth),
-                                           np.float32(self.qrange[0]),
-                                           np.float32(self.qrange[1]),
-                                           self.nearest_mc_gpu,
                                            np.int32(self.min_bin_content),
+                                           self.map_channel_id_to_hit_offset_gpu,
+                                           self.work_queues,
                                            block=(nthreads_per_block,1,1), 
-                                           grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
+                                           grid=(self.event_hit_gpu.size//nthreads_per_block+1,1))
+        self.gpu_funcs.accumulate_nearest_neighbor_block(np.int32(self.event_nhit),
+                                                         np.int32(gpuchannels.ndaq),
+                                                         self.map_hit_offset_to_channel_id_gpu,
+                                                         self.work_queues,
+                                                         self.event_time_gpu,
+                                                         gpuchannels.t,
+                                                         self.nearest_mc_gpu,
+                                                         np.int32(self.min_bin_content),
+                                                         block=(nthreads_per_block,1,1), 
+                                                         grid=(self.event_nhit,1))
+        cuda.Context.get_current().synchronize()
 
     def get_pdf_eval(self):
         evhit = self.event_hit_gpu.get().astype(bool)
@@ -325,14 +346,16 @@ class GPUPDF(object):
         # PDF value for low stats bins
         low_stats = ~high_stats & (hitcount > 0) & evhit
 
-        nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content))
+        nearest_mc_by_hit = self.nearest_mc_gpu.get().reshape((self.event_nhit, self.min_bin_content))
+        nearest_mc = np.empty(shape=(len(hitcount), self.min_bin_content), dtype=np.float32)
+        nearest_mc.fill(1e9)
+        nearest_mc[self.map_hit_offset_to_channel_id,:] = nearest_mc_by_hit
 
         # Deal with the case where we did not even get min_bin_content events
         # in the PDF but also clamp the lower range to ensure we don't index
         # by a negative number in 2 lines
         last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1)
         distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry]
-
         if low_stats.any():
             if self.time_only:
                 pdf_value[low_stats] = (last_valid_entry[low_stats] + 1).astype(float) / hitcount[low_stats] / distance[low_stats] / 2.0
@@ -344,5 +367,4 @@ class GPUPDF(object):
         # PDFs with no stats got zero by default during array creation
         
         print 'high_stats:', high_stats.sum(), 'low_stats', low_stats.sum()
-
         return hitcount, pdf_value, pdf_value * pdf_frac_uncert
diff --git a/chroma/likelihood.py b/chroma/likelihood.py
index 5e8e678..a29dc6c 100644
--- a/chroma/likelihood.py
+++ b/chroma/likelihood.py
@@ -57,11 +57,12 @@ class Likelihood(object):
         hitcount, pdf_prob, pdf_prob_uncert = \
             self.sim.eval_pdf(self.event.channels,
                               vertex_generator,
-                              0.1, self.trange, 
+                              0.2, self.trange, 
                               1, self.qrange,
                               nreps=nreps,
                               ndaq=ndaq,
-                              time_only=self.time_only)
+                              time_only=self.time_only,
+                              min_bin_content=ndaq*10)
         
         # Normalize probabilities and put a floor to keep the log finite
         hit_prob = hitcount.astype(np.float32) / ntotal
@@ -80,7 +81,6 @@ class Likelihood(object):
 
         return hit_prob, pdf_prob, pdf_prob_uncert
         
-    @profile_if_possible
     def eval(self, vertex_generator, nevals, nreps=16, ndaq=50):
         """
         Return the negative log likelihood that the detector event set in the
@@ -100,9 +100,8 @@ class Likelihood(object):
 
         # Then include the probability densities of the observed
         # charges and times.
-        hit_pdf_ufloat = unumpy.uarray((pdf_prob[self.event.channels.hit],
-                                        pdf_prob_uncert[self.event.channels.hit]))
-        log_likelihood += unumpy.log(hit_pdf_ufloat).sum()
+        log_likelihood += ufloat((np.log(pdf_prob[self.event.channels.hit]).sum(),
+                                  0.0))
         
         return -log_likelihood
 
diff --git a/chroma/sim.py b/chroma/sim.py
index 93bcdfb..a82f1f8 100644
--- a/chroma/sim.py
+++ b/chroma/sim.py
@@ -9,13 +9,14 @@ from chroma import event
 from chroma import itertoolset
 from chroma.tools import profile_if_possible
 
+import pycuda.driver as cuda
+
 def pick_seed():
     """Returns a seed for a random number generator selected using
     a mixture of the current time and the current process ID."""
     return int(time.time()) ^ (os.getpid() << 16)
 
 class Simulation(object):
-    @profile_if_possible
     def __init__(self, detector, seed=None, cuda_device=None,
                  geant4_processes=4, nthreads_per_block=64, max_blocks=1024):
         self.detector = detector
@@ -122,9 +123,12 @@ class Simulation(object):
         
         return self.gpu_pdf.get_pdfs()
 
-    def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=50, nreps=1, ndaq=1, time_only=True):
+    @profile_if_possible
+    def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=100, nreps=1, ndaq=1, time_only=True):
         """Returns tuple: 1D array of channel hit counts, 1D array of PDF
         probability densities."""
+        gpu_daq = gpu.GPUDaq(self.gpu_geometry, ndaq=ndaq)
+
         self.gpu_pdf.setup_pdf_eval(event_channels.hit,
                                     event_channels.t,
                                     event_channels.q,
@@ -145,10 +149,16 @@ class Simulation(object):
             gpu_photons.propagate(self.gpu_geometry, self.rng_states,
                                   nthreads_per_block=self.nthreads_per_block,
                                   max_blocks=self.max_blocks)
-            for gpu_photon_slice in gpu_photons.iterate_copies():
-                for idaq in xrange(ndaq):
-                    gpu_channels = self.gpu_daq.acquire(gpu_photon_slice, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
-                    self.gpu_pdf.accumulate_pdf_eval(gpu_channels)
+            nphotons = gpu_photons.true_nphotons
+            for i in xrange(gpu_photons.ncopies):
+                start_photon = i * nphotons
+                gpu_photon_slice = gpu_photons.select(event.SURFACE_DETECT,
+                                                           start_photon=start_photon,
+                                                           nphotons=nphotons)
+                if len(gpu_photon_slice) == 0:
+                    continue
+                gpu_channels = gpu_daq.acquire(gpu_photon_slice, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
+                self.gpu_pdf.accumulate_pdf_eval(gpu_channels, 32)
         
         return self.gpu_pdf.get_pdf_eval()