Major overhaul to the way that DAQ and PDF information is accumulated

to speed up likelihood evaluation. When generating a likelihood, the DAQ can be run many times in parallel by the GPU, creating a large block of channel hit information in memory. The PDF accumulator processes that entire block in two passes: * First update the channel hit count, and the count of channel hits falling into the bin around the channel hit time being evaluated. Add any channel hits that should also be included in the n-th nearest neighbor calculation to a channel-specific work queue. * Process all the work queues for each channel and update the list of nearest neighbors. This is hugely faster than what we were doing before. Kernel estimation (or some kind of orthogonal function expansion of the PDF) should be better ultimately, but for now the nearest neighbor approach to PDF estimation seems to be working the best.
author: Stan Seibert <stan@mtrr.org> 2011-10-21 17:02:11 -0400
committer: tlatorre <tlatorre@uchicago.edu> 2021-05-09 08:42:38 -0700
commit: a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d (patch)
tree: e7ca82bad70c395689ccd4284ff8e527bfa54c8e
parent: ce45e90e90f801bf7b523c97399b1986c8bfa97e (diff)
download: chroma-a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d.tar.gz
chroma-a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d.tar.bz2
chroma-a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d.zip
7 files changed, 349 insertions, 122 deletions
diff --git a/chroma/benchmark.py b/chroma/benchmark.py
index 44cf44b..06a8eba 100755
--- a/chroma/benchmark.py
+++ b/chroma/benchmark.py
@@ -146,6 +146,7 @@ def pdf(gpu_detector, npdfs=10, nevents=100, nreps=16, ndaq=1,
 
     return nevents*nreps*ndaq/ufloat((np.mean(run_times),np.std(run_times)))
 
+@tools.profile_if_possible
 def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128,
              nthreads_per_block=64, max_blocks=1024):
     """
@@ -179,6 +180,7 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128,
     data_ev_channels = gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks).get()
     
     # Setup PDF evaluation
+    gpu_daq = gpu.GPUDaq(gpu_detector, ndaq=ndaq)
     gpu_pdf = gpu.GPUPDF()
     gpu_pdf.setup_pdf_eval(data_ev_channels.hit,
                            data_ev_channels.t,
@@ -205,10 +207,10 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128,
             gpu_photons.propagate(gpu_detector, rng_states,
                                   nthreads_per_block, max_blocks)
             for gpu_photon_slice in gpu_photons.iterate_copies():
-                for idaq in xrange(ndaq):
-                    gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states,
-                                                   nthreads_per_block, max_blocks)
-                    gpu_pdf.accumulate_pdf_eval(gpu_channels, nthreads_per_block)
+                gpu_photon_slice = gpu_photon_slice.select(event.SURFACE_DETECT)
+                gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states,
+                                               nthreads_per_block, max_blocks)
+                gpu_pdf.accumulate_pdf_eval(gpu_channels, nthreads_per_block)
 
         cuda.Context.get_current().synchronize()        
         elapsed = time.time() - t0
@@ -224,6 +226,8 @@ if __name__ == '__main__':
     from chroma import demo
     import gc
 
+    tools.enable_debug_on_crash()
+
     # Default to run all tests
     tests = ['ray', 'load', 'propagate', 'pdf', 'pdf_eval']
     if len(sys.argv) > 1:
diff --git a/chroma/cuda/daq.cu b/chroma/cuda/daq.cu
index b995bfb..3ed402d 100644
--- a/chroma/cuda/daq.cu
+++ b/chroma/cuda/daq.cu
@@ -79,6 +79,65 @@ run_daq(curandState *s, unsigned int detection_state,
 }
 
 __global__ void
+run_daq_many(curandState *s, unsigned int detection_state,
+	     int first_photon, int nphotons, float *photon_times,
+	     unsigned int *photon_histories, int *last_hit_triangles,
+	     int *solid_map,
+	     Detector *detector,
+	     unsigned int *earliest_time_int,
+	     unsigned int *channel_q_int, unsigned int *channel_histories,
+	     int ndaq, int channel_stride)
+{
+    __shared__ int photon_id;
+    __shared__ int triangle_id;
+    __shared__ int solid_id;
+    __shared__ int channel_index;
+    __shared__ unsigned int history;
+    __shared__ float photon_time;
+    
+
+    if (threadIdx.x == 0) {
+	photon_id = first_photon + blockIdx.x;
+	triangle_id = last_hit_triangles[photon_id];
+	
+	if (triangle_id > -1) {
+	    solid_id = solid_map[triangle_id];
+	    history = photon_histories[photon_id];
+	    channel_index = detector->solid_id_to_channel_index[solid_id];
+	    photon_time = photon_times[photon_id];
+	}
+    }
+
+    __syncthreads();
+
+    if (triangle_id == -1 || channel_index < 0 || !(history & detection_state))
+      return;
+
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+    curandState rng = s[id];
+
+    for (int i = threadIdx.x; i < ndaq; i += blockDim.x) {
+	int channel_offset = channel_index + i * channel_stride;
+
+	float time = photon_time + curand_normal(&rng) * 1.2f;// + 
+	//sample_cdf(&rng, detector->time_cdf_len,
+	//	       detector->time_cdf_x, detector->time_cdf_y);
+	unsigned int time_int = float_to_sortable_int(time);
+	
+	float charge = 1.0f; //sample_cdf(&rng, detector->charge_cdf_len,
+	//detector->charge_cdf_x,
+	//detector->charge_cdf_y);
+	unsigned int charge_int = roundf(charge / detector->charge_unit);
+	
+	atomicMin(earliest_time_int + channel_offset, time_int);
+	atomicAdd(channel_q_int + channel_offset, charge_int);
+	atomicOr(channel_histories + channel_offset, history);
+    }
+
+    s[id] = rng;
+}
+
+__global__ void
 convert_sortable_int_to_float(int n, unsigned int *sortable_ints,
 			      float *float_output)
 {
diff --git a/chroma/cuda/pdf.cu b/chroma/cuda/pdf.cu
index bf42742..a9a9339 100644
--- a/chroma/cuda/pdf.cu
+++ b/chroma/cuda/pdf.cu
@@ -1,6 +1,8 @@
 // -*-c++-*-
 #include <curand_kernel.h>
 
+#include "sorting.h"
+
 extern "C"
 {
 
@@ -28,101 +30,196 @@ bin_hits(int nchannels, float *channel_q, float *channel_time,
 	pdf[bin] += 1;
     }
 }
-	
+
 __global__ void
-accumulate_pdf_eval(int time_only, int nchannels, unsigned int *event_hit,
-		    float *event_time, float *event_charge, float *mc_time,
-		    float *mc_charge, // quirk of DAQ!
+accumulate_bincount(int nchannels,
+		    int ndaq,
+		    unsigned int *event_hit,
+		    float *event_time, 
+		    float *mc_time,
 		    unsigned int *hitcount, unsigned int *bincount,
 		    float min_twidth, float tmin, float tmax,
-		    float min_qwidth, float qmin, float qmax,
-		    float *nearest_mc, int min_bin_content)
+		    int min_bin_content,
+		    unsigned int *map_channel_id_to_hit_offset,
+		    unsigned int *work_queues)
 {
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
+    int channel_id = threadIdx.x + blockDim.x * blockIdx.x;
 	
-    if (id >= nchannels)
+    if (channel_id >= nchannels)
 	return;
+
+    float channel_hitcount = hitcount[channel_id];
+    float channel_bincount = bincount[channel_id];
+    float channel_event_time = event_time[channel_id];
+    int channel_event_hit = event_hit[channel_id];
+    
+    unsigned int *work_queue = work_queues + map_channel_id_to_hit_offset[channel_id] * (ndaq + 1);
+    unsigned int next_slot = work_queue[0];
 	
-    // Was this channel hit in the Monte Carlo?
-    float channel_mc_time = mc_time[id];
-    if (channel_mc_time >= 1e8)
-	return; // Nothing else to do
+    for (int i=0; i < ndaq; i++) {
+	int read_offset = nchannels * i + channel_id;
+
+	// Was this channel hit in the Monte Carlo?
+	float channel_mc_time = mc_time[read_offset];
+	if (channel_mc_time >= 1e8)
+	    continue; // Nothing else to do
 	
-    // Is this channel inside the range of the PDF?
-    float distance;
-    int channel_bincount = bincount[id];
-    if (time_only) {
+	// Is this channel inside the range of the PDF?
+	float distance;
 	if (channel_mc_time < tmin || channel_mc_time > tmax)
-	    return;  // Nothing else to do
+	    continue;  // Nothing else to do
 		
-	// This MC information is contained in the PDF
-	hitcount[id] += 1;
-	  
+	channel_hitcount += 1;
+
 	// Was this channel hit in the event-of-interest?
-	int channel_event_hit = event_hit[id];
 	if (!channel_event_hit)
-	    return; // No need to update PDF value for unhit channel
-		
+	    continue; // No need to update PDF value for unhit channel
+    
 	// Are we inside the minimum size bin?
-	float channel_event_time = event_time[id];
 	distance = fabsf(channel_mc_time - channel_event_time);
-	if (distance < min_twidth/2.0)
-	    bincount[id] = channel_bincount + 1;
+	if (distance < min_twidth/2.0) {
+	    channel_bincount += 1;
+	}
 
+	// Add this hit to the work queue if we also need to sort it into the 
+	// nearest_mc_list
+	if (channel_bincount < min_bin_content) {
+	    work_queue[next_slot] = read_offset;
+	    next_slot++;
+	}
     }
-    else { // time and charge PDF
-	float channel_mc_charge = mc_charge[id]; // int->float conversion because DAQ just returns an integer
+
+    hitcount[channel_id] = channel_hitcount;
+    bincount[channel_id] = channel_bincount;
+    if (channel_event_hit)
+	work_queue[0] = next_slot;
+}
 	
-	if (channel_mc_time < tmin || channel_mc_time > tmax ||
-	    channel_mc_charge < qmin || channel_mc_charge > qmax)
-	    return;  // Nothing else to do
-		
-	// This MC information is contained in the PDF
-	hitcount[id] += 1;
-	  
-	// Was this channel hit in the event-of-interest?
-	int channel_event_hit = event_hit[id];
-	if (!channel_event_hit)
-	    return; // No need to update PDF value for unhit channel
-		
-	// Are we inside the minimum size bin?
-	float channel_event_time = event_time[id];
-	float channel_event_charge = event_charge[id];
-	float normalized_time_distance = fabsf(channel_event_time - channel_mc_time)/min_twidth/2.0;
-	float normalized_charge_distance = fabsf(channel_event_charge - channel_mc_charge)/min_qwidth/2.0;
-	distance = sqrt(normalized_time_distance*normalized_time_distance + normalized_charge_distance*normalized_charge_distance);
+__global__ void
+accumulate_nearest_neighbor(int nhit,
+			    int ndaq,
+			    unsigned int *map_hit_offset_to_channel_id,
+                            unsigned int *work_queues,
+			    float *event_time,
+			    float *mc_time,
+			    float *nearest_mc, int min_bin_content)
+{
+    int hit_id = threadIdx.x + blockDim.x * blockIdx.x;
+    
+    if (hit_id >= nhit)
+	return;
+
+    unsigned int *work_queue = work_queues + hit_id * (ndaq + 1);
+    int queue_items = work_queue[0] - 1;
+
+    int channel_id = map_hit_offset_to_channel_id[hit_id];
+    float channel_event_time = event_time[channel_id];
+    
+    float distance_table[1000];
+    int distance_table_len = 0;
+
+    // Load existing distance table
+    int offset = min_bin_content * hit_id;    
+    for (int i=0; i < min_bin_content; i++) {
+      float d = nearest_mc[offset + i];
+      if (d > 1e8)
+	break;
 
-	if (distance < 1.0f)
-	    bincount[id] = channel_bincount + 1;
+      distance_table[distance_table_len] = d;
+      distance_table_len++;
     }
+    
+    // append new entries
+    for (int i=0; i < queue_items; i++) {
+	unsigned int read_offset = work_queue[i+1];
+	float channel_mc_time = mc_time[read_offset];
+	float distance = fabsf(channel_mc_time - channel_event_time);
 
-    // Do we need to keep updating the nearest_mc list?
-    if (channel_bincount >= min_bin_content)
-      return; // No need to perform insertion into nearest_mc because bincount is a better estimate of the PDF value
+	distance_table[distance_table_len] = distance;
+	distance_table_len++;
+    }
 
-    // insertion sort the distance into the array of nearest MC points
-    int offset = min_bin_content * id;
-    int entry = min_bin_content - 1;
-	
-    // If last entry less than new entry, nothing to update
-    if (distance > nearest_mc[offset + entry])
-	return;
+    // Sort table
+    piksrt(distance_table_len, distance_table);
+    
+    // Copy first section of table back out to global memory
+    distance_table_len = min(distance_table_len, min_bin_content);
+    for (int i=0; i < distance_table_len; i++) {
+      nearest_mc[offset + i] = distance_table[i];
+    }
+}
+
+__global__ void
+accumulate_nearest_neighbor_block(int nhit,
+			    int ndaq,
+			    unsigned int *map_hit_offset_to_channel_id,
+                            unsigned int *work_queues,
+			    float *event_time,
+			    float *mc_time,
+			    float *nearest_mc, int min_bin_content)
+{
+    int hit_id = blockIdx.x;
+    
+    __shared__ float distance_table[1000];
+    __shared__ unsigned int *work_queue;
+    __shared__ int queue_items;
+    __shared__ int channel_id;
+    __shared__ float channel_event_time;
+    __shared__ int distance_table_len;
+    __shared__ int offset;
+
+    if (threadIdx.x == 0) {
+      work_queue = work_queues + hit_id * (ndaq + 1);
+      queue_items = work_queue[0] - 1;
+
+      channel_id = map_hit_offset_to_channel_id[hit_id];
+      channel_event_time = event_time[channel_id];
+      distance_table_len = min_bin_content;
+      offset = min_bin_content * hit_id;    
+    }
+
+    __syncthreads();
 
-    // Find where to insert the new entry while sliding the rest
-    // to the right
-    entry--;
-    while (entry >= 0) {
-	if (nearest_mc[offset+entry] >= distance)
-	    nearest_mc[offset+entry+1] = nearest_mc[offset+entry];
-	else
-	    break;
-	entry--;
+    // Load existing distance table
+    for (int i=threadIdx.x; i < min_bin_content; i += blockDim.x) {
+      float d = nearest_mc[offset + i];
+      if (d > 1e8) {
+	atomicMin(&distance_table_len, i);
+	break;
+      }
+      distance_table[i] = d;
     }
+    
+    __syncthreads();
 
-    nearest_mc[offset+entry+1] = distance;
+    // append new entries
+    for (int i=threadIdx.x; i < queue_items; i += blockDim.x) {
+	unsigned int read_offset = work_queue[i+1];
+	float channel_mc_time = mc_time[read_offset];
+	float distance = fabsf(channel_mc_time - channel_event_time);
+
+	distance_table[distance_table_len + i] = distance;
+    }
+
+    __syncthreads();
+
+    if (threadIdx.x == 0) {
+      distance_table_len += queue_items;
+      // Sort table
+      piksrt(distance_table_len, distance_table);
+      // Copy first section of table back out to global memory
+      distance_table_len = min(distance_table_len, min_bin_content);
+    }
+    
+    __syncthreads();
+
+    for (int i=threadIdx.x; i < distance_table_len; i += blockDim.x) {
+      nearest_mc[offset + i] = distance_table[i];
+    }
 }
 
 
+
 __global__ void
 accumulate_moments(int time_only, int nchannels,
 		   float *mc_time,
diff --git a/chroma/gpu/daq.py b/chroma/gpu/daq.py
index 571feba..ec5add4 100644
--- a/chroma/gpu/daq.py
+++ b/chroma/gpu/daq.py
@@ -1,15 +1,27 @@
 import numpy as np
 from pycuda import gpuarray as ga
+from pycuda import driver as cuda
 
 from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, \
     chunk_iterator
 from chroma import event
 
 class GPUChannels(object):
-    def __init__(self, t, q, flags):
+    def __init__(self, t, q, flags, ndaq=1, stride=None):
         self.t = t
         self.q = q
         self.flags = flags
+        self.ndaq = ndaq
+        if stride is None:
+            self.stride = len(t)
+        else:
+            self.stride = stride
+
+    def iterate_copies(self):
+        for i in xrange(self.ndaq):
+            yield GPUChannels(self.t[i*self.stride:(i+1)*self.stride],
+                              self.q[i*self.stride:(i+1)*self.stride],
+                              self.flags[i*self.stride:(i+1)*self.stride])
 
     def get(self):
         t = self.t.get()
@@ -19,10 +31,13 @@ class GPUChannels(object):
         # enough hit time were hit.
         return event.Channels(t<1e8, t, q, self.flags.get())
 
+    def __len__(self):
+        return self.t.size
+
 class GPUDaq(object):
-    def __init__(self, gpu_detector):
-        self.earliest_time_gpu = ga.empty(gpu_detector.nchannels, dtype=np.float32)
-        self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels, dtype=np.uint32)
+    def __init__(self, gpu_detector, ndaq=1):
+        self.earliest_time_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.float32)
+        self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.uint32)
         self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
         self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu)
         self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu), dtype=np.float32)
@@ -33,29 +48,50 @@ class GPUDaq(object):
         self.module = get_cu_module('daq.cu', options=cuda_options, 
                                     include_source_directory=True)
         self.gpu_funcs = GPUFuncs(self.module)
+        self.ndaq = ndaq
+        self.stride = gpu_detector.nchannels
 
-    def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024):
-        self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
+    def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None):
+        self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(64,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
         self.channel_q_int_gpu.fill(0)
         self.channel_q_gpu.fill(0)
         self.channel_history_gpu.fill(0)
 
-        n = len(gpuphotons.pos)
-
-        for first_photon, photons_this_round, blocks in \
-                chunk_iterator(n, nthreads_per_block, max_blocks):
-            self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), 
-                                   np.int32(first_photon), np.int32(photons_this_round), gpuphotons.t, 
-                                   gpuphotons.flags, gpuphotons.last_hit_triangles, 
-                                   self.solid_id_map_gpu,
-                                   self.detector_gpu,
-                                   self.earliest_time_int_gpu, 
-                                   self.channel_q_int_gpu, self.channel_history_gpu, 
-                                   block=(nthreads_per_block,1,1), grid=(blocks,1))
+        if start_photon is None:
+            start_photon = 0
+        if nphotons is None:
+            nphotons = len(gpuphotons.pos) - start_photon
 
+        if self.ndaq == 1:
+            for first_photon, photons_this_round, blocks in \
+                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
+                self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), 
+                                       np.int32(start_photon+first_photon), np.int32(photons_this_round), gpuphotons.t, 
+                                       gpuphotons.flags, gpuphotons.last_hit_triangles, 
+                                       self.solid_id_map_gpu,
+                                       self.detector_gpu,
+                                       self.earliest_time_int_gpu, 
+                                       self.channel_q_int_gpu, self.channel_history_gpu, 
+                                       block=(nthreads_per_block,1,1), grid=(blocks,1))
+        else:
+            for first_photon, photons_this_round, blocks in \
+                    chunk_iterator(nphotons, 1, max_blocks):
+                self.gpu_funcs.run_daq_many(rng_states, np.uint32(0x1 << 2), 
+                                            np.int32(start_photon), np.int32(photons_this_round), gpuphotons.t, 
+                                            gpuphotons.flags, gpuphotons.last_hit_triangles, 
+                                            self.solid_id_map_gpu,
+                                            self.detector_gpu,
+                                            self.earliest_time_int_gpu, 
+                                            self.channel_q_int_gpu, self.channel_history_gpu, 
+                                            np.int32(self.ndaq), np.int32(self.stride),
+                                            block=(nthreads_per_block,1,1), grid=(blocks,1))
+        cuda.Context.get_current().synchronize()
+            
         self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
 
         self.gpu_funcs.convert_charge_int_to_float(self.detector_gpu, self.channel_q_int_gpu, self.channel_q_gpu, block=(nthreads_per_block,1,1), grid=(len(self.channel_q_int_gpu)//nthreads_per_block+1,1))
 
-        return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu)
+        cuda.Context.get_current().synchronize()
+
+        return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu, self.ndaq, self.stride)
 
diff --git a/chroma/gpu/pdf.py b/chroma/gpu/pdf.py
index bd3c448..1b6340f 100644
--- a/chroma/gpu/pdf.py
+++ b/chroma/gpu/pdf.py
@@ -1,12 +1,13 @@
 import numpy as np
 from pycuda import gpuarray as ga
 import pycuda.driver as cuda
-from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs
+from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, chunk_iterator
+from chroma.tools import profile_if_possible
 
 class GPUKernelPDF(object):
     def __init__(self):
         self.module = get_cu_module('pdf.cu', options=cuda_options,
-                                    include_source_directory=False)
+                                    include_source_directory=True)
         self.gpu_funcs = GPUFuncs(self.module)
 
     def setup_moments(self, nchannels, trange, qrange, time_only=True):
@@ -56,7 +57,6 @@ class GPUKernelPDF(object):
                                           self.qmom2_gpu,
                                           block=(nthreads_per_block,1,1), 
                                           grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
-        cuda
         
     def compute_bandwidth(self, event_hit, event_time, event_charge, 
                           scale_factor=1.0):
@@ -177,7 +177,7 @@ class GPUKernelPDF(object):
 class GPUPDF(object):
     def __init__(self):
         self.module = get_cu_module('pdf.cu', options=cuda_options,
-                                    include_source_directory=False)
+                                    include_source_directory=True)
         self.gpu_funcs = GPUFuncs(self.module)
 
     def setup_pdf(self, nchannels, tbins, trange, qbins, qrange):
@@ -259,21 +259,31 @@ class GPUPDF(object):
             time_only: bool
               If True, only the time observable will be used in the PDF.
         """
+        self.event_nhit = np.count_nonzero(event_hit)
+        
+        # Define a mapping from an array of len(event_hit) to an array of length event_nhit
+        self.map_hit_offset_to_channel_id = np.where(event_hit)[0].astype(np.uint32)
+        self.map_hit_offset_to_channel_id_gpu = ga.to_gpu(self.map_hit_offset_to_channel_id)
+        self.map_channel_id_to_hit_offset = np.maximum(0, event_hit.cumsum() - 1).astype(np.uint32)
+        self.map_channel_id_to_hit_offset_gpu = ga.to_gpu(self.map_channel_id_to_hit_offset)
+
         self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
         self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
         self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))
 
         self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
         self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
-        self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content, 
+        self.nearest_mc_gpu = ga.empty(shape=self.event_nhit * min_bin_content, 
                                              dtype=np.float32)
         self.nearest_mc_gpu.fill(1e9)
-
+        
         self.min_twidth = min_twidth
         self.trange = trange
         self.min_qwidth = min_qwidth
         self.qrange = qrange
         self.min_bin_content = min_bin_content
+
+        assert time_only # Only support time right now
         self.time_only = time_only
 
     def clear_pdf_eval(self):
@@ -282,27 +292,38 @@ class GPUPDF(object):
         self.eval_bincount_gpu.fill(0)
         self.nearest_mc_gpu.fill(1e9)
 
-    def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64):
+    @profile_if_possible
+    def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64, max_blocks=10000):
         "Add the most recent results of run_daq() to the PDF evaluation."
-        self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only),
-                                           np.int32(len(self.event_hit_gpu)),
+        self.work_queues = ga.empty(shape=self.event_nhit * (gpuchannels.ndaq+1), dtype=np.uint32)
+        self.work_queues.fill(1)
+
+        self.gpu_funcs.accumulate_bincount(np.int32(self.event_hit_gpu.size),
+                                           np.int32(gpuchannels.ndaq),
                                            self.event_hit_gpu,
                                            self.event_time_gpu,
-                                           self.event_charge_gpu,
                                            gpuchannels.t,
-                                           gpuchannels.q,
                                            self.eval_hitcount_gpu,
                                            self.eval_bincount_gpu,
                                            np.float32(self.min_twidth),
                                            np.float32(self.trange[0]),
                                            np.float32(self.trange[1]),
-                                           np.float32(self.min_qwidth),
-                                           np.float32(self.qrange[0]),
-                                           np.float32(self.qrange[1]),
-                                           self.nearest_mc_gpu,
                                            np.int32(self.min_bin_content),
+                                           self.map_channel_id_to_hit_offset_gpu,
+                                           self.work_queues,
                                            block=(nthreads_per_block,1,1), 
-                                           grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
+                                           grid=(self.event_hit_gpu.size//nthreads_per_block+1,1))
+        self.gpu_funcs.accumulate_nearest_neighbor_block(np.int32(self.event_nhit),
+                                                         np.int32(gpuchannels.ndaq),
+                                                         self.map_hit_offset_to_channel_id_gpu,
+                                                         self.work_queues,
+                                                         self.event_time_gpu,
+                                                         gpuchannels.t,
+                                                         self.nearest_mc_gpu,
+                                                         np.int32(self.min_bin_content),
+                                                         block=(nthreads_per_block,1,1), 
+                                                         grid=(self.event_nhit,1))
+        cuda.Context.get_current().synchronize()
 
     def get_pdf_eval(self):
         evhit = self.event_hit_gpu.get().astype(bool)
@@ -325,14 +346,16 @@ class GPUPDF(object):
         # PDF value for low stats bins
         low_stats = ~high_stats & (hitcount > 0) & evhit
 
-        nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content))
+        nearest_mc_by_hit = self.nearest_mc_gpu.get().reshape((self.event_nhit, self.min_bin_content))
+        nearest_mc = np.empty(shape=(len(hitcount), self.min_bin_content), dtype=np.float32)
+        nearest_mc.fill(1e9)
+        nearest_mc[self.map_hit_offset_to_channel_id,:] = nearest_mc_by_hit
 
         # Deal with the case where we did not even get min_bin_content events
         # in the PDF but also clamp the lower range to ensure we don't index
         # by a negative number in 2 lines
         last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1)
         distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry]
-
         if low_stats.any():
             if self.time_only:
                 pdf_value[low_stats] = (last_valid_entry[low_stats] + 1).astype(float) / hitcount[low_stats] / distance[low_stats] / 2.0
@@ -344,5 +367,4 @@ class GPUPDF(object):
         # PDFs with no stats got zero by default during array creation
         
         print 'high_stats:', high_stats.sum(), 'low_stats', low_stats.sum()
-
         return hitcount, pdf_value, pdf_value * pdf_frac_uncert
diff --git a/chroma/likelihood.py b/chroma/likelihood.py
index 5e8e678..a29dc6c 100644
--- a/chroma/likelihood.py
+++ b/chroma/likelihood.py
@@ -57,11 +57,12 @@ class Likelihood(object):
         hitcount, pdf_prob, pdf_prob_uncert = \
             self.sim.eval_pdf(self.event.channels,
                               vertex_generator,
-                              0.1, self.trange, 
+                              0.2, self.trange, 
                               1, self.qrange,
                               nreps=nreps,
                               ndaq=ndaq,
-                              time_only=self.time_only)
+                              time_only=self.time_only,
+                              min_bin_content=ndaq*10)
         
         # Normalize probabilities and put a floor to keep the log finite
         hit_prob = hitcount.astype(np.float32) / ntotal
@@ -80,7 +81,6 @@ class Likelihood(object):
 
         return hit_prob, pdf_prob, pdf_prob_uncert
         
-    @profile_if_possible
     def eval(self, vertex_generator, nevals, nreps=16, ndaq=50):
         """
         Return the negative log likelihood that the detector event set in the
@@ -100,9 +100,8 @@ class Likelihood(object):
 
         # Then include the probability densities of the observed
         # charges and times.
-        hit_pdf_ufloat = unumpy.uarray((pdf_prob[self.event.channels.hit],
-                                        pdf_prob_uncert[self.event.channels.hit]))
-        log_likelihood += unumpy.log(hit_pdf_ufloat).sum()
+        log_likelihood += ufloat((np.log(pdf_prob[self.event.channels.hit]).sum(),
+                                  0.0))
         
         return -log_likelihood
 
diff --git a/chroma/sim.py b/chroma/sim.py
index 93bcdfb..a82f1f8 100644
--- a/chroma/sim.py
+++ b/chroma/sim.py
@@ -9,13 +9,14 @@ from chroma import event
 from chroma import itertoolset
 from chroma.tools import profile_if_possible
 
+import pycuda.driver as cuda
+
 def pick_seed():
     """Returns a seed for a random number generator selected using
     a mixture of the current time and the current process ID."""
     return int(time.time()) ^ (os.getpid() << 16)
 
 class Simulation(object):
-    @profile_if_possible
     def __init__(self, detector, seed=None, cuda_device=None,
                  geant4_processes=4, nthreads_per_block=64, max_blocks=1024):
         self.detector = detector
@@ -122,9 +123,12 @@ class Simulation(object):
         
         return self.gpu_pdf.get_pdfs()
 
-    def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=50, nreps=1, ndaq=1, time_only=True):
+    @profile_if_possible
+    def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=100, nreps=1, ndaq=1, time_only=True):
         """Returns tuple: 1D array of channel hit counts, 1D array of PDF
         probability densities."""
+        gpu_daq = gpu.GPUDaq(self.gpu_geometry, ndaq=ndaq)
+
         self.gpu_pdf.setup_pdf_eval(event_channels.hit,
                                     event_channels.t,
                                     event_channels.q,
@@ -145,10 +149,16 @@ class Simulation(object):
             gpu_photons.propagate(self.gpu_geometry, self.rng_states,
                                   nthreads_per_block=self.nthreads_per_block,
                                   max_blocks=self.max_blocks)
-            for gpu_photon_slice in gpu_photons.iterate_copies():
-                for idaq in xrange(ndaq):
-                    gpu_channels = self.gpu_daq.acquire(gpu_photon_slice, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
-                    self.gpu_pdf.accumulate_pdf_eval(gpu_channels)
+            nphotons = gpu_photons.true_nphotons
+            for i in xrange(gpu_photons.ncopies):
+                start_photon = i * nphotons
+                gpu_photon_slice = gpu_photons.select(event.SURFACE_DETECT,
+                                                           start_photon=start_photon,
+                                                           nphotons=nphotons)
+                if len(gpu_photon_slice) == 0:
+                    continue
+                gpu_channels = gpu_daq.acquire(gpu_photon_slice, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
+                self.gpu_pdf.accumulate_pdf_eval(gpu_channels, 32)
         
         return self.gpu_pdf.get_pdf_eval()
author	Stan Seibert <stan@mtrr.org>	2011-10-21 17:02:11 -0400
committer	tlatorre <tlatorre@uchicago.edu>	2021-05-09 08:42:38 -0700
commit	a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d (patch)
tree	e7ca82bad70c395689ccd4284ff8e527bfa54c8e
parent	ce45e90e90f801bf7b523c97399b1986c8bfa97e (diff)
download	chroma-a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d.tar.gz chroma-a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d.tar.bz2 chroma-a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d.zip