Major overhaul to the way that DAQ and PDF information is accumulated

to speed up likelihood evaluation. When generating a likelihood, the DAQ can be run many times in parallel by the GPU, creating a large block of channel hit information in memory. The PDF accumulator processes that entire block in two passes: * First update the channel hit count, and the count of channel hits falling into the bin around the channel hit time being evaluated. Add any channel hits that should also be included in the n-th nearest neighbor calculation to a channel-specific work queue. * Process all the work queues for each channel and update the list of nearest neighbors. This is hugely faster than what we were doing before. Kernel estimation (or some kind of orthogonal function expansion of the PDF) should be better ultimately, but for now the nearest neighbor approach to PDF estimation seems to be working the best.
author: Stan Seibert <stan@mtrr.org> 2011-10-21 17:02:11 -0400
committer: tlatorre <tlatorre@uchicago.edu> 2021-05-09 08:42:38 -0700
commit: a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d (patch)
tree: e7ca82bad70c395689ccd4284ff8e527bfa54c8e /chroma/cuda
parent: ce45e90e90f801bf7b523c97399b1986c8bfa97e (diff)
download: chroma-a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d.tar.gz
chroma-a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d.tar.bz2
chroma-a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d.zip
2 files changed, 224 insertions, 68 deletions
diff --git a/chroma/cuda/daq.cu b/chroma/cuda/daq.cu
index b995bfb..3ed402d 100644
--- a/chroma/cuda/daq.cu
+++ b/chroma/cuda/daq.cu
@@ -79,6 +79,65 @@ run_daq(curandState *s, unsigned int detection_state,
 }
 
 __global__ void
+run_daq_many(curandState *s, unsigned int detection_state,
+	     int first_photon, int nphotons, float *photon_times,
+	     unsigned int *photon_histories, int *last_hit_triangles,
+	     int *solid_map,
+	     Detector *detector,
+	     unsigned int *earliest_time_int,
+	     unsigned int *channel_q_int, unsigned int *channel_histories,
+	     int ndaq, int channel_stride)
+{
+    __shared__ int photon_id;
+    __shared__ int triangle_id;
+    __shared__ int solid_id;
+    __shared__ int channel_index;
+    __shared__ unsigned int history;
+    __shared__ float photon_time;
+    
+
+    if (threadIdx.x == 0) {
+	photon_id = first_photon + blockIdx.x;
+	triangle_id = last_hit_triangles[photon_id];
+	
+	if (triangle_id > -1) {
+	    solid_id = solid_map[triangle_id];
+	    history = photon_histories[photon_id];
+	    channel_index = detector->solid_id_to_channel_index[solid_id];
+	    photon_time = photon_times[photon_id];
+	}
+    }
+
+    __syncthreads();
+
+    if (triangle_id == -1 || channel_index < 0 || !(history & detection_state))
+      return;
+
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+    curandState rng = s[id];
+
+    for (int i = threadIdx.x; i < ndaq; i += blockDim.x) {
+	int channel_offset = channel_index + i * channel_stride;
+
+	float time = photon_time + curand_normal(&rng) * 1.2f;// + 
+	//sample_cdf(&rng, detector->time_cdf_len,
+	//	       detector->time_cdf_x, detector->time_cdf_y);
+	unsigned int time_int = float_to_sortable_int(time);
+	
+	float charge = 1.0f; //sample_cdf(&rng, detector->charge_cdf_len,
+	//detector->charge_cdf_x,
+	//detector->charge_cdf_y);
+	unsigned int charge_int = roundf(charge / detector->charge_unit);
+	
+	atomicMin(earliest_time_int + channel_offset, time_int);
+	atomicAdd(channel_q_int + channel_offset, charge_int);
+	atomicOr(channel_histories + channel_offset, history);
+    }
+
+    s[id] = rng;
+}
+
+__global__ void
 convert_sortable_int_to_float(int n, unsigned int *sortable_ints,
 			      float *float_output)
 {
diff --git a/chroma/cuda/pdf.cu b/chroma/cuda/pdf.cu
index bf42742..a9a9339 100644
--- a/chroma/cuda/pdf.cu
+++ b/chroma/cuda/pdf.cu
@@ -1,6 +1,8 @@
 // -*-c++-*-
 #include <curand_kernel.h>
 
+#include "sorting.h"
+
 extern "C"
 {
 
@@ -28,101 +30,196 @@ bin_hits(int nchannels, float *channel_q, float *channel_time,
 	pdf[bin] += 1;
     }
 }
-	
+
 __global__ void
-accumulate_pdf_eval(int time_only, int nchannels, unsigned int *event_hit,
-		    float *event_time, float *event_charge, float *mc_time,
-		    float *mc_charge, // quirk of DAQ!
+accumulate_bincount(int nchannels,
+		    int ndaq,
+		    unsigned int *event_hit,
+		    float *event_time, 
+		    float *mc_time,
 		    unsigned int *hitcount, unsigned int *bincount,
 		    float min_twidth, float tmin, float tmax,
-		    float min_qwidth, float qmin, float qmax,
-		    float *nearest_mc, int min_bin_content)
+		    int min_bin_content,
+		    unsigned int *map_channel_id_to_hit_offset,
+		    unsigned int *work_queues)
 {
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
+    int channel_id = threadIdx.x + blockDim.x * blockIdx.x;
 	
-    if (id >= nchannels)
+    if (channel_id >= nchannels)
 	return;
+
+    float channel_hitcount = hitcount[channel_id];
+    float channel_bincount = bincount[channel_id];
+    float channel_event_time = event_time[channel_id];
+    int channel_event_hit = event_hit[channel_id];
+    
+    unsigned int *work_queue = work_queues + map_channel_id_to_hit_offset[channel_id] * (ndaq + 1);
+    unsigned int next_slot = work_queue[0];
 	
-    // Was this channel hit in the Monte Carlo?
-    float channel_mc_time = mc_time[id];
-    if (channel_mc_time >= 1e8)
-	return; // Nothing else to do
+    for (int i=0; i < ndaq; i++) {
+	int read_offset = nchannels * i + channel_id;
+
+	// Was this channel hit in the Monte Carlo?
+	float channel_mc_time = mc_time[read_offset];
+	if (channel_mc_time >= 1e8)
+	    continue; // Nothing else to do
 	
-    // Is this channel inside the range of the PDF?
-    float distance;
-    int channel_bincount = bincount[id];
-    if (time_only) {
+	// Is this channel inside the range of the PDF?
+	float distance;
 	if (channel_mc_time < tmin || channel_mc_time > tmax)
-	    return;  // Nothing else to do
+	    continue;  // Nothing else to do
 		
-	// This MC information is contained in the PDF
-	hitcount[id] += 1;
-	  
+	channel_hitcount += 1;
+
 	// Was this channel hit in the event-of-interest?
-	int channel_event_hit = event_hit[id];
 	if (!channel_event_hit)
-	    return; // No need to update PDF value for unhit channel
-		
+	    continue; // No need to update PDF value for unhit channel
+    
 	// Are we inside the minimum size bin?
-	float channel_event_time = event_time[id];
 	distance = fabsf(channel_mc_time - channel_event_time);
-	if (distance < min_twidth/2.0)
-	    bincount[id] = channel_bincount + 1;
+	if (distance < min_twidth/2.0) {
+	    channel_bincount += 1;
+	}
 
+	// Add this hit to the work queue if we also need to sort it into the 
+	// nearest_mc_list
+	if (channel_bincount < min_bin_content) {
+	    work_queue[next_slot] = read_offset;
+	    next_slot++;
+	}
     }
-    else { // time and charge PDF
-	float channel_mc_charge = mc_charge[id]; // int->float conversion because DAQ just returns an integer
+
+    hitcount[channel_id] = channel_hitcount;
+    bincount[channel_id] = channel_bincount;
+    if (channel_event_hit)
+	work_queue[0] = next_slot;
+}
 	
-	if (channel_mc_time < tmin || channel_mc_time > tmax ||
-	    channel_mc_charge < qmin || channel_mc_charge > qmax)
-	    return;  // Nothing else to do
-		
-	// This MC information is contained in the PDF
-	hitcount[id] += 1;
-	  
-	// Was this channel hit in the event-of-interest?
-	int channel_event_hit = event_hit[id];
-	if (!channel_event_hit)
-	    return; // No need to update PDF value for unhit channel
-		
-	// Are we inside the minimum size bin?
-	float channel_event_time = event_time[id];
-	float channel_event_charge = event_charge[id];
-	float normalized_time_distance = fabsf(channel_event_time - channel_mc_time)/min_twidth/2.0;
-	float normalized_charge_distance = fabsf(channel_event_charge - channel_mc_charge)/min_qwidth/2.0;
-	distance = sqrt(normalized_time_distance*normalized_time_distance + normalized_charge_distance*normalized_charge_distance);
+__global__ void
+accumulate_nearest_neighbor(int nhit,
+			    int ndaq,
+			    unsigned int *map_hit_offset_to_channel_id,
+                            unsigned int *work_queues,
+			    float *event_time,
+			    float *mc_time,
+			    float *nearest_mc, int min_bin_content)
+{
+    int hit_id = threadIdx.x + blockDim.x * blockIdx.x;
+    
+    if (hit_id >= nhit)
+	return;
+
+    unsigned int *work_queue = work_queues + hit_id * (ndaq + 1);
+    int queue_items = work_queue[0] - 1;
+
+    int channel_id = map_hit_offset_to_channel_id[hit_id];
+    float channel_event_time = event_time[channel_id];
+    
+    float distance_table[1000];
+    int distance_table_len = 0;
+
+    // Load existing distance table
+    int offset = min_bin_content * hit_id;    
+    for (int i=0; i < min_bin_content; i++) {
+      float d = nearest_mc[offset + i];
+      if (d > 1e8)
+	break;
 
-	if (distance < 1.0f)
-	    bincount[id] = channel_bincount + 1;
+      distance_table[distance_table_len] = d;
+      distance_table_len++;
     }
+    
+    // append new entries
+    for (int i=0; i < queue_items; i++) {
+	unsigned int read_offset = work_queue[i+1];
+	float channel_mc_time = mc_time[read_offset];
+	float distance = fabsf(channel_mc_time - channel_event_time);
 
-    // Do we need to keep updating the nearest_mc list?
-    if (channel_bincount >= min_bin_content)
-      return; // No need to perform insertion into nearest_mc because bincount is a better estimate of the PDF value
+	distance_table[distance_table_len] = distance;
+	distance_table_len++;
+    }
 
-    // insertion sort the distance into the array of nearest MC points
-    int offset = min_bin_content * id;
-    int entry = min_bin_content - 1;
-	
-    // If last entry less than new entry, nothing to update
-    if (distance > nearest_mc[offset + entry])
-	return;
+    // Sort table
+    piksrt(distance_table_len, distance_table);
+    
+    // Copy first section of table back out to global memory
+    distance_table_len = min(distance_table_len, min_bin_content);
+    for (int i=0; i < distance_table_len; i++) {
+      nearest_mc[offset + i] = distance_table[i];
+    }
+}
+
+__global__ void
+accumulate_nearest_neighbor_block(int nhit,
+			    int ndaq,
+			    unsigned int *map_hit_offset_to_channel_id,
+                            unsigned int *work_queues,
+			    float *event_time,
+			    float *mc_time,
+			    float *nearest_mc, int min_bin_content)
+{
+    int hit_id = blockIdx.x;
+    
+    __shared__ float distance_table[1000];
+    __shared__ unsigned int *work_queue;
+    __shared__ int queue_items;
+    __shared__ int channel_id;
+    __shared__ float channel_event_time;
+    __shared__ int distance_table_len;
+    __shared__ int offset;
+
+    if (threadIdx.x == 0) {
+      work_queue = work_queues + hit_id * (ndaq + 1);
+      queue_items = work_queue[0] - 1;
+
+      channel_id = map_hit_offset_to_channel_id[hit_id];
+      channel_event_time = event_time[channel_id];
+      distance_table_len = min_bin_content;
+      offset = min_bin_content * hit_id;    
+    }
+
+    __syncthreads();
 
-    // Find where to insert the new entry while sliding the rest
-    // to the right
-    entry--;
-    while (entry >= 0) {
-	if (nearest_mc[offset+entry] >= distance)
-	    nearest_mc[offset+entry+1] = nearest_mc[offset+entry];
-	else
-	    break;
-	entry--;
+    // Load existing distance table
+    for (int i=threadIdx.x; i < min_bin_content; i += blockDim.x) {
+      float d = nearest_mc[offset + i];
+      if (d > 1e8) {
+	atomicMin(&distance_table_len, i);
+	break;
+      }
+      distance_table[i] = d;
     }
+    
+    __syncthreads();
 
-    nearest_mc[offset+entry+1] = distance;
+    // append new entries
+    for (int i=threadIdx.x; i < queue_items; i += blockDim.x) {
+	unsigned int read_offset = work_queue[i+1];
+	float channel_mc_time = mc_time[read_offset];
+	float distance = fabsf(channel_mc_time - channel_event_time);
+
+	distance_table[distance_table_len + i] = distance;
+    }
+
+    __syncthreads();
+
+    if (threadIdx.x == 0) {
+      distance_table_len += queue_items;
+      // Sort table
+      piksrt(distance_table_len, distance_table);
+      // Copy first section of table back out to global memory
+      distance_table_len = min(distance_table_len, min_bin_content);
+    }
+    
+    __syncthreads();
+
+    for (int i=threadIdx.x; i < distance_table_len; i += blockDim.x) {
+      nearest_mc[offset + i] = distance_table[i];
+    }
 }
 
 
+
 __global__ void
 accumulate_moments(int time_only, int nchannels,
 		   float *mc_time,
author	Stan Seibert <stan@mtrr.org>	2011-10-21 17:02:11 -0400
committer	tlatorre <tlatorre@uchicago.edu>	2021-05-09 08:42:38 -0700
commit	a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d (patch)
tree	e7ca82bad70c395689ccd4284ff8e527bfa54c8e /chroma/cuda
parent	ce45e90e90f801bf7b523c97399b1986c8bfa97e (diff)
download	chroma-a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d.tar.gz chroma-a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d.tar.bz2 chroma-a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d.zip