diff options
author | Stan Seibert <stan@mtrr.org> | 2011-10-21 17:02:11 -0400 |
---|---|---|
committer | tlatorre <tlatorre@uchicago.edu> | 2021-05-09 08:42:38 -0700 |
commit | a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d (patch) | |
tree | e7ca82bad70c395689ccd4284ff8e527bfa54c8e | |
parent | ce45e90e90f801bf7b523c97399b1986c8bfa97e (diff) | |
download | chroma-a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d.tar.gz chroma-a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d.tar.bz2 chroma-a6ceb716ddcb6a7f5dc183ca63f0b258fc90fc2d.zip |
Major overhaul to the way that DAQ and PDF information is accumulated
to speed up likelihood evaluation.
When generating a likelihood, the DAQ can be run many times in parallel
by the GPU, creating a large block of channel hit information in memory.
The PDF accumulator processes that entire block in two passes:
* First update the channel hit count, and the count of channel hits
falling into the bin around the channel hit time being evaluated.
Add any channel hits that should also be included in the n-th
nearest neighbor calculation to a channel-specific work queue.
* Process all the work queues for each channel and update the
list of nearest neighbors.
This is hugely faster than what we were doing before. Kernel
estimation (or some kind of orthogonal function expansion of the PDF)
should be better ultimately, but for now the nearest neighbor approach
to PDF estimation seems to be working the best.
-rwxr-xr-x | chroma/benchmark.py | 12 | ||||
-rw-r--r-- | chroma/cuda/daq.cu | 59 | ||||
-rw-r--r-- | chroma/cuda/pdf.cu | 233 | ||||
-rw-r--r-- | chroma/gpu/daq.py | 74 | ||||
-rw-r--r-- | chroma/gpu/pdf.py | 60 | ||||
-rw-r--r-- | chroma/likelihood.py | 11 | ||||
-rw-r--r-- | chroma/sim.py | 22 |
7 files changed, 349 insertions, 122 deletions
diff --git a/chroma/benchmark.py b/chroma/benchmark.py index 44cf44b..06a8eba 100755 --- a/chroma/benchmark.py +++ b/chroma/benchmark.py @@ -146,6 +146,7 @@ def pdf(gpu_detector, npdfs=10, nevents=100, nreps=16, ndaq=1, return nevents*nreps*ndaq/ufloat((np.mean(run_times),np.std(run_times))) +@tools.profile_if_possible def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128, nthreads_per_block=64, max_blocks=1024): """ @@ -179,6 +180,7 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128, data_ev_channels = gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks).get() # Setup PDF evaluation + gpu_daq = gpu.GPUDaq(gpu_detector, ndaq=ndaq) gpu_pdf = gpu.GPUPDF() gpu_pdf.setup_pdf_eval(data_ev_channels.hit, data_ev_channels.t, @@ -205,10 +207,10 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128, gpu_photons.propagate(gpu_detector, rng_states, nthreads_per_block, max_blocks) for gpu_photon_slice in gpu_photons.iterate_copies(): - for idaq in xrange(ndaq): - gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states, - nthreads_per_block, max_blocks) - gpu_pdf.accumulate_pdf_eval(gpu_channels, nthreads_per_block) + gpu_photon_slice = gpu_photon_slice.select(event.SURFACE_DETECT) + gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states, + nthreads_per_block, max_blocks) + gpu_pdf.accumulate_pdf_eval(gpu_channels, nthreads_per_block) cuda.Context.get_current().synchronize() elapsed = time.time() - t0 @@ -224,6 +226,8 @@ if __name__ == '__main__': from chroma import demo import gc + tools.enable_debug_on_crash() + # Default to run all tests tests = ['ray', 'load', 'propagate', 'pdf', 'pdf_eval'] if len(sys.argv) > 1: diff --git a/chroma/cuda/daq.cu b/chroma/cuda/daq.cu index b995bfb..3ed402d 100644 --- a/chroma/cuda/daq.cu +++ b/chroma/cuda/daq.cu @@ -79,6 +79,65 @@ run_daq(curandState *s, unsigned int detection_state, } __global__ void +run_daq_many(curandState *s, unsigned int detection_state, + int first_photon, int nphotons, float *photon_times, + unsigned int *photon_histories, int *last_hit_triangles, + int *solid_map, + Detector *detector, + unsigned int *earliest_time_int, + unsigned int *channel_q_int, unsigned int *channel_histories, + int ndaq, int channel_stride) +{ + __shared__ int photon_id; + __shared__ int triangle_id; + __shared__ int solid_id; + __shared__ int channel_index; + __shared__ unsigned int history; + __shared__ float photon_time; + + + if (threadIdx.x == 0) { + photon_id = first_photon + blockIdx.x; + triangle_id = last_hit_triangles[photon_id]; + + if (triangle_id > -1) { + solid_id = solid_map[triangle_id]; + history = photon_histories[photon_id]; + channel_index = detector->solid_id_to_channel_index[solid_id]; + photon_time = photon_times[photon_id]; + } + } + + __syncthreads(); + + if (triangle_id == -1 || channel_index < 0 || !(history & detection_state)) + return; + + int id = threadIdx.x + blockDim.x * blockIdx.x; + curandState rng = s[id]; + + for (int i = threadIdx.x; i < ndaq; i += blockDim.x) { + int channel_offset = channel_index + i * channel_stride; + + float time = photon_time + curand_normal(&rng) * 1.2f;// + + //sample_cdf(&rng, detector->time_cdf_len, + // detector->time_cdf_x, detector->time_cdf_y); + unsigned int time_int = float_to_sortable_int(time); + + float charge = 1.0f; //sample_cdf(&rng, detector->charge_cdf_len, + //detector->charge_cdf_x, + //detector->charge_cdf_y); + unsigned int charge_int = roundf(charge / detector->charge_unit); + + atomicMin(earliest_time_int + channel_offset, time_int); + atomicAdd(channel_q_int + channel_offset, charge_int); + atomicOr(channel_histories + channel_offset, history); + } + + s[id] = rng; +} + +__global__ void convert_sortable_int_to_float(int n, unsigned int *sortable_ints, float *float_output) { diff --git a/chroma/cuda/pdf.cu b/chroma/cuda/pdf.cu index bf42742..a9a9339 100644 --- a/chroma/cuda/pdf.cu +++ b/chroma/cuda/pdf.cu @@ -1,6 +1,8 @@ // -*-c++-*- #include <curand_kernel.h> +#include "sorting.h" + extern "C" { @@ -28,101 +30,196 @@ bin_hits(int nchannels, float *channel_q, float *channel_time, pdf[bin] += 1; } } - + __global__ void -accumulate_pdf_eval(int time_only, int nchannels, unsigned int *event_hit, - float *event_time, float *event_charge, float *mc_time, - float *mc_charge, // quirk of DAQ! +accumulate_bincount(int nchannels, + int ndaq, + unsigned int *event_hit, + float *event_time, + float *mc_time, unsigned int *hitcount, unsigned int *bincount, float min_twidth, float tmin, float tmax, - float min_qwidth, float qmin, float qmax, - float *nearest_mc, int min_bin_content) + int min_bin_content, + unsigned int *map_channel_id_to_hit_offset, + unsigned int *work_queues) { - int id = threadIdx.x + blockDim.x * blockIdx.x; + int channel_id = threadIdx.x + blockDim.x * blockIdx.x; - if (id >= nchannels) + if (channel_id >= nchannels) return; + + float channel_hitcount = hitcount[channel_id]; + float channel_bincount = bincount[channel_id]; + float channel_event_time = event_time[channel_id]; + int channel_event_hit = event_hit[channel_id]; + + unsigned int *work_queue = work_queues + map_channel_id_to_hit_offset[channel_id] * (ndaq + 1); + unsigned int next_slot = work_queue[0]; - // Was this channel hit in the Monte Carlo? - float channel_mc_time = mc_time[id]; - if (channel_mc_time >= 1e8) - return; // Nothing else to do + for (int i=0; i < ndaq; i++) { + int read_offset = nchannels * i + channel_id; + + // Was this channel hit in the Monte Carlo? + float channel_mc_time = mc_time[read_offset]; + if (channel_mc_time >= 1e8) + continue; // Nothing else to do - // Is this channel inside the range of the PDF? - float distance; - int channel_bincount = bincount[id]; - if (time_only) { + // Is this channel inside the range of the PDF? + float distance; if (channel_mc_time < tmin || channel_mc_time > tmax) - return; // Nothing else to do + continue; // Nothing else to do - // This MC information is contained in the PDF - hitcount[id] += 1; - + channel_hitcount += 1; + // Was this channel hit in the event-of-interest? - int channel_event_hit = event_hit[id]; if (!channel_event_hit) - return; // No need to update PDF value for unhit channel - + continue; // No need to update PDF value for unhit channel + // Are we inside the minimum size bin? - float channel_event_time = event_time[id]; distance = fabsf(channel_mc_time - channel_event_time); - if (distance < min_twidth/2.0) - bincount[id] = channel_bincount + 1; + if (distance < min_twidth/2.0) { + channel_bincount += 1; + } + // Add this hit to the work queue if we also need to sort it into the + // nearest_mc_list + if (channel_bincount < min_bin_content) { + work_queue[next_slot] = read_offset; + next_slot++; + } } - else { // time and charge PDF - float channel_mc_charge = mc_charge[id]; // int->float conversion because DAQ just returns an integer + + hitcount[channel_id] = channel_hitcount; + bincount[channel_id] = channel_bincount; + if (channel_event_hit) + work_queue[0] = next_slot; +} - if (channel_mc_time < tmin || channel_mc_time > tmax || - channel_mc_charge < qmin || channel_mc_charge > qmax) - return; // Nothing else to do - - // This MC information is contained in the PDF - hitcount[id] += 1; - - // Was this channel hit in the event-of-interest? - int channel_event_hit = event_hit[id]; - if (!channel_event_hit) - return; // No need to update PDF value for unhit channel - - // Are we inside the minimum size bin? - float channel_event_time = event_time[id]; - float channel_event_charge = event_charge[id]; - float normalized_time_distance = fabsf(channel_event_time - channel_mc_time)/min_twidth/2.0; - float normalized_charge_distance = fabsf(channel_event_charge - channel_mc_charge)/min_qwidth/2.0; - distance = sqrt(normalized_time_distance*normalized_time_distance + normalized_charge_distance*normalized_charge_distance); +__global__ void +accumulate_nearest_neighbor(int nhit, + int ndaq, + unsigned int *map_hit_offset_to_channel_id, + unsigned int *work_queues, + float *event_time, + float *mc_time, + float *nearest_mc, int min_bin_content) +{ + int hit_id = threadIdx.x + blockDim.x * blockIdx.x; + + if (hit_id >= nhit) + return; + + unsigned int *work_queue = work_queues + hit_id * (ndaq + 1); + int queue_items = work_queue[0] - 1; + + int channel_id = map_hit_offset_to_channel_id[hit_id]; + float channel_event_time = event_time[channel_id]; + + float distance_table[1000]; + int distance_table_len = 0; + + // Load existing distance table + int offset = min_bin_content * hit_id; + for (int i=0; i < min_bin_content; i++) { + float d = nearest_mc[offset + i]; + if (d > 1e8) + break; - if (distance < 1.0f) - bincount[id] = channel_bincount + 1; + distance_table[distance_table_len] = d; + distance_table_len++; } + + // append new entries + for (int i=0; i < queue_items; i++) { + unsigned int read_offset = work_queue[i+1]; + float channel_mc_time = mc_time[read_offset]; + float distance = fabsf(channel_mc_time - channel_event_time); - // Do we need to keep updating the nearest_mc list? - if (channel_bincount >= min_bin_content) - return; // No need to perform insertion into nearest_mc because bincount is a better estimate of the PDF value + distance_table[distance_table_len] = distance; + distance_table_len++; + } - // insertion sort the distance into the array of nearest MC points - int offset = min_bin_content * id; - int entry = min_bin_content - 1; - - // If last entry less than new entry, nothing to update - if (distance > nearest_mc[offset + entry]) - return; + // Sort table + piksrt(distance_table_len, distance_table); + + // Copy first section of table back out to global memory + distance_table_len = min(distance_table_len, min_bin_content); + for (int i=0; i < distance_table_len; i++) { + nearest_mc[offset + i] = distance_table[i]; + } +} + +__global__ void +accumulate_nearest_neighbor_block(int nhit, + int ndaq, + unsigned int *map_hit_offset_to_channel_id, + unsigned int *work_queues, + float *event_time, + float *mc_time, + float *nearest_mc, int min_bin_content) +{ + int hit_id = blockIdx.x; + + __shared__ float distance_table[1000]; + __shared__ unsigned int *work_queue; + __shared__ int queue_items; + __shared__ int channel_id; + __shared__ float channel_event_time; + __shared__ int distance_table_len; + __shared__ int offset; + + if (threadIdx.x == 0) { + work_queue = work_queues + hit_id * (ndaq + 1); + queue_items = work_queue[0] - 1; + + channel_id = map_hit_offset_to_channel_id[hit_id]; + channel_event_time = event_time[channel_id]; + distance_table_len = min_bin_content; + offset = min_bin_content * hit_id; + } + + __syncthreads(); - // Find where to insert the new entry while sliding the rest - // to the right - entry--; - while (entry >= 0) { - if (nearest_mc[offset+entry] >= distance) - nearest_mc[offset+entry+1] = nearest_mc[offset+entry]; - else - break; - entry--; + // Load existing distance table + for (int i=threadIdx.x; i < min_bin_content; i += blockDim.x) { + float d = nearest_mc[offset + i]; + if (d > 1e8) { + atomicMin(&distance_table_len, i); + break; + } + distance_table[i] = d; } + + __syncthreads(); - nearest_mc[offset+entry+1] = distance; + // append new entries + for (int i=threadIdx.x; i < queue_items; i += blockDim.x) { + unsigned int read_offset = work_queue[i+1]; + float channel_mc_time = mc_time[read_offset]; + float distance = fabsf(channel_mc_time - channel_event_time); + + distance_table[distance_table_len + i] = distance; + } + + __syncthreads(); + + if (threadIdx.x == 0) { + distance_table_len += queue_items; + // Sort table + piksrt(distance_table_len, distance_table); + // Copy first section of table back out to global memory + distance_table_len = min(distance_table_len, min_bin_content); + } + + __syncthreads(); + + for (int i=threadIdx.x; i < distance_table_len; i += blockDim.x) { + nearest_mc[offset + i] = distance_table[i]; + } } + __global__ void accumulate_moments(int time_only, int nchannels, float *mc_time, diff --git a/chroma/gpu/daq.py b/chroma/gpu/daq.py index 571feba..ec5add4 100644 --- a/chroma/gpu/daq.py +++ b/chroma/gpu/daq.py @@ -1,15 +1,27 @@ import numpy as np from pycuda import gpuarray as ga +from pycuda import driver as cuda from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, \ chunk_iterator from chroma import event class GPUChannels(object): - def __init__(self, t, q, flags): + def __init__(self, t, q, flags, ndaq=1, stride=None): self.t = t self.q = q self.flags = flags + self.ndaq = ndaq + if stride is None: + self.stride = len(t) + else: + self.stride = stride + + def iterate_copies(self): + for i in xrange(self.ndaq): + yield GPUChannels(self.t[i*self.stride:(i+1)*self.stride], + self.q[i*self.stride:(i+1)*self.stride], + self.flags[i*self.stride:(i+1)*self.stride]) def get(self): t = self.t.get() @@ -19,10 +31,13 @@ class GPUChannels(object): # enough hit time were hit. return event.Channels(t<1e8, t, q, self.flags.get()) + def __len__(self): + return self.t.size + class GPUDaq(object): - def __init__(self, gpu_detector): - self.earliest_time_gpu = ga.empty(gpu_detector.nchannels, dtype=np.float32) - self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels, dtype=np.uint32) + def __init__(self, gpu_detector, ndaq=1): + self.earliest_time_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.float32) + self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.uint32) self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu) self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu) self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu), dtype=np.float32) @@ -33,29 +48,50 @@ class GPUDaq(object): self.module = get_cu_module('daq.cu', options=cuda_options, include_source_directory=True) self.gpu_funcs = GPUFuncs(self.module) + self.ndaq = ndaq + self.stride = gpu_detector.nchannels - def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024): - self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) + def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None): + self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(64,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) self.channel_q_int_gpu.fill(0) self.channel_q_gpu.fill(0) self.channel_history_gpu.fill(0) - n = len(gpuphotons.pos) - - for first_photon, photons_this_round, blocks in \ - chunk_iterator(n, nthreads_per_block, max_blocks): - self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), - np.int32(first_photon), np.int32(photons_this_round), gpuphotons.t, - gpuphotons.flags, gpuphotons.last_hit_triangles, - self.solid_id_map_gpu, - self.detector_gpu, - self.earliest_time_int_gpu, - self.channel_q_int_gpu, self.channel_history_gpu, - block=(nthreads_per_block,1,1), grid=(blocks,1)) + if start_photon is None: + start_photon = 0 + if nphotons is None: + nphotons = len(gpuphotons.pos) - start_photon + if self.ndaq == 1: + for first_photon, photons_this_round, blocks in \ + chunk_iterator(nphotons, nthreads_per_block, max_blocks): + self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), + np.int32(start_photon+first_photon), np.int32(photons_this_round), gpuphotons.t, + gpuphotons.flags, gpuphotons.last_hit_triangles, + self.solid_id_map_gpu, + self.detector_gpu, + self.earliest_time_int_gpu, + self.channel_q_int_gpu, self.channel_history_gpu, + block=(nthreads_per_block,1,1), grid=(blocks,1)) + else: + for first_photon, photons_this_round, blocks in \ + chunk_iterator(nphotons, 1, max_blocks): + self.gpu_funcs.run_daq_many(rng_states, np.uint32(0x1 << 2), + np.int32(start_photon), np.int32(photons_this_round), gpuphotons.t, + gpuphotons.flags, gpuphotons.last_hit_triangles, + self.solid_id_map_gpu, + self.detector_gpu, + self.earliest_time_int_gpu, + self.channel_q_int_gpu, self.channel_history_gpu, + np.int32(self.ndaq), np.int32(self.stride), + block=(nthreads_per_block,1,1), grid=(blocks,1)) + cuda.Context.get_current().synchronize() + self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) self.gpu_funcs.convert_charge_int_to_float(self.detector_gpu, self.channel_q_int_gpu, self.channel_q_gpu, block=(nthreads_per_block,1,1), grid=(len(self.channel_q_int_gpu)//nthreads_per_block+1,1)) - return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu) + cuda.Context.get_current().synchronize() + + return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu, self.ndaq, self.stride) diff --git a/chroma/gpu/pdf.py b/chroma/gpu/pdf.py index bd3c448..1b6340f 100644 --- a/chroma/gpu/pdf.py +++ b/chroma/gpu/pdf.py @@ -1,12 +1,13 @@ import numpy as np from pycuda import gpuarray as ga import pycuda.driver as cuda -from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs +from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, chunk_iterator +from chroma.tools import profile_if_possible class GPUKernelPDF(object): def __init__(self): self.module = get_cu_module('pdf.cu', options=cuda_options, - include_source_directory=False) + include_source_directory=True) self.gpu_funcs = GPUFuncs(self.module) def setup_moments(self, nchannels, trange, qrange, time_only=True): @@ -56,7 +57,6 @@ class GPUKernelPDF(object): self.qmom2_gpu, block=(nthreads_per_block,1,1), grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) - cuda def compute_bandwidth(self, event_hit, event_time, event_charge, scale_factor=1.0): @@ -177,7 +177,7 @@ class GPUKernelPDF(object): class GPUPDF(object): def __init__(self): self.module = get_cu_module('pdf.cu', options=cuda_options, - include_source_directory=False) + include_source_directory=True) self.gpu_funcs = GPUFuncs(self.module) def setup_pdf(self, nchannels, tbins, trange, qbins, qrange): @@ -259,21 +259,31 @@ class GPUPDF(object): time_only: bool If True, only the time observable will be used in the PDF. """ + self.event_nhit = np.count_nonzero(event_hit) + + # Define a mapping from an array of len(event_hit) to an array of length event_nhit + self.map_hit_offset_to_channel_id = np.where(event_hit)[0].astype(np.uint32) + self.map_hit_offset_to_channel_id_gpu = ga.to_gpu(self.map_hit_offset_to_channel_id) + self.map_channel_id_to_hit_offset = np.maximum(0, event_hit.cumsum() - 1).astype(np.uint32) + self.map_channel_id_to_hit_offset_gpu = ga.to_gpu(self.map_channel_id_to_hit_offset) + self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32)) self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32)) self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32)) self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) - self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content, + self.nearest_mc_gpu = ga.empty(shape=self.event_nhit * min_bin_content, dtype=np.float32) self.nearest_mc_gpu.fill(1e9) - + self.min_twidth = min_twidth self.trange = trange self.min_qwidth = min_qwidth self.qrange = qrange self.min_bin_content = min_bin_content + + assert time_only # Only support time right now self.time_only = time_only def clear_pdf_eval(self): @@ -282,27 +292,38 @@ class GPUPDF(object): self.eval_bincount_gpu.fill(0) self.nearest_mc_gpu.fill(1e9) - def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64): + @profile_if_possible + def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64, max_blocks=10000): "Add the most recent results of run_daq() to the PDF evaluation." - self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only), - np.int32(len(self.event_hit_gpu)), + self.work_queues = ga.empty(shape=self.event_nhit * (gpuchannels.ndaq+1), dtype=np.uint32) + self.work_queues.fill(1) + + self.gpu_funcs.accumulate_bincount(np.int32(self.event_hit_gpu.size), + np.int32(gpuchannels.ndaq), self.event_hit_gpu, self.event_time_gpu, - self.event_charge_gpu, gpuchannels.t, - gpuchannels.q, self.eval_hitcount_gpu, self.eval_bincount_gpu, np.float32(self.min_twidth), np.float32(self.trange[0]), np.float32(self.trange[1]), - np.float32(self.min_qwidth), - np.float32(self.qrange[0]), - np.float32(self.qrange[1]), - self.nearest_mc_gpu, np.int32(self.min_bin_content), + self.map_channel_id_to_hit_offset_gpu, + self.work_queues, block=(nthreads_per_block,1,1), - grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) + grid=(self.event_hit_gpu.size//nthreads_per_block+1,1)) + self.gpu_funcs.accumulate_nearest_neighbor_block(np.int32(self.event_nhit), + np.int32(gpuchannels.ndaq), + self.map_hit_offset_to_channel_id_gpu, + self.work_queues, + self.event_time_gpu, + gpuchannels.t, + self.nearest_mc_gpu, + np.int32(self.min_bin_content), + block=(nthreads_per_block,1,1), + grid=(self.event_nhit,1)) + cuda.Context.get_current().synchronize() def get_pdf_eval(self): evhit = self.event_hit_gpu.get().astype(bool) @@ -325,14 +346,16 @@ class GPUPDF(object): # PDF value for low stats bins low_stats = ~high_stats & (hitcount > 0) & evhit - nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content)) + nearest_mc_by_hit = self.nearest_mc_gpu.get().reshape((self.event_nhit, self.min_bin_content)) + nearest_mc = np.empty(shape=(len(hitcount), self.min_bin_content), dtype=np.float32) + nearest_mc.fill(1e9) + nearest_mc[self.map_hit_offset_to_channel_id,:] = nearest_mc_by_hit # Deal with the case where we did not even get min_bin_content events # in the PDF but also clamp the lower range to ensure we don't index # by a negative number in 2 lines last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1) distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry] - if low_stats.any(): if self.time_only: pdf_value[low_stats] = (last_valid_entry[low_stats] + 1).astype(float) / hitcount[low_stats] / distance[low_stats] / 2.0 @@ -344,5 +367,4 @@ class GPUPDF(object): # PDFs with no stats got zero by default during array creation print 'high_stats:', high_stats.sum(), 'low_stats', low_stats.sum() - return hitcount, pdf_value, pdf_value * pdf_frac_uncert diff --git a/chroma/likelihood.py b/chroma/likelihood.py index 5e8e678..a29dc6c 100644 --- a/chroma/likelihood.py +++ b/chroma/likelihood.py @@ -57,11 +57,12 @@ class Likelihood(object): hitcount, pdf_prob, pdf_prob_uncert = \ self.sim.eval_pdf(self.event.channels, vertex_generator, - 0.1, self.trange, + 0.2, self.trange, 1, self.qrange, nreps=nreps, ndaq=ndaq, - time_only=self.time_only) + time_only=self.time_only, + min_bin_content=ndaq*10) # Normalize probabilities and put a floor to keep the log finite hit_prob = hitcount.astype(np.float32) / ntotal @@ -80,7 +81,6 @@ class Likelihood(object): return hit_prob, pdf_prob, pdf_prob_uncert - @profile_if_possible def eval(self, vertex_generator, nevals, nreps=16, ndaq=50): """ Return the negative log likelihood that the detector event set in the @@ -100,9 +100,8 @@ class Likelihood(object): # Then include the probability densities of the observed # charges and times. - hit_pdf_ufloat = unumpy.uarray((pdf_prob[self.event.channels.hit], - pdf_prob_uncert[self.event.channels.hit])) - log_likelihood += unumpy.log(hit_pdf_ufloat).sum() + log_likelihood += ufloat((np.log(pdf_prob[self.event.channels.hit]).sum(), + 0.0)) return -log_likelihood diff --git a/chroma/sim.py b/chroma/sim.py index 93bcdfb..a82f1f8 100644 --- a/chroma/sim.py +++ b/chroma/sim.py @@ -9,13 +9,14 @@ from chroma import event from chroma import itertoolset from chroma.tools import profile_if_possible +import pycuda.driver as cuda + def pick_seed(): """Returns a seed for a random number generator selected using a mixture of the current time and the current process ID.""" return int(time.time()) ^ (os.getpid() << 16) class Simulation(object): - @profile_if_possible def __init__(self, detector, seed=None, cuda_device=None, geant4_processes=4, nthreads_per_block=64, max_blocks=1024): self.detector = detector @@ -122,9 +123,12 @@ class Simulation(object): return self.gpu_pdf.get_pdfs() - def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=50, nreps=1, ndaq=1, time_only=True): + @profile_if_possible + def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=100, nreps=1, ndaq=1, time_only=True): """Returns tuple: 1D array of channel hit counts, 1D array of PDF probability densities.""" + gpu_daq = gpu.GPUDaq(self.gpu_geometry, ndaq=ndaq) + self.gpu_pdf.setup_pdf_eval(event_channels.hit, event_channels.t, event_channels.q, @@ -145,10 +149,16 @@ class Simulation(object): gpu_photons.propagate(self.gpu_geometry, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks) - for gpu_photon_slice in gpu_photons.iterate_copies(): - for idaq in xrange(ndaq): - gpu_channels = self.gpu_daq.acquire(gpu_photon_slice, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks) - self.gpu_pdf.accumulate_pdf_eval(gpu_channels) + nphotons = gpu_photons.true_nphotons + for i in xrange(gpu_photons.ncopies): + start_photon = i * nphotons + gpu_photon_slice = gpu_photons.select(event.SURFACE_DETECT, + start_photon=start_photon, + nphotons=nphotons) + if len(gpu_photon_slice) == 0: + continue + gpu_channels = gpu_daq.acquire(gpu_photon_slice, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks) + self.gpu_pdf.accumulate_pdf_eval(gpu_channels, 32) return self.gpu_pdf.get_pdf_eval() |