diff options
-rwxr-xr-x | chroma/benchmark.py | 12 | ||||
-rw-r--r-- | chroma/cuda/daq.cu | 59 | ||||
-rw-r--r-- | chroma/cuda/pdf.cu | 233 | ||||
-rw-r--r-- | chroma/gpu/daq.py | 74 | ||||
-rw-r--r-- | chroma/gpu/pdf.py | 60 | ||||
-rw-r--r-- | chroma/likelihood.py | 11 | ||||
-rw-r--r-- | chroma/sim.py | 22 |
7 files changed, 349 insertions, 122 deletions
diff --git a/chroma/benchmark.py b/chroma/benchmark.py index 44cf44b..06a8eba 100755 --- a/chroma/benchmark.py +++ b/chroma/benchmark.py @@ -146,6 +146,7 @@ def pdf(gpu_detector, npdfs=10, nevents=100, nreps=16, ndaq=1, return nevents*nreps*ndaq/ufloat((np.mean(run_times),np.std(run_times))) +@tools.profile_if_possible def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128, nthreads_per_block=64, max_blocks=1024): """ @@ -179,6 +180,7 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128, data_ev_channels = gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks).get() # Setup PDF evaluation + gpu_daq = gpu.GPUDaq(gpu_detector, ndaq=ndaq) gpu_pdf = gpu.GPUPDF() gpu_pdf.setup_pdf_eval(data_ev_channels.hit, data_ev_channels.t, @@ -205,10 +207,10 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128, gpu_photons.propagate(gpu_detector, rng_states, nthreads_per_block, max_blocks) for gpu_photon_slice in gpu_photons.iterate_copies(): - for idaq in xrange(ndaq): - gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states, - nthreads_per_block, max_blocks) - gpu_pdf.accumulate_pdf_eval(gpu_channels, nthreads_per_block) + gpu_photon_slice = gpu_photon_slice.select(event.SURFACE_DETECT) + gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states, + nthreads_per_block, max_blocks) + gpu_pdf.accumulate_pdf_eval(gpu_channels, nthreads_per_block) cuda.Context.get_current().synchronize() elapsed = time.time() - t0 @@ -224,6 +226,8 @@ if __name__ == '__main__': from chroma import demo import gc + tools.enable_debug_on_crash() + # Default to run all tests tests = ['ray', 'load', 'propagate', 'pdf', 'pdf_eval'] if len(sys.argv) > 1: diff --git a/chroma/cuda/daq.cu b/chroma/cuda/daq.cu index b995bfb..3ed402d 100644 --- a/chroma/cuda/daq.cu +++ b/chroma/cuda/daq.cu @@ -79,6 +79,65 @@ run_daq(curandState *s, unsigned int detection_state, } __global__ void +run_daq_many(curandState *s, unsigned int detection_state, + int first_photon, int nphotons, float *photon_times, + unsigned int *photon_histories, int *last_hit_triangles, + int *solid_map, + Detector *detector, + unsigned int *earliest_time_int, + unsigned int *channel_q_int, unsigned int *channel_histories, + int ndaq, int channel_stride) +{ + __shared__ int photon_id; + __shared__ int triangle_id; + __shared__ int solid_id; + __shared__ int channel_index; + __shared__ unsigned int history; + __shared__ float photon_time; + + + if (threadIdx.x == 0) { + photon_id = first_photon + blockIdx.x; + triangle_id = last_hit_triangles[photon_id]; + + if (triangle_id > -1) { + solid_id = solid_map[triangle_id]; + history = photon_histories[photon_id]; + channel_index = detector->solid_id_to_channel_index[solid_id]; + photon_time = photon_times[photon_id]; + } + } + + __syncthreads(); + + if (triangle_id == -1 || channel_index < 0 || !(history & detection_state)) + return; + + int id = threadIdx.x + blockDim.x * blockIdx.x; + curandState rng = s[id]; + + for (int i = threadIdx.x; i < ndaq; i += blockDim.x) { + int channel_offset = channel_index + i * channel_stride; + + float time = photon_time + curand_normal(&rng) * 1.2f;// + + //sample_cdf(&rng, detector->time_cdf_len, + // detector->time_cdf_x, detector->time_cdf_y); + unsigned int time_int = float_to_sortable_int(time); + + float charge = 1.0f; //sample_cdf(&rng, detector->charge_cdf_len, + //detector->charge_cdf_x, + //detector->charge_cdf_y); + unsigned int charge_int = roundf(charge / detector->charge_unit); + + atomicMin(earliest_time_int + channel_offset, time_int); + atomicAdd(channel_q_int + channel_offset, charge_int); + atomicOr(channel_histories + channel_offset, history); + } + + s[id] = rng; +} + +__global__ void convert_sortable_int_to_float(int n, unsigned int *sortable_ints, float *float_output) { diff --git a/chroma/cuda/pdf.cu b/chroma/cuda/pdf.cu index bf42742..a9a9339 100644 --- a/chroma/cuda/pdf.cu +++ b/chroma/cuda/pdf.cu @@ -1,6 +1,8 @@ // -*-c++-*- #include <curand_kernel.h> +#include "sorting.h" + extern "C" { @@ -28,101 +30,196 @@ bin_hits(int nchannels, float *channel_q, float *channel_time, pdf[bin] += 1; } } - + __global__ void -accumulate_pdf_eval(int time_only, int nchannels, unsigned int *event_hit, - float *event_time, float *event_charge, float *mc_time, - float *mc_charge, // quirk of DAQ! +accumulate_bincount(int nchannels, + int ndaq, + unsigned int *event_hit, + float *event_time, + float *mc_time, unsigned int *hitcount, unsigned int *bincount, float min_twidth, float tmin, float tmax, - float min_qwidth, float qmin, float qmax, - float *nearest_mc, int min_bin_content) + int min_bin_content, + unsigned int *map_channel_id_to_hit_offset, + unsigned int *work_queues) { - int id = threadIdx.x + blockDim.x * blockIdx.x; + int channel_id = threadIdx.x + blockDim.x * blockIdx.x; - if (id >= nchannels) + if (channel_id >= nchannels) return; + + float channel_hitcount = hitcount[channel_id]; + float channel_bincount = bincount[channel_id]; + float channel_event_time = event_time[channel_id]; + int channel_event_hit = event_hit[channel_id]; + + unsigned int *work_queue = work_queues + map_channel_id_to_hit_offset[channel_id] * (ndaq + 1); + unsigned int next_slot = work_queue[0]; - // Was this channel hit in the Monte Carlo? - float channel_mc_time = mc_time[id]; - if (channel_mc_time >= 1e8) - return; // Nothing else to do + for (int i=0; i < ndaq; i++) { + int read_offset = nchannels * i + channel_id; + + // Was this channel hit in the Monte Carlo? + float channel_mc_time = mc_time[read_offset]; + if (channel_mc_time >= 1e8) + continue; // Nothing else to do - // Is this channel inside the range of the PDF? - float distance; - int channel_bincount = bincount[id]; - if (time_only) { + // Is this channel inside the range of the PDF? + float distance; if (channel_mc_time < tmin || channel_mc_time > tmax) - return; // Nothing else to do + continue; // Nothing else to do - // This MC information is contained in the PDF - hitcount[id] += 1; - + channel_hitcount += 1; + // Was this channel hit in the event-of-interest? - int channel_event_hit = event_hit[id]; if (!channel_event_hit) - return; // No need to update PDF value for unhit channel - + continue; // No need to update PDF value for unhit channel + // Are we inside the minimum size bin? - float channel_event_time = event_time[id]; distance = fabsf(channel_mc_time - channel_event_time); - if (distance < min_twidth/2.0) - bincount[id] = channel_bincount + 1; + if (distance < min_twidth/2.0) { + channel_bincount += 1; + } + // Add this hit to the work queue if we also need to sort it into the + // nearest_mc_list + if (channel_bincount < min_bin_content) { + work_queue[next_slot] = read_offset; + next_slot++; + } } - else { // time and charge PDF - float channel_mc_charge = mc_charge[id]; // int->float conversion because DAQ just returns an integer + + hitcount[channel_id] = channel_hitcount; + bincount[channel_id] = channel_bincount; + if (channel_event_hit) + work_queue[0] = next_slot; +} - if (channel_mc_time < tmin || channel_mc_time > tmax || - channel_mc_charge < qmin || channel_mc_charge > qmax) - return; // Nothing else to do - - // This MC information is contained in the PDF - hitcount[id] += 1; - - // Was this channel hit in the event-of-interest? - int channel_event_hit = event_hit[id]; - if (!channel_event_hit) - return; // No need to update PDF value for unhit channel - - // Are we inside the minimum size bin? - float channel_event_time = event_time[id]; - float channel_event_charge = event_charge[id]; - float normalized_time_distance = fabsf(channel_event_time - channel_mc_time)/min_twidth/2.0; - float normalized_charge_distance = fabsf(channel_event_charge - channel_mc_charge)/min_qwidth/2.0; - distance = sqrt(normalized_time_distance*normalized_time_distance + normalized_charge_distance*normalized_charge_distance); +__global__ void +accumulate_nearest_neighbor(int nhit, + int ndaq, + unsigned int *map_hit_offset_to_channel_id, + unsigned int *work_queues, + float *event_time, + float *mc_time, + float *nearest_mc, int min_bin_content) +{ + int hit_id = threadIdx.x + blockDim.x * blockIdx.x; + + if (hit_id >= nhit) + return; + + unsigned int *work_queue = work_queues + hit_id * (ndaq + 1); + int queue_items = work_queue[0] - 1; + + int channel_id = map_hit_offset_to_channel_id[hit_id]; + float channel_event_time = event_time[channel_id]; + + float distance_table[1000]; + int distance_table_len = 0; + + // Load existing distance table + int offset = min_bin_content * hit_id; + for (int i=0; i < min_bin_content; i++) { + float d = nearest_mc[offset + i]; + if (d > 1e8) + break; - if (distance < 1.0f) - bincount[id] = channel_bincount + 1; + distance_table[distance_table_len] = d; + distance_table_len++; } + + // append new entries + for (int i=0; i < queue_items; i++) { + unsigned int read_offset = work_queue[i+1]; + float channel_mc_time = mc_time[read_offset]; + float distance = fabsf(channel_mc_time - channel_event_time); - // Do we need to keep updating the nearest_mc list? - if (channel_bincount >= min_bin_content) - return; // No need to perform insertion into nearest_mc because bincount is a better estimate of the PDF value + distance_table[distance_table_len] = distance; + distance_table_len++; + } - // insertion sort the distance into the array of nearest MC points - int offset = min_bin_content * id; - int entry = min_bin_content - 1; - - // If last entry less than new entry, nothing to update - if (distance > nearest_mc[offset + entry]) - return; + // Sort table + piksrt(distance_table_len, distance_table); + + // Copy first section of table back out to global memory + distance_table_len = min(distance_table_len, min_bin_content); + for (int i=0; i < distance_table_len; i++) { + nearest_mc[offset + i] = distance_table[i]; + } +} + +__global__ void +accumulate_nearest_neighbor_block(int nhit, + int ndaq, + unsigned int *map_hit_offset_to_channel_id, + unsigned int *work_queues, + float *event_time, + float *mc_time, + float *nearest_mc, int min_bin_content) +{ + int hit_id = blockIdx.x; + + __shared__ float distance_table[1000]; + __shared__ unsigned int *work_queue; + __shared__ int queue_items; + __shared__ int channel_id; + __shared__ float channel_event_time; + __shared__ int distance_table_len; + __shared__ int offset; + + if (threadIdx.x == 0) { + work_queue = work_queues + hit_id * (ndaq + 1); + queue_items = work_queue[0] - 1; + + channel_id = map_hit_offset_to_channel_id[hit_id]; + channel_event_time = event_time[channel_id]; + distance_table_len = min_bin_content; + offset = min_bin_content * hit_id; + } + + __syncthreads(); - // Find where to insert the new entry while sliding the rest - // to the right - entry--; - while (entry >= 0) { - if (nearest_mc[offset+entry] >= distance) - nearest_mc[offset+entry+1] = nearest_mc[offset+entry]; - else - break; - entry--; + // Load existing distance table + for (int i=threadIdx.x; i < min_bin_content; i += blockDim.x) { + float d = nearest_mc[offset + i]; + if (d > 1e8) { + atomicMin(&distance_table_len, i); + break; + } + distance_table[i] = d; } + + __syncthreads(); - nearest_mc[offset+entry+1] = distance; + // append new entries + for (int i=threadIdx.x; i < queue_items; i += blockDim.x) { + unsigned int read_offset = work_queue[i+1]; + float channel_mc_time = mc_time[read_offset]; + float distance = fabsf(channel_mc_time - channel_event_time); + + distance_table[distance_table_len + i] = distance; + } + + __syncthreads(); + + if (threadIdx.x == 0) { + distance_table_len += queue_items; + // Sort table + piksrt(distance_table_len, distance_table); + // Copy first section of table back out to global memory + distance_table_len = min(distance_table_len, min_bin_content); + } + + __syncthreads(); + + for (int i=threadIdx.x; i < distance_table_len; i += blockDim.x) { + nearest_mc[offset + i] = distance_table[i]; + } } + __global__ void accumulate_moments(int time_only, int nchannels, float *mc_time, diff --git a/chroma/gpu/daq.py b/chroma/gpu/daq.py index 571feba..ec5add4 100644 --- a/chroma/gpu/daq.py +++ b/chroma/gpu/daq.py @@ -1,15 +1,27 @@ import numpy as np from pycuda import gpuarray as ga +from pycuda import driver as cuda from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, \ chunk_iterator from chroma import event class GPUChannels(object): - def __init__(self, t, q, flags): + def __init__(self, t, q, flags, ndaq=1, stride=None): self.t = t self.q = q self.flags = flags + self.ndaq = ndaq + if stride is None: + self.stride = len(t) + else: + self.stride = stride + + def iterate_copies(self): + for i in xrange(self.ndaq): + yield GPUChannels(self.t[i*self.stride:(i+1)*self.stride], + self.q[i*self.stride:(i+1)*self.stride], + self.flags[i*self.stride:(i+1)*self.stride]) def get(self): t = self.t.get() @@ -19,10 +31,13 @@ class GPUChannels(object): # enough hit time were hit. return event.Channels(t<1e8, t, q, self.flags.get()) + def __len__(self): + return self.t.size + class GPUDaq(object): - def __init__(self, gpu_detector): - self.earliest_time_gpu = ga.empty(gpu_detector.nchannels, dtype=np.float32) - self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels, dtype=np.uint32) + def __init__(self, gpu_detector, ndaq=1): + self.earliest_time_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.float32) + self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.uint32) self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu) self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu) self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu), dtype=np.float32) @@ -33,29 +48,50 @@ class GPUDaq(object): self.module = get_cu_module('daq.cu', options=cuda_options, include_source_directory=True) self.gpu_funcs = GPUFuncs(self.module) + self.ndaq = ndaq + self.stride = gpu_detector.nchannels - def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024): - self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) + def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None): + self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(64,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) self.channel_q_int_gpu.fill(0) self.channel_q_gpu.fill(0) self.channel_history_gpu.fill(0) - n = len(gpuphotons.pos) - - for first_photon, photons_this_round, blocks in \ - chunk_iterator(n, nthreads_per_block, max_blocks): - self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), - np.int32(first_photon), np.int32(photons_this_round), gpuphotons.t, - gpuphotons.flags, gpuphotons.last_hit_triangles, - self.solid_id_map_gpu, - self.detector_gpu, - self.earliest_time_int_gpu, - self.channel_q_int_gpu, self.channel_history_gpu, - block=(nthreads_per_block,1,1), grid=(blocks,1)) + if start_photon is None: + start_photon = 0 + if nphotons is None: + nphotons = len(gpuphotons.pos) - start_photon + if self.ndaq == 1: + for first_photon, photons_this_round, blocks in \ + chunk_iterator(nphotons, nthreads_per_block, max_blocks): + self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), + np.int32(start_photon+first_photon), np.int32(photons_this_round), gpuphotons.t, + gpuphotons.flags, gpuphotons.last_hit_triangles, + self.solid_id_map_gpu, + self.detector_gpu, + self.earliest_time_int_gpu, + self.channel_q_int_gpu, self.channel_history_gpu, + block=(nthreads_per_block,1,1), grid=(blocks,1)) + else: + for first_photon, photons_this_round, blocks in \ + chunk_iterator(nphotons, 1, max_blocks): + self.gpu_funcs.run_daq_many(rng_states, np.uint32(0x1 << 2), + np.int32(start_photon), np.int32(photons_this_round), gpuphotons.t, + gpuphotons.flags, gpuphotons.last_hit_triangles, + self.solid_id_map_gpu, + self.detector_gpu, + self.earliest_time_int_gpu, + self.channel_q_int_gpu, self.channel_history_gpu, + np.int32(self.ndaq), np.int32(self.stride), + block=(nthreads_per_block,1,1), grid=(blocks,1)) + cuda.Context.get_current().synchronize() + self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1)) self.gpu_funcs.convert_charge_int_to_float(self.detector_gpu, self.channel_q_int_gpu, self.channel_q_gpu, block=(nthreads_per_block,1,1), grid=(len(self.channel_q_int_gpu)//nthreads_per_block+1,1)) - return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu) + cuda.Context.get_current().synchronize() + + return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu, self.ndaq, self.stride) diff --git a/chroma/gpu/pdf.py b/chroma/gpu/pdf.py index bd3c448..1b6340f 100644 --- a/chroma/gpu/pdf.py +++ b/chroma/gpu/pdf.py @@ -1,12 +1,13 @@ import numpy as np from pycuda import gpuarray as ga import pycuda.driver as cuda -from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs +from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, chunk_iterator +from chroma.tools import profile_if_possible class GPUKernelPDF(object): def __init__(self): self.module = get_cu_module('pdf.cu', options=cuda_options, - include_source_directory=False) + include_source_directory=True) self.gpu_funcs = GPUFuncs(self.module) def setup_moments(self, nchannels, trange, qrange, time_only=True): @@ -56,7 +57,6 @@ class GPUKernelPDF(object): self.qmom2_gpu, block=(nthreads_per_block,1,1), grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) - cuda def compute_bandwidth(self, event_hit, event_time, event_charge, scale_factor=1.0): @@ -177,7 +177,7 @@ class GPUKernelPDF(object): class GPUPDF(object): def __init__(self): self.module = get_cu_module('pdf.cu', options=cuda_options, - include_source_directory=False) + include_source_directory=True) self.gpu_funcs = GPUFuncs(self.module) def setup_pdf(self, nchannels, tbins, trange, qbins, qrange): @@ -259,21 +259,31 @@ class GPUPDF(object): time_only: bool If True, only the time observable will be used in the PDF. """ + self.event_nhit = np.count_nonzero(event_hit) + + # Define a mapping from an array of len(event_hit) to an array of length event_nhit + self.map_hit_offset_to_channel_id = np.where(event_hit)[0].astype(np.uint32) + self.map_hit_offset_to_channel_id_gpu = ga.to_gpu(self.map_hit_offset_to_channel_id) + self.map_channel_id_to_hit_offset = np.maximum(0, event_hit.cumsum() - 1).astype(np.uint32) + self.map_channel_id_to_hit_offset_gpu = ga.to_gpu(self.map_channel_id_to_hit_offset) + self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32)) self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32)) self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32)) self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) - self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content, + self.nearest_mc_gpu = ga.empty(shape=self.event_nhit * min_bin_content, dtype=np.float32) self.nearest_mc_gpu.fill(1e9) - + self.min_twidth = min_twidth self.trange = trange self.min_qwidth = min_qwidth self.qrange = qrange self.min_bin_content = min_bin_content + + assert time_only # Only support time right now self.time_only = time_only def clear_pdf_eval(self): @@ -282,27 +292,38 @@ class GPUPDF(object): self.eval_bincount_gpu.fill(0) self.nearest_mc_gpu.fill(1e9) - def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64): + @profile_if_possible + def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64, max_blocks=10000): "Add the most recent results of run_daq() to the PDF evaluation." - self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only), - np.int32(len(self.event_hit_gpu)), + self.work_queues = ga.empty(shape=self.event_nhit * (gpuchannels.ndaq+1), dtype=np.uint32) + self.work_queues.fill(1) + + self.gpu_funcs.accumulate_bincount(np.int32(self.event_hit_gpu.size), + np.int32(gpuchannels.ndaq), self.event_hit_gpu, self.event_time_gpu, - self.event_charge_gpu, gpuchannels.t, - gpuchannels.q, self.eval_hitcount_gpu, self.eval_bincount_gpu, np.float32(self.min_twidth), np.float32(self.trange[0]), np.float32(self.trange[1]), - np.float32(self.min_qwidth), - np.float32(self.qrange[0]), - np.float32(self.qrange[1]), - self.nearest_mc_gpu, np.int32(self.min_bin_content), + self.map_channel_id_to_hit_offset_gpu, + self.work_queues, block=(nthreads_per_block,1,1), - grid=(len(gpuchannels.t)//nthreads_per_block+1,1)) + grid=(self.event_hit_gpu.size//nthreads_per_block+1,1)) + self.gpu_funcs.accumulate_nearest_neighbor_block(np.int32(self.event_nhit), + np.int32(gpuchannels.ndaq), + self.map_hit_offset_to_channel_id_gpu, + self.work_queues, + self.event_time_gpu, + gpuchannels.t, + self.nearest_mc_gpu, + np.int32(self.min_bin_content), + block=(nthreads_per_block,1,1), + grid=(self.event_nhit,1)) + cuda.Context.get_current().synchronize() def get_pdf_eval(self): evhit = self.event_hit_gpu.get().astype(bool) @@ -325,14 +346,16 @@ class GPUPDF(object): # PDF value for low stats bins low_stats = ~high_stats & (hitcount > 0) & evhit - nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content)) + nearest_mc_by_hit = self.nearest_mc_gpu.get().reshape((self.event_nhit, self.min_bin_content)) + nearest_mc = np.empty(shape=(len(hitcount), self.min_bin_content), dtype=np.float32) + nearest_mc.fill(1e9) + nearest_mc[self.map_hit_offset_to_channel_id,:] = nearest_mc_by_hit # Deal with the case where we did not even get min_bin_content events # in the PDF but also clamp the lower range to ensure we don't index # by a negative number in 2 lines last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1) distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry] - if low_stats.any(): if self.time_only: pdf_value[low_stats] = (last_valid_entry[low_stats] + 1).astype(float) / hitcount[low_stats] / distance[low_stats] / 2.0 @@ -344,5 +367,4 @@ class GPUPDF(object): # PDFs with no stats got zero by default during array creation print 'high_stats:', high_stats.sum(), 'low_stats', low_stats.sum() - return hitcount, pdf_value, pdf_value * pdf_frac_uncert diff --git a/chroma/likelihood.py b/chroma/likelihood.py index 5e8e678..a29dc6c 100644 --- a/chroma/likelihood.py +++ b/chroma/likelihood.py @@ -57,11 +57,12 @@ class Likelihood(object): hitcount, pdf_prob, pdf_prob_uncert = \ self.sim.eval_pdf(self.event.channels, vertex_generator, - 0.1, self.trange, + 0.2, self.trange, 1, self.qrange, nreps=nreps, ndaq=ndaq, - time_only=self.time_only) + time_only=self.time_only, + min_bin_content=ndaq*10) # Normalize probabilities and put a floor to keep the log finite hit_prob = hitcount.astype(np.float32) / ntotal @@ -80,7 +81,6 @@ class Likelihood(object): return hit_prob, pdf_prob, pdf_prob_uncert - @profile_if_possible def eval(self, vertex_generator, nevals, nreps=16, ndaq=50): """ Return the negative log likelihood that the detector event set in the @@ -100,9 +100,8 @@ class Likelihood(object): # Then include the probability densities of the observed # charges and times. - hit_pdf_ufloat = unumpy.uarray((pdf_prob[self.event.channels.hit], - pdf_prob_uncert[self.event.channels.hit])) - log_likelihood += unumpy.log(hit_pdf_ufloat).sum() + log_likelihood += ufloat((np.log(pdf_prob[self.event.channels.hit]).sum(), + 0.0)) return -log_likelihood diff --git a/chroma/sim.py b/chroma/sim.py index 93bcdfb..a82f1f8 100644 --- a/chroma/sim.py +++ b/chroma/sim.py @@ -9,13 +9,14 @@ from chroma import event from chroma import itertoolset from chroma.tools import profile_if_possible +import pycuda.driver as cuda + def pick_seed(): """Returns a seed for a random number generator selected using a mixture of the current time and the current process ID.""" return int(time.time()) ^ (os.getpid() << 16) class Simulation(object): - @profile_if_possible def __init__(self, detector, seed=None, cuda_device=None, geant4_processes=4, nthreads_per_block=64, max_blocks=1024): self.detector = detector @@ -122,9 +123,12 @@ class Simulation(object): return self.gpu_pdf.get_pdfs() - def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=50, nreps=1, ndaq=1, time_only=True): + @profile_if_possible + def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=100, nreps=1, ndaq=1, time_only=True): """Returns tuple: 1D array of channel hit counts, 1D array of PDF probability densities.""" + gpu_daq = gpu.GPUDaq(self.gpu_geometry, ndaq=ndaq) + self.gpu_pdf.setup_pdf_eval(event_channels.hit, event_channels.t, event_channels.q, @@ -145,10 +149,16 @@ class Simulation(object): gpu_photons.propagate(self.gpu_geometry, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks) - for gpu_photon_slice in gpu_photons.iterate_copies(): - for idaq in xrange(ndaq): - gpu_channels = self.gpu_daq.acquire(gpu_photon_slice, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks) - self.gpu_pdf.accumulate_pdf_eval(gpu_channels) + nphotons = gpu_photons.true_nphotons + for i in xrange(gpu_photons.ncopies): + start_photon = i * nphotons + gpu_photon_slice = gpu_photons.select(event.SURFACE_DETECT, + start_photon=start_photon, + nphotons=nphotons) + if len(gpu_photon_slice) == 0: + continue + gpu_channels = gpu_daq.acquire(gpu_photon_slice, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks) + self.gpu_pdf.accumulate_pdf_eval(gpu_channels, 32) return self.gpu_pdf.get_pdf_eval() |