 chroma/benchmark.py  |  12
 chroma/cuda/daq.cu   |  59
 chroma/cuda/pdf.cu   | 233
 chroma/gpu/daq.py    |  74
 chroma/gpu/pdf.py    |  60
 chroma/likelihood.py |  11
 chroma/sim.py        |  22
 7 files changed, 349 insertions(+), 122 deletions(-)
diff --git a/chroma/benchmark.py b/chroma/benchmark.py
index 44cf44b..06a8eba 100755
--- a/chroma/benchmark.py
+++ b/chroma/benchmark.py
@@ -146,6 +146,7 @@ def pdf(gpu_detector, npdfs=10, nevents=100, nreps=16, ndaq=1,
return nevents*nreps*ndaq/ufloat((np.mean(run_times),np.std(run_times)))
+@tools.profile_if_possible
def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128,
nthreads_per_block=64, max_blocks=1024):
"""
@@ -179,6 +180,7 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128,
data_ev_channels = gpu_daq.acquire(gpu_photons, rng_states, nthreads_per_block, max_blocks).get()
# Setup PDF evaluation
+ gpu_daq = gpu.GPUDaq(gpu_detector, ndaq=ndaq)
gpu_pdf = gpu.GPUPDF()
gpu_pdf.setup_pdf_eval(data_ev_channels.hit,
data_ev_channels.t,
@@ -205,10 +207,10 @@ def pdf_eval(gpu_detector, npdfs=10, nevents=25, nreps=16, ndaq=128,
gpu_photons.propagate(gpu_detector, rng_states,
nthreads_per_block, max_blocks)
for gpu_photon_slice in gpu_photons.iterate_copies():
- for idaq in xrange(ndaq):
- gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states,
- nthreads_per_block, max_blocks)
- gpu_pdf.accumulate_pdf_eval(gpu_channels, nthreads_per_block)
+ gpu_photon_slice = gpu_photon_slice.select(event.SURFACE_DETECT)
+ gpu_channels = gpu_daq.acquire(gpu_photon_slice, rng_states,
+ nthreads_per_block, max_blocks)
+ gpu_pdf.accumulate_pdf_eval(gpu_channels, nthreads_per_block)
cuda.Context.get_current().synchronize()
elapsed = time.time() - t0
@@ -224,6 +226,8 @@ if __name__ == '__main__':
from chroma import demo
import gc
+ tools.enable_debug_on_crash()
+
# Default to run all tests
tests = ['ray', 'load', 'propagate', 'pdf', 'pdf_eval']
if len(sys.argv) > 1:
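
Taken together, the benchmark's inner loop goes from ndaq sequential acquisitions per photon copy to a single acquisition against a DAQ that simulates all ndaq detector copies at once, fed only the photons flagged SURFACE_DETECT. A sketch of the resulting calling pattern, using the objects set up earlier in pdf_eval (illustrative, not part of the commit):

    gpu_daq = gpu.GPUDaq(gpu_detector, ndaq=ndaq)  # one DAQ, ndaq copies
    for gpu_photon_slice in gpu_photons.iterate_copies():
        # keep only photons that actually registered on a detector surface
        detected = gpu_photon_slice.select(event.SURFACE_DETECT)
        gpu_channels = gpu_daq.acquire(detected, rng_states,
                                       nthreads_per_block, max_blocks)
        gpu_pdf.accumulate_pdf_eval(gpu_channels, nthreads_per_block)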
diff --git a/chroma/cuda/daq.cu b/chroma/cuda/daq.cu
index b995bfb..3ed402d 100644
--- a/chroma/cuda/daq.cu
+++ b/chroma/cuda/daq.cu
@@ -79,6 +79,65 @@ run_daq(curandState *s, unsigned int detection_state,
}
__global__ void
+run_daq_many(curandState *s, unsigned int detection_state,
+ int first_photon, int nphotons, float *photon_times,
+ unsigned int *photon_histories, int *last_hit_triangles,
+ int *solid_map,
+ Detector *detector,
+ unsigned int *earliest_time_int,
+ unsigned int *channel_q_int, unsigned int *channel_histories,
+ int ndaq, int channel_stride)
+{
+ __shared__ int photon_id;
+ __shared__ int triangle_id;
+ __shared__ int solid_id;
+ __shared__ int channel_index;
+ __shared__ unsigned int history;
+ __shared__ float photon_time;
+
+
+ if (threadIdx.x == 0) {
+ photon_id = first_photon + blockIdx.x;
+ triangle_id = last_hit_triangles[photon_id];
+
+ if (triangle_id > -1) {
+ solid_id = solid_map[triangle_id];
+ history = photon_histories[photon_id];
+ channel_index = detector->solid_id_to_channel_index[solid_id];
+ photon_time = photon_times[photon_id];
+ }
+ }
+
+ __syncthreads();
+
+ if (triangle_id == -1 || channel_index < 0 || !(history & detection_state))
+ return;
+
+ int id = threadIdx.x + blockDim.x * blockIdx.x;
+ curandState rng = s[id];
+
+ for (int i = threadIdx.x; i < ndaq; i += blockDim.x) {
+ int channel_offset = channel_index + i * channel_stride;
+
+ float time = photon_time + curand_normal(&rng) * 1.2f;// +
+ //sample_cdf(&rng, detector->time_cdf_len,
+ // detector->time_cdf_x, detector->time_cdf_y);
+ unsigned int time_int = float_to_sortable_int(time);
+
+ float charge = 1.0f; //sample_cdf(&rng, detector->charge_cdf_len,
+ //detector->charge_cdf_x,
+ //detector->charge_cdf_y);
+ unsigned int charge_int = roundf(charge / detector->charge_unit);
+
+ atomicMin(earliest_time_int + channel_offset, time_int);
+ atomicAdd(channel_q_int + channel_offset, charge_int);
+ atomicOr(channel_histories + channel_offset, history);
+ }
+
+ s[id] = rng;
+}
+
+__global__ void
convert_sortable_int_to_float(int n, unsigned int *sortable_ints,
float *float_output)
{
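
Two details of run_daq_many are worth a reference implementation: the flat channel-array layout (ndaq back-to-back detector copies, with copy i / channel c at index c + i*channel_stride) and the earliest-time reduction done with atomicMin on integer-encoded floats. The body of float_to_sortable_int is not part of this diff; the sketch below assumes the usual encoding, where setting the sign bit of non-negative floats and flipping all bits of negative ones makes unsigned-integer order agree with float order. A NumPy mirror (names illustrative):

    import numpy as np

    def float_to_sortable_int(times):
        # assumed encoding -- see the caveat above; not taken from this diff
        bits = np.asarray(times, dtype=np.float32).view(np.uint32)
        return np.where(bits & 0x80000000, ~bits, bits | np.uint32(0x80000000))

    def earliest_times_reference(photon_times, photon_channels, nchannels,
                                 ndaq, jitter_ns=1.2, seed=0):
        rng = np.random.default_rng(seed)
        earliest = np.full(nchannels * ndaq, 1e9, dtype=np.float32)
        for t, c in zip(photon_times, photon_channels):
            # one Gaussian time smearing per DAQ copy, as in the kernel
            smeared = t + jitter_ns * rng.standard_normal(ndaq)
            idx = c + np.arange(ndaq) * nchannels  # strided channel layout
            earliest[idx] = np.minimum(earliest[idx], smeared)  # atomicMin analogue
        return earliest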
diff --git a/chroma/cuda/pdf.cu b/chroma/cuda/pdf.cu
index bf42742..a9a9339 100644
--- a/chroma/cuda/pdf.cu
+++ b/chroma/cuda/pdf.cu
@@ -1,6 +1,8 @@
// -*-c++-*-
#include <curand_kernel.h>
+#include "sorting.h"
+
extern "C"
{
@@ -28,101 +30,196 @@ bin_hits(int nchannels, float *channel_q, float *channel_time,
pdf[bin] += 1;
}
}
-
+
__global__ void
-accumulate_pdf_eval(int time_only, int nchannels, unsigned int *event_hit,
- float *event_time, float *event_charge, float *mc_time,
- float *mc_charge, // quirk of DAQ!
+accumulate_bincount(int nchannels,
+ int ndaq,
+ unsigned int *event_hit,
+ float *event_time,
+ float *mc_time,
unsigned int *hitcount, unsigned int *bincount,
float min_twidth, float tmin, float tmax,
- float min_qwidth, float qmin, float qmax,
- float *nearest_mc, int min_bin_content)
+ int min_bin_content,
+ unsigned int *map_channel_id_to_hit_offset,
+ unsigned int *work_queues)
{
- int id = threadIdx.x + blockDim.x * blockIdx.x;
+ int channel_id = threadIdx.x + blockDim.x * blockIdx.x;
- if (id >= nchannels)
+ if (channel_id >= nchannels)
return;
+
+ float channel_hitcount = hitcount[channel_id];
+ float channel_bincount = bincount[channel_id];
+ float channel_event_time = event_time[channel_id];
+ int channel_event_hit = event_hit[channel_id];
+
+ unsigned int *work_queue = work_queues + map_channel_id_to_hit_offset[channel_id] * (ndaq + 1);
+ unsigned int next_slot = work_queue[0];
- // Was this channel hit in the Monte Carlo?
- float channel_mc_time = mc_time[id];
- if (channel_mc_time >= 1e8)
- return; // Nothing else to do
+ for (int i=0; i < ndaq; i++) {
+ int read_offset = nchannels * i + channel_id;
+
+ // Was this channel hit in the Monte Carlo?
+ float channel_mc_time = mc_time[read_offset];
+ if (channel_mc_time >= 1e8)
+ continue; // Nothing else to do
- // Is this channel inside the range of the PDF?
- float distance;
- int channel_bincount = bincount[id];
- if (time_only) {
+ // Is this channel inside the range of the PDF?
+ float distance;
if (channel_mc_time < tmin || channel_mc_time > tmax)
- return; // Nothing else to do
+ continue; // Nothing else to do
- // This MC information is contained in the PDF
- hitcount[id] += 1;
-
+ channel_hitcount += 1;
+
// Was this channel hit in the event-of-interest?
- int channel_event_hit = event_hit[id];
if (!channel_event_hit)
- return; // No need to update PDF value for unhit channel
-
+ continue; // No need to update PDF value for unhit channel
+
// Are we inside the minimum size bin?
- float channel_event_time = event_time[id];
distance = fabsf(channel_mc_time - channel_event_time);
- if (distance < min_twidth/2.0)
- bincount[id] = channel_bincount + 1;
+ if (distance < min_twidth/2.0) {
+ channel_bincount += 1;
+ }
+ // Add this hit to the work queue if we also need to sort it into the
+ // nearest_mc_list
+ if (channel_bincount < min_bin_content) {
+ work_queue[next_slot] = read_offset;
+ next_slot++;
+ }
}
- else { // time and charge PDF
- float channel_mc_charge = mc_charge[id]; // int->float conversion because DAQ just returns an integer
+
+ hitcount[channel_id] = channel_hitcount;
+ bincount[channel_id] = channel_bincount;
+ if (channel_event_hit)
+ work_queue[0] = next_slot;
+}
- if (channel_mc_time < tmin || channel_mc_time > tmax ||
- channel_mc_charge < qmin || channel_mc_charge > qmax)
- return; // Nothing else to do
-
- // This MC information is contained in the PDF
- hitcount[id] += 1;
-
- // Was this channel hit in the event-of-interest?
- int channel_event_hit = event_hit[id];
- if (!channel_event_hit)
- return; // No need to update PDF value for unhit channel
-
- // Are we inside the minimum size bin?
- float channel_event_time = event_time[id];
- float channel_event_charge = event_charge[id];
- float normalized_time_distance = fabsf(channel_event_time - channel_mc_time)/min_twidth/2.0;
- float normalized_charge_distance = fabsf(channel_event_charge - channel_mc_charge)/min_qwidth/2.0;
- distance = sqrt(normalized_time_distance*normalized_time_distance + normalized_charge_distance*normalized_charge_distance);
+__global__ void
+accumulate_nearest_neighbor(int nhit,
+ int ndaq,
+ unsigned int *map_hit_offset_to_channel_id,
+ unsigned int *work_queues,
+ float *event_time,
+ float *mc_time,
+ float *nearest_mc, int min_bin_content)
+{
+ int hit_id = threadIdx.x + blockDim.x * blockIdx.x;
+
+ if (hit_id >= nhit)
+ return;
+
+ unsigned int *work_queue = work_queues + hit_id * (ndaq + 1);
+ int queue_items = work_queue[0] - 1;
+
+ int channel_id = map_hit_offset_to_channel_id[hit_id];
+ float channel_event_time = event_time[channel_id];
+
+ float distance_table[1000];
+ int distance_table_len = 0;
+
+ // Load existing distance table
+ int offset = min_bin_content * hit_id;
+ for (int i=0; i < min_bin_content; i++) {
+ float d = nearest_mc[offset + i];
+ if (d > 1e8)
+ break;
- if (distance < 1.0f)
- bincount[id] = channel_bincount + 1;
+ distance_table[distance_table_len] = d;
+ distance_table_len++;
}
+
+ // append new entries
+ for (int i=0; i < queue_items; i++) {
+ unsigned int read_offset = work_queue[i+1];
+ float channel_mc_time = mc_time[read_offset];
+ float distance = fabsf(channel_mc_time - channel_event_time);
- // Do we need to keep updating the nearest_mc list?
- if (channel_bincount >= min_bin_content)
- return; // No need to perform insertion into nearest_mc because bincount is a better estimate of the PDF value
+ distance_table[distance_table_len] = distance;
+ distance_table_len++;
+ }
- // insertion sort the distance into the array of nearest MC points
- int offset = min_bin_content * id;
- int entry = min_bin_content - 1;
-
- // If last entry less than new entry, nothing to update
- if (distance > nearest_mc[offset + entry])
- return;
+ // Sort table
+ piksrt(distance_table_len, distance_table);
+
+ // Copy first section of table back out to global memory
+ distance_table_len = min(distance_table_len, min_bin_content);
+ for (int i=0; i < distance_table_len; i++) {
+ nearest_mc[offset + i] = distance_table[i];
+ }
+}
+
+__global__ void
+accumulate_nearest_neighbor_block(int nhit,
+ int ndaq,
+ unsigned int *map_hit_offset_to_channel_id,
+ unsigned int *work_queues,
+ float *event_time,
+ float *mc_time,
+ float *nearest_mc, int min_bin_content)
+{
+ int hit_id = blockIdx.x;
+
+ __shared__ float distance_table[1000];
+ __shared__ unsigned int *work_queue;
+ __shared__ int queue_items;
+ __shared__ int channel_id;
+ __shared__ float channel_event_time;
+ __shared__ int distance_table_len;
+ __shared__ int offset;
+
+ if (threadIdx.x == 0) {
+ work_queue = work_queues + hit_id * (ndaq + 1);
+ queue_items = work_queue[0] - 1;
+
+ channel_id = map_hit_offset_to_channel_id[hit_id];
+ channel_event_time = event_time[channel_id];
+ distance_table_len = min_bin_content;
+ offset = min_bin_content * hit_id;
+ }
+
+ __syncthreads();
- // Find where to insert the new entry while sliding the rest
- // to the right
- entry--;
- while (entry >= 0) {
- if (nearest_mc[offset+entry] >= distance)
- nearest_mc[offset+entry+1] = nearest_mc[offset+entry];
- else
- break;
- entry--;
+ // Load existing distance table
+ for (int i=threadIdx.x; i < min_bin_content; i += blockDim.x) {
+ float d = nearest_mc[offset + i];
+ if (d > 1e8) {
+ atomicMin(&distance_table_len, i);
+ break;
+ }
+ distance_table[i] = d;
}
+
+ __syncthreads();
- nearest_mc[offset+entry+1] = distance;
+ // append new entries
+ for (int i=threadIdx.x; i < queue_items; i += blockDim.x) {
+ unsigned int read_offset = work_queue[i+1];
+ float channel_mc_time = mc_time[read_offset];
+ float distance = fabsf(channel_mc_time - channel_event_time);
+
+ distance_table[distance_table_len + i] = distance;
+ }
+
+ __syncthreads();
+
+ if (threadIdx.x == 0) {
+ distance_table_len += queue_items;
+ // Sort table
+ piksrt(distance_table_len, distance_table);
+ // Copy first section of table back out to global memory
+ distance_table_len = min(distance_table_len, min_bin_content);
+ }
+
+ __syncthreads();
+
+ for (int i=threadIdx.x; i < distance_table_len; i += blockDim.x) {
+ nearest_mc[offset + i] = distance_table[i];
+ }
}
+
__global__ void
accumulate_moments(int time_only, int nchannels,
float *mc_time,
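
The rewritten evaluation is a two-pass scheme. Pass 1 (accumulate_bincount) runs one thread per channel: it tallies hitcount/bincount across all ndaq DAQ copies and, for hit channels whose bin is still below min_bin_content, appends the flat mc_time offset to a per-hit-channel work queue of length ndaq+1 whose slot 0 stores the next free index (which is why the host initializes the queues to 1). Pass 2 (accumulate_nearest_neighbor or its block variant) drains each queue and keeps the min_bin_content smallest time distances. A CPU sketch of pass 2 for one hit channel, assuming piksrt (from sorting.h, not shown in this diff) is a straight insertion sort as its Numerical Recipes-style name suggests:

    import numpy as np

    def merge_nearest(nearest_mc, work_queue, mc_time, channel_event_time,
                      min_bin_content):
        table = [d for d in nearest_mc if d < 1e8]  # 1e9 marks unused slots
        for off in work_queue[1:work_queue[0]]:     # queued flat offsets
            table.append(abs(mc_time[off] - channel_event_time))
        table.sort()                                # stands in for piksrt()
        out = np.full(min_bin_content, 1e9, dtype=np.float32)
        k = min(len(table), min_bin_content)
        out[:k] = table[:k]
        return out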
diff --git a/chroma/gpu/daq.py b/chroma/gpu/daq.py
index 571feba..ec5add4 100644
--- a/chroma/gpu/daq.py
+++ b/chroma/gpu/daq.py
@@ -1,15 +1,27 @@
import numpy as np
from pycuda import gpuarray as ga
+from pycuda import driver as cuda
from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, \
chunk_iterator
from chroma import event
class GPUChannels(object):
- def __init__(self, t, q, flags):
+ def __init__(self, t, q, flags, ndaq=1, stride=None):
self.t = t
self.q = q
self.flags = flags
+ self.ndaq = ndaq
+ if stride is None:
+ self.stride = len(t)
+ else:
+ self.stride = stride
+
+ def iterate_copies(self):
+ for i in xrange(self.ndaq):
+ yield GPUChannels(self.t[i*self.stride:(i+1)*self.stride],
+ self.q[i*self.stride:(i+1)*self.stride],
+ self.flags[i*self.stride:(i+1)*self.stride])
def get(self):
t = self.t.get()
@@ -19,10 +31,13 @@ class GPUChannels(object):
# enough hit time were hit.
return event.Channels(t<1e8, t, q, self.flags.get())
+ def __len__(self):
+ return self.t.size
+
class GPUDaq(object):
- def __init__(self, gpu_detector):
- self.earliest_time_gpu = ga.empty(gpu_detector.nchannels, dtype=np.float32)
- self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels, dtype=np.uint32)
+ def __init__(self, gpu_detector, ndaq=1):
+ self.earliest_time_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.float32)
+ self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.uint32)
self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu)
self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu), dtype=np.float32)
@@ -33,29 +48,50 @@ class GPUDaq(object):
self.module = get_cu_module('daq.cu', options=cuda_options,
include_source_directory=True)
self.gpu_funcs = GPUFuncs(self.module)
+ self.ndaq = ndaq
+ self.stride = gpu_detector.nchannels
- def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024):
- self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
+ def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None):
+        self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
self.channel_q_int_gpu.fill(0)
self.channel_q_gpu.fill(0)
self.channel_history_gpu.fill(0)
- n = len(gpuphotons.pos)
-
- for first_photon, photons_this_round, blocks in \
- chunk_iterator(n, nthreads_per_block, max_blocks):
- self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2),
- np.int32(first_photon), np.int32(photons_this_round), gpuphotons.t,
- gpuphotons.flags, gpuphotons.last_hit_triangles,
- self.solid_id_map_gpu,
- self.detector_gpu,
- self.earliest_time_int_gpu,
- self.channel_q_int_gpu, self.channel_history_gpu,
- block=(nthreads_per_block,1,1), grid=(blocks,1))
+ if start_photon is None:
+ start_photon = 0
+ if nphotons is None:
+ nphotons = len(gpuphotons.pos) - start_photon
+ if self.ndaq == 1:
+ for first_photon, photons_this_round, blocks in \
+ chunk_iterator(nphotons, nthreads_per_block, max_blocks):
+ self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2),
+ np.int32(start_photon+first_photon), np.int32(photons_this_round), gpuphotons.t,
+ gpuphotons.flags, gpuphotons.last_hit_triangles,
+ self.solid_id_map_gpu,
+ self.detector_gpu,
+ self.earliest_time_int_gpu,
+ self.channel_q_int_gpu, self.channel_history_gpu,
+ block=(nthreads_per_block,1,1), grid=(blocks,1))
+ else:
+ for first_photon, photons_this_round, blocks in \
+ chunk_iterator(nphotons, 1, max_blocks):
+ self.gpu_funcs.run_daq_many(rng_states, np.uint32(0x1 << 2),
+ np.int32(start_photon), np.int32(photons_this_round), gpuphotons.t,
+ gpuphotons.flags, gpuphotons.last_hit_triangles,
+ self.solid_id_map_gpu,
+ self.detector_gpu,
+ self.earliest_time_int_gpu,
+ self.channel_q_int_gpu, self.channel_history_gpu,
+ np.int32(self.ndaq), np.int32(self.stride),
+ block=(nthreads_per_block,1,1), grid=(blocks,1))
+ cuda.Context.get_current().synchronize()
+
self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
self.gpu_funcs.convert_charge_int_to_float(self.detector_gpu, self.channel_q_int_gpu, self.channel_q_gpu, block=(nthreads_per_block,1,1), grid=(len(self.channel_q_int_gpu)//nthreads_per_block+1,1))
- return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu)
+ cuda.Context.get_current().synchronize()
+
+ return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu, self.ndaq, self.stride)
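
GPUChannels now wraps all ndaq copies in one flat GPU array, and iterate_copies yields per-copy views into it. The slicing semantics can be checked with a NumPy stand-in for the pycuda arrays (a sketch, not the class itself):

    import numpy as np

    class Channels(object):
        def __init__(self, t, q, flags, ndaq=1, stride=None):
            self.t, self.q, self.flags = t, q, flags
            self.ndaq = ndaq
            self.stride = len(t) if stride is None else stride

        def iterate_copies(self):
            # copy i occupies the half-open slice [i*stride, (i+1)*stride)
            for i in range(self.ndaq):
                s = slice(i * self.stride, (i + 1) * self.stride)
                yield Channels(self.t[s], self.q[s], self.flags[s])

    nchannels, ndaq = 4, 3
    t = np.arange(nchannels * ndaq, dtype=np.float32)
    ch = Channels(t, t.copy(), np.zeros_like(t), ndaq, nchannels)
    assert [c.t[0] for c in ch.iterate_copies()] == [0.0, 4.0, 8.0]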
diff --git a/chroma/gpu/pdf.py b/chroma/gpu/pdf.py
index bd3c448..1b6340f 100644
--- a/chroma/gpu/pdf.py
+++ b/chroma/gpu/pdf.py
@@ -1,12 +1,13 @@
import numpy as np
from pycuda import gpuarray as ga
import pycuda.driver as cuda
-from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs
+from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs, chunk_iterator
+from chroma.tools import profile_if_possible
class GPUKernelPDF(object):
def __init__(self):
self.module = get_cu_module('pdf.cu', options=cuda_options,
- include_source_directory=False)
+ include_source_directory=True)
self.gpu_funcs = GPUFuncs(self.module)
def setup_moments(self, nchannels, trange, qrange, time_only=True):
@@ -56,7 +57,6 @@ class GPUKernelPDF(object):
self.qmom2_gpu,
block=(nthreads_per_block,1,1),
grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
- cuda
def compute_bandwidth(self, event_hit, event_time, event_charge,
scale_factor=1.0):
@@ -177,7 +177,7 @@ class GPUKernelPDF(object):
class GPUPDF(object):
def __init__(self):
self.module = get_cu_module('pdf.cu', options=cuda_options,
- include_source_directory=False)
+ include_source_directory=True)
self.gpu_funcs = GPUFuncs(self.module)
def setup_pdf(self, nchannels, tbins, trange, qbins, qrange):
@@ -259,21 +259,31 @@ class GPUPDF(object):
time_only: bool
If True, only the time observable will be used in the PDF.
"""
+ self.event_nhit = np.count_nonzero(event_hit)
+
+ # Define a mapping from an array of len(event_hit) to an array of length event_nhit
+ self.map_hit_offset_to_channel_id = np.where(event_hit)[0].astype(np.uint32)
+ self.map_hit_offset_to_channel_id_gpu = ga.to_gpu(self.map_hit_offset_to_channel_id)
+ self.map_channel_id_to_hit_offset = np.maximum(0, event_hit.cumsum() - 1).astype(np.uint32)
+ self.map_channel_id_to_hit_offset_gpu = ga.to_gpu(self.map_channel_id_to_hit_offset)
+
self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))
self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
- self.nearest_mc_gpu = ga.empty(shape=len(event_hit) * min_bin_content,
+ self.nearest_mc_gpu = ga.empty(shape=self.event_nhit * min_bin_content,
dtype=np.float32)
self.nearest_mc_gpu.fill(1e9)
-
+
self.min_twidth = min_twidth
self.trange = trange
self.min_qwidth = min_qwidth
self.qrange = qrange
self.min_bin_content = min_bin_content
+
+ assert time_only # Only support time right now
self.time_only = time_only
def clear_pdf_eval(self):
@@ -282,27 +292,38 @@ class GPUPDF(object):
self.eval_bincount_gpu.fill(0)
self.nearest_mc_gpu.fill(1e9)
- def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64):
+ @profile_if_possible
+ def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64, max_blocks=10000):
"Add the most recent results of run_daq() to the PDF evaluation."
- self.gpu_funcs.accumulate_pdf_eval(np.int32(self.time_only),
- np.int32(len(self.event_hit_gpu)),
+ self.work_queues = ga.empty(shape=self.event_nhit * (gpuchannels.ndaq+1), dtype=np.uint32)
+ self.work_queues.fill(1)
+
+ self.gpu_funcs.accumulate_bincount(np.int32(self.event_hit_gpu.size),
+ np.int32(gpuchannels.ndaq),
self.event_hit_gpu,
self.event_time_gpu,
- self.event_charge_gpu,
gpuchannels.t,
- gpuchannels.q,
self.eval_hitcount_gpu,
self.eval_bincount_gpu,
np.float32(self.min_twidth),
np.float32(self.trange[0]),
np.float32(self.trange[1]),
- np.float32(self.min_qwidth),
- np.float32(self.qrange[0]),
- np.float32(self.qrange[1]),
- self.nearest_mc_gpu,
np.int32(self.min_bin_content),
+ self.map_channel_id_to_hit_offset_gpu,
+ self.work_queues,
block=(nthreads_per_block,1,1),
- grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
+ grid=(self.event_hit_gpu.size//nthreads_per_block+1,1))
+ self.gpu_funcs.accumulate_nearest_neighbor_block(np.int32(self.event_nhit),
+ np.int32(gpuchannels.ndaq),
+ self.map_hit_offset_to_channel_id_gpu,
+ self.work_queues,
+ self.event_time_gpu,
+ gpuchannels.t,
+ self.nearest_mc_gpu,
+ np.int32(self.min_bin_content),
+ block=(nthreads_per_block,1,1),
+ grid=(self.event_nhit,1))
+ cuda.Context.get_current().synchronize()
def get_pdf_eval(self):
evhit = self.event_hit_gpu.get().astype(bool)
@@ -325,14 +346,16 @@ class GPUPDF(object):
# PDF value for low stats bins
low_stats = ~high_stats & (hitcount > 0) & evhit
- nearest_mc = self.nearest_mc_gpu.get().reshape((len(hitcount), self.min_bin_content))
+ nearest_mc_by_hit = self.nearest_mc_gpu.get().reshape((self.event_nhit, self.min_bin_content))
+ nearest_mc = np.empty(shape=(len(hitcount), self.min_bin_content), dtype=np.float32)
+ nearest_mc.fill(1e9)
+ nearest_mc[self.map_hit_offset_to_channel_id,:] = nearest_mc_by_hit
# Deal with the case where we did not even get min_bin_content events
# in the PDF but also clamp the lower range to ensure we don't index
# by a negative number in 2 lines
last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1)
distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry]
-
if low_stats.any():
if self.time_only:
pdf_value[low_stats] = (last_valid_entry[low_stats] + 1).astype(float) / hitcount[low_stats] / distance[low_stats] / 2.0
@@ -344,5 +367,4 @@ class GPUPDF(object):
# PDFs with no stats got zero by default during array creation
print 'high_stats:', high_stats.sum(), 'low_stats', low_stats.sum()
-
return hitcount, pdf_value, pdf_value * pdf_frac_uncert
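
The low-stats branch of get_pdf_eval is a k-nearest-neighbor density estimate: with k MC hit times within +/- d of the observed time out of hitcount tries, the density is roughly k / (hitcount * 2d). A self-contained sketch of that estimate (illustrative names, not code from the commit):

    import numpy as np

    def knn_time_density(event_time, mc_times, min_bin_content):
        distances = np.sort(np.abs(np.asarray(mc_times) - event_time))
        k = min(len(distances), min_bin_content)  # last_valid_entry + 1
        d = distances[k - 1]                      # radius enclosing k samples
        return k / float(len(mc_times)) / (2.0 * d)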
diff --git a/chroma/likelihood.py b/chroma/likelihood.py
index 5e8e678..a29dc6c 100644
--- a/chroma/likelihood.py
+++ b/chroma/likelihood.py
@@ -57,11 +57,12 @@ class Likelihood(object):
hitcount, pdf_prob, pdf_prob_uncert = \
self.sim.eval_pdf(self.event.channels,
vertex_generator,
- 0.1, self.trange,
+ 0.2, self.trange,
1, self.qrange,
nreps=nreps,
ndaq=ndaq,
- time_only=self.time_only)
+ time_only=self.time_only,
+ min_bin_content=ndaq*10)
# Normalize probabilities and put a floor to keep the log finite
hit_prob = hitcount.astype(np.float32) / ntotal
@@ -80,7 +81,6 @@ class Likelihood(object):
return hit_prob, pdf_prob, pdf_prob_uncert
- @profile_if_possible
def eval(self, vertex_generator, nevals, nreps=16, ndaq=50):
"""
Return the negative log likelihood that the detector event set in the
@@ -100,9 +100,8 @@ class Likelihood(object):
# Then include the probability densities of the observed
# charges and times.
- hit_pdf_ufloat = unumpy.uarray((pdf_prob[self.event.channels.hit],
- pdf_prob_uncert[self.event.channels.hit]))
- log_likelihood += unumpy.log(hit_pdf_ufloat).sum()
+ log_likelihood += ufloat((np.log(pdf_prob[self.event.channels.hit]).sum(),
+ 0.0))
return -log_likelihood
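
Note the simplification here: per-channel PDF uncertainties are no longer propagated through unumpy; the new code sums plain logs and wraps the total in a zero-uncertainty ufloat. A minimal sketch of the new term (the floor from the "keep the log finite" comment is applied upstream of this expression; the value below is a placeholder):

    import numpy as np

    def pdf_log_term(pdf_prob, hit_mask, floor=1e-30):
        # placeholder floor -- the commit only shows the comment about it
        return np.log(np.maximum(pdf_prob[hit_mask], floor)).sum()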
diff --git a/chroma/sim.py b/chroma/sim.py
index 93bcdfb..a82f1f8 100644
--- a/chroma/sim.py
+++ b/chroma/sim.py
@@ -9,13 +9,14 @@ from chroma import event
from chroma import itertoolset
from chroma.tools import profile_if_possible
+import pycuda.driver as cuda
+
def pick_seed():
"""Returns a seed for a random number generator selected using
a mixture of the current time and the current process ID."""
return int(time.time()) ^ (os.getpid() << 16)
class Simulation(object):
- @profile_if_possible
def __init__(self, detector, seed=None, cuda_device=None,
geant4_processes=4, nthreads_per_block=64, max_blocks=1024):
self.detector = detector
@@ -122,9 +123,12 @@ class Simulation(object):
return self.gpu_pdf.get_pdfs()
- def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=50, nreps=1, ndaq=1, time_only=True):
+ @profile_if_possible
+ def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=100, nreps=1, ndaq=1, time_only=True):
"""Returns tuple: 1D array of channel hit counts, 1D array of PDF
probability densities."""
+ gpu_daq = gpu.GPUDaq(self.gpu_geometry, ndaq=ndaq)
+
self.gpu_pdf.setup_pdf_eval(event_channels.hit,
event_channels.t,
event_channels.q,
@@ -145,10 +149,16 @@ class Simulation(object):
gpu_photons.propagate(self.gpu_geometry, self.rng_states,
nthreads_per_block=self.nthreads_per_block,
max_blocks=self.max_blocks)
- for gpu_photon_slice in gpu_photons.iterate_copies():
- for idaq in xrange(ndaq):
- gpu_channels = self.gpu_daq.acquire(gpu_photon_slice, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
- self.gpu_pdf.accumulate_pdf_eval(gpu_channels)
+ nphotons = gpu_photons.true_nphotons
+ for i in xrange(gpu_photons.ncopies):
+ start_photon = i * nphotons
+ gpu_photon_slice = gpu_photons.select(event.SURFACE_DETECT,
+ start_photon=start_photon,
+ nphotons=nphotons)
+ if len(gpu_photon_slice) == 0:
+ continue
+ gpu_channels = gpu_daq.acquire(gpu_photon_slice, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
+ self.gpu_pdf.accumulate_pdf_eval(gpu_channels, 32)
return self.gpu_pdf.get_pdf_eval()
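
An illustrative end-to-end call of the reworked eval_pdf, with min_twidth=0.2 and min_bin_content=ndaq*10 as in likelihood.py above (detector, data_event, and vertex_generator are assumed to exist; the time and charge ranges are placeholders, not values from this commit):

    ndaq = 100
    sim = Simulation(detector, geant4_processes=0)
    hitcount, pdf_prob, pdf_prob_uncert = sim.eval_pdf(
        data_event.channels, vertex_generator,
        0.2, (-0.5, 100.0),      # min_twidth [ns], trange
        1, (-0.5, 10.0),         # min_qwidth, qrange (unused when time_only)
        min_bin_content=ndaq*10, nreps=16, ndaq=ndaq, time_only=True)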