4 files changed, 422 insertions, 5 deletions
diff --git a/chroma/cuda/pdf.cu b/chroma/cuda/pdf.cu
index c9c2ea4..9f547d0 100644
--- a/chroma/cuda/pdf.cu
+++ b/chroma/cuda/pdf.cu
@@ -122,4 +122,143 @@ accumulate_pdf_eval(int time_only, int nchannels, unsigned int *event_hit,
     nearest_mc[offset+entry+1] = distance;
 }
 
+
+__global__ void
+accumulate_moments(int time_only, int nchannels,
+		   float *mc_time,
+		   unsigned int *mc_charge, // quirk of DAQ!
+		   float tmin, float tmax,
+		   float qmin, float qmax,
+		   unsigned int *mom0,
+		   float *t_mom1,
+		   float *t_mom2,
+		   float *q_mom1,
+		   float *q_mom2)
+
+{
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+	
+    if (id >= nchannels)
+	return;
+	
+    // Was this channel hit in the Monte Carlo?
+    float channel_mc_time = mc_time[id];
+
+    // Is this channel inside the range of the PDF?
+    if (time_only) {
+	if (channel_mc_time < tmin || channel_mc_time > tmax)
+	    return;  // Nothing else to do
+	
+	mom0[id] += 1;
+	t_mom1[id] += channel_mc_time;
+	t_mom2[id] += channel_mc_time*channel_mc_time;
+    }
+    else { // time and charge PDF
+	float channel_mc_charge = mc_charge[id]; // int->float conversion because DAQ just returns an integer
+	
+	if (channel_mc_time < tmin || channel_mc_time > tmax ||
+	    channel_mc_charge < qmin || channel_mc_charge > qmax)
+	    return;  // Nothing else to do
+
+	mom0[id] += 1;
+	t_mom1[id] += channel_mc_time;
+	t_mom2[id] += channel_mc_time*channel_mc_time;
+	q_mom1[id] += channel_mc_charge;
+	q_mom2[id] += channel_mc_charge*channel_mc_charge;
+    }
+}
+
+static const float invroot2 = 0.70710678118654746f; // 1/sqrt(2)
+static const float rootPiBy2 = 1.2533141373155001f; // sqrt(M_PI/2)
+
+__global__ void
+accumulate_kernel_eval(int time_only, int nchannels, unsigned int *event_hit,
+		       float *event_time, float *event_charge, float *mc_time,
+		       unsigned int *mc_charge, // quirk of DAQ!
+		       float tmin, float tmax,
+		       float qmin, float qmax,
+		       float *inv_time_bandwidths,
+		       float *inv_charge_bandwidths,
+		       unsigned int *hitcount,
+		       float *pdf_values)
+{
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+	
+    if (id >= nchannels)
+	return;
+	
+    // Was this channel hit in the Monte Carlo?
+    float channel_mc_time = mc_time[id];
+	
+    // Is this channel inside the range of the PDF?
+    if (time_only) {
+	if (channel_mc_time < tmin || channel_mc_time > tmax)
+	    return;  // Nothing else to do
+		
+	// This MC information is contained in the PDF
+	hitcount[id] += 1;
+	  
+	// Was this channel hit in the event-of-interest?
+	int channel_event_hit = event_hit[id];
+	if (!channel_event_hit)
+	    return; // No need to update PDF value for unhit channel
+
+	// Kernel argument
+	float channel_event_time = event_time[id];
+	float inv_bandwidth = inv_time_bandwidths[id];
+	float arg = (channel_mc_time - channel_event_time) * inv_bandwidth;
+
+	// evaluate 1D Gaussian normalized within time window
+	float term = expf(-0.5f * arg * arg) * inv_bandwidth;
+
+	float norm = tmax - tmin;
+	if (inv_bandwidth > 0.0f) {
+	  float loarg = (tmin - channel_mc_time)*inv_bandwidth*invroot2;
+	  float hiarg = (tmax - channel_mc_time)*inv_bandwidth*invroot2;
+	  norm = (erff(hiarg) - erff(loarg)) * rootPiBy2;
+	}
+	pdf_values[id] += term / norm;
+    }
+    else { // time and charge PDF
+	float channel_mc_charge = mc_charge[id]; // int->float conversion because DAQ just returns an integer
+	
+	if (channel_mc_time < tmin || channel_mc_time > tmax ||
+	    channel_mc_charge < qmin || channel_mc_charge > qmax)
+	    return;  // Nothing else to do
+		
+	// This MC information is contained in the PDF
+	hitcount[id] += 1;
+	  
+	// Was this channel hit in the event-of-interest?
+	int channel_event_hit = event_hit[id];
+	if (!channel_event_hit)
+	    return; // No need to update PDF value for unhit channel
+
+
+	// Kernel argument: time dim
+	float channel_event_obs = event_time[id];
+	float inv_bandwidth = inv_time_bandwidths[id];
+	float arg = (channel_mc_time - channel_event_obs) * inv_bandwidth;
+
+	float loarg = (tmin - channel_mc_time)*inv_bandwidth*invroot2;
+	float hiarg = (tmax - channel_mc_time)*inv_bandwidth*invroot2;
+	float norm = (erff(hiarg) - erff(loarg)) * rootPiBy2;
+
+	// Kernel argument: charge dim
+	channel_event_obs = event_time[id];
+	inv_bandwidth = inv_time_bandwidths[id];
+	arg *= (channel_mc_charge - channel_event_obs) * inv_bandwidth;
+
+	loarg = (tmin - channel_mc_charge)*inv_bandwidth*invroot2;
+	hiarg = (tmax - channel_mc_charge)*inv_bandwidth*invroot2;
+	norm *= (erff(hiarg) - erff(loarg)) * rootPiBy2;
+
+	// evaluate 2D Gaussian normalized within time window
+	float term = expf(-0.5f * arg * arg);
+
+	pdf_values[id] += term / norm;
+    }
+}
+
+
 } // extern "C"
diff --git a/chroma/gpu/pdf.py b/chroma/gpu/pdf.py
index 0ad78d2..ba0d2af 100644
--- a/chroma/gpu/pdf.py
+++ b/chroma/gpu/pdf.py
@@ -3,6 +3,161 @@ from pycuda import gpuarray as ga
 
 from chroma.gpu.tools import get_cu_module, cuda_options, GPUFuncs
 
+class GPUKernelPDF(object):
+    def __init__(self):
+        self.module = get_cu_module('pdf.cu', options=cuda_options,
+                                    include_source_directory=False)
+        self.gpu_funcs = GPUFuncs(self.module)
+
+    def setup_moments(self, nchannels, trange, qrange, time_only=True):
+        """Setup GPU arrays to accumulate moments and eventually
+        compute a kernel estimate of PDF values for each hit channel.
+
+            trange: (float, float)
+              Range of time dimension in PDF
+            qrange: (float, float)
+              Range of charge dimension in PDF
+            time_only: bool
+              If True, only the time observable will be used in the PDF.
+        """
+        self.hitcount_gpu = ga.zeros(nchannels, dtype=np.uint32)
+        self.tmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
+        self.tmom2_gpu = ga.zeros(nchannels, dtype=np.float32)
+        self.qmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
+        self.qmom2_gpu = ga.zeros(nchannels, dtype=np.float32)
+
+        self.trange = trange
+        self.qrange = qrange
+        self.time_only = time_only
+
+    def clear_moments(self):
+        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
+        self.hitcount_gpu.fill(0)
+        self.tmom1_gpu.fill(0.0)
+        self.tmom2_gpu.fill(0.0)
+        self.qmom1_gpu.fill(0.0)
+        self.qmom2_gpu.fill(0.0)
+
+    def accumulate_moments(self, gpuchannels, nthreads_per_block=64):
+        """Add the most recent results of run_daq() to the accumulate of 
+        moments for future bandwidth calculation."""
+        self.gpu_funcs.accumulate_moments(np.int32(self.time_only),
+                                          np.int32(len(gpuchannels.t)),
+                                          gpuchannels.t,
+                                          gpuchannels.q,
+                                          np.float32(self.trange[0]),
+                                          np.float32(self.trange[1]),
+                                          np.float32(self.qrange[0]),
+                                          np.float32(self.qrange[1]),
+                                          self.hitcount_gpu,
+                                          self.tmom1_gpu,
+                                          self.tmom2_gpu,
+                                          self.qmom1_gpu,
+                                          self.qmom2_gpu,
+                                          block=(nthreads_per_block,1,1), 
+                                          grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
+        
+    def compute_bandwidth(self, scale_factor=1.0):
+        """Use the MC information accumulated by accumulate_moments() to
+        estimate the best bandwidth to use when kernel estimating."""
+
+        hitcount = self.hitcount_gpu.get()
+        mom0 = np.maximum(hitcount, 1)
+        tmom1 = self.tmom1_gpu.get()
+        tmom2 = self.tmom2_gpu.get()
+
+        tmean = tmom1 / mom0
+        tvar = np.maximum(tmom2 / mom0 - tmean**2, 0.0) # roundoff can go neg
+        trms = tvar**0.5
+
+        if self.time_only:
+            d = 1
+        else:
+            d = 2
+        dimensionality_factor = ((4.0/(d+2)) / (mom0/scale_factor))**(-1.0/(d+4))
+                                 
+        time_bandwidths = dimensionality_factor * trms 
+        inv_time_bandwidths = np.zeros_like(time_bandwidths)
+        inv_time_bandwidths[time_bandwidths  > 0] = time_bandwidths[time_bandwidths > 0] ** -1
+        #inv_time_bandwidths /= 4.0
+
+        # precompute inverse to speed up GPU evaluation
+        self.inv_time_bandwidths_gpu = ga.to_gpu(
+            inv_time_bandwidths.astype(np.float32)
+            )
+
+        # Compute charge bandwidths if needed
+        if self.time_only:
+            self.inv_charge_bandwidths_gpu = ga.empty_like(
+                self.inv_time_bandwidths_gpu
+                )
+            self.inv_charge_bandwidths_gpu.fill(0.0)
+        else:
+            qmom1 = self.qmom1_gpu.get()
+            qmom2 = self.qmom2_gpu.get()
+
+            qmean = qmom1 / mom0
+            qrms = (qmom2 / mom0 - qmean**2)**0.5
+            charge_bandwidths = dimensionality_factor * qrms
+
+            # precompute inverse to speed up GPU evaluation
+            self.inv_charge_bandwidths_gpu = ga.to_gpu( 
+                (charge_bandwidths**-1).astype(np.float32)
+                )
+
+    def setup_kernel(self, event_hit, event_time, event_charge):
+        """Setup GPU arrays to accumulate moments and eventually
+        compute a kernel estimate of PDF values for each hit channel.
+
+            event_hit: ndarray
+              Hit or not-hit status for each channel in the detector.
+            event_time: ndarray
+              Hit time for each channel in the detector.  If channel 
+              not hit, the time will be ignored.
+            event_charge: ndarray
+              Integrated charge for each channel in the detector.
+              If channel not hit, the charge will be ignored.
+        """
+        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
+        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
+        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))
+        self.hitcount_gpu.fill(0)
+        self.pdf_values_gpu = ga.zeros(len(event_hit), dtype=np.float32)
+
+    def clear_kernel(self):
+        self.hitcount_gpu.fill(0)
+        self.pdf_values_gpu.fill(0.0)
+            
+    def accumulate_kernel(self, gpuchannels, nthreads_per_block=64):
+        "Add the most recent results of run_daq() to the kernel PDF evaluation."
+        self.gpu_funcs.accumulate_kernel_eval(np.int32(self.time_only),
+                                              np.int32(len(self.event_hit_gpu)),
+                                              self.event_hit_gpu,
+                                              self.event_time_gpu,
+                                              self.event_charge_gpu,
+                                              gpuchannels.t,
+                                              gpuchannels.q,
+                                              np.float32(self.trange[0]),
+                                              np.float32(self.trange[1]),
+                                              np.float32(self.qrange[0]),
+                                              np.float32(self.qrange[1]),
+                                              self.inv_time_bandwidths_gpu,
+                                              self.inv_charge_bandwidths_gpu,
+                                              self.hitcount_gpu,
+                                              self.pdf_values_gpu,
+                                              block=(nthreads_per_block,1,1), 
+                                              grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
+
+
+    def get_kernel_eval(self):
+        evhit = self.event_hit_gpu.get().astype(bool)
+        hitcount = self.hitcount_gpu.get()
+
+        pdf_values = self.pdf_values_gpu.get()
+        pdf_values /= np.maximum(1, hitcount) # avoid divide by zero
+
+        return hitcount, pdf_values, np.zeros_like(pdf_values)
+
 class GPUPDF(object):
     def __init__(self):
         self.module = get_cu_module('pdf.cu', options=cuda_options,
diff --git a/chroma/likelihood.py b/chroma/likelihood.py
index 49b04a7..c69dc72 100644
--- a/chroma/likelihood.py
+++ b/chroma/likelihood.py
@@ -1,4 +1,5 @@
 import numpy as np
+from math import sqrt
 from uncertainties import ufloat, unumpy
 from itertools import islice, izip, repeat
 from chroma.tools import profile_if_possible
@@ -78,20 +79,89 @@ class Likelihood(object):
 
         # NLL calculation: note that negation is at the end
         # Start with the probabilties of hitting (or not) the channels
-        hit_channel_prob = np.log(hit_prob[self.event.channels.hit]).sum() + np.log(1.0-hit_prob[~self.event.channels.hit]).sum()
+        hit_channel_prob = np.log(hit_prob[self.event.channels.hit]).sum() + np.log(1.0-hit_prob[~self.event.channels.hit])[1:].sum()
         hit_channel_prob_uncert = ( (ntotal * hit_prob * (1.0 - hit_prob)) / hit_prob**2 ).sum()**0.5
         log_likelihood = ufloat((hit_channel_prob, 0.0))
+        log_likelihood = ufloat((0.0,0.0)) # FIXME: skipping hit/not hit probabilities for now
 
         # Then include the probability densities of the observed
         # charges and times.
         hit_pdf_ufloat = unumpy.uarray((pdf_prob[self.event.channels.hit],
                                         pdf_prob_uncert[self.event.channels.hit]))
         log_likelihood += unumpy.log(hit_pdf_ufloat).sum()
+        
+        return -log_likelihood, pdf_prob
+
+    def setup_kernel(self, vertex_generator, nevals, nreps, ndaq, oversample_factor):
+        bandwidth_generator = islice(vertex_generator, nevals*oversample_factor)
 
-        return -log_likelihood
+        self.sim.setup_kernel(len(self.event.channels.hit),
+                              bandwidth_generator,
+                              self.trange,
+                              self.qrange,
+                              nreps=nreps,
+                              ndaq=ndaq,
+                              time_only=self.time_only,
+                              scale_factor=oversample_factor)
+        
+
+    @profile_if_possible
+    def eval_kernel(self, vertex_generator, nevals, nreps=16, ndaq=50, navg=10):
+        """
+        Return the negative log likelihood that the detector event set in the
+        constructor or by set_event() was the result of a particle generated
+        by `vertex_generator`.  If `nreps` set to > 1, each set of photon
+        vertices will be propagated `nreps` times.
+        """
+        ntotal = nevals * nreps * ndaq
+
+        mom1 = 0.0
+        mom2 = 0.0
+        for i in xrange(navg):
+            kernel_generator = islice(vertex_generator, nevals)
+            hitcount, pdf_prob, pdf_prob_uncert = \
+                self.sim.eval_kernel(self.event.channels,
+                                     kernel_generator,
+                                     self.trange, 
+                                     self.qrange,
+                                     nreps=nreps,
+                                     ndaq=ndaq,
+                                     time_only=self.time_only)
+        
+            # Normalize probabilities and put a floor to keep the log finite
+            hit_prob = hitcount.astype(np.float32) / ntotal
+            hit_prob = np.maximum(hit_prob, 0.5 / ntotal)
+
+            # Set all zero or nan probabilities to limiting PDF value
+            bad_value = (pdf_prob <= 0.0) | np.isnan(pdf_prob)
+            if self.time_only:
+                pdf_floor = 1.0 / (self.trange[1] - self.trange[0])
+            else:
+                pdf_floor = 1.0 / (self.trange[1] - self.trange[0]) / (self.qrange[1] - self.qrange[0])
+            pdf_prob[bad_value] = pdf_floor
+            pdf_prob_uncert[bad_value] = pdf_floor
+
+            print 'channels with no data:', (bad_value & self.event.channels.hit).astype(int).sum()
+
+            # NLL calculation: note that negation is at the end
+            # Start with the probabilties of hitting (or not) the channels
+            log_likelihood = np.log(hit_prob[self.event.channels.hit]).sum() + np.log(1.0-hit_prob[~self.event.channels.hit])[1:].sum()
+            log_likelihood = 0.0 # FIXME: Skipping hit/not-hit probabilities for now
+
+            # Then include the probability densities of the observed
+            # charges and times.
+            log_likelihood += np.log(pdf_prob[self.event.channels.hit]).sum()
+            print 'll', log_likelihood
+            mom1 += log_likelihood
+            mom2 += log_likelihood**2
+        
+        avg_like = mom1 / navg
+        rms_like = (mom2 / navg - avg_like**2)**0.5
+        # Don't forget to return a negative log likelihood
+        return ufloat((-avg_like, rms_like/sqrt(navg))), pdf_prob
 
 if __name__ == '__main__':
-    from chroma import detectors
+    from chroma.demo import detector as build_detector
     from chroma.sim import Simulation
     from chroma.generator import constant_particle_gun
     from chroma import tools
@@ -99,7 +169,7 @@ if __name__ == '__main__':
 
     tools.enable_debug_on_crash()
 
-    detector = detectors.find('lbne')
+    detector = build_detector()
     sim = Simulation(detector, seed=0)
 
     event = sim.simulate(islice(constant_particle_gun('e-',(0,0,0),(1,0,0),100.0), 1)).next()
diff --git a/chroma/sim.py b/chroma/sim.py
index 2e8f2f9..d15c02d 100644
--- a/chroma/sim.py
+++ b/chroma/sim.py
@@ -44,6 +44,7 @@ class Simulation(object):
         self.gpu_geometry = gpu.GPUGeometry(detector)
         self.gpu_daq = gpu.GPUDaq(self.gpu_geometry, max(self.detector.pmtids))
         self.gpu_pdf = gpu.GPUPDF()
+        self.gpu_pdf_kernel = gpu.GPUKernelPDF()
 
         self.rng_states = gpu.get_rng_states(self.nthreads_per_block*self.max_blocks, seed=self.seed)
 
@@ -115,7 +116,7 @@ class Simulation(object):
         
         return self.gpu_pdf.get_pdfs()
 
-    def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=20, nreps=1, ndaq=1, time_only=True):
+    def eval_pdf(self, event_channels, iterable, min_twidth, trange, min_qwidth, qrange, min_bin_content=200, nreps=1, ndaq=1, time_only=True):
         """Returns tuple: 1D array of channel hit counts, 1D array of PDF
         probability densities."""
         self.gpu_pdf.setup_pdf_eval(event_channels.hit,
@@ -145,5 +146,57 @@ class Simulation(object):
         
         return self.gpu_pdf.get_pdf_eval()
 
+    def setup_kernel(self, nchannels, bandwidth_iterable,
+                         trange, qrange, 
+                         nreps=1, ndaq=1, time_only=True, scale_factor=1.0):
+        '''Call this before calling eval_pdf_kernel().  Sets up the
+        event information and computes an appropriate kernel bandwidth'''
+        self.gpu_pdf_kernel.setup_moments(nchannels, trange, qrange,
+                                          time_only=True)
+        # Compute bandwidth
+        first_element, bandwidth_iterable = itertoolset.peek(bandwidth_iterable)
+        if isinstance(first_element, event.Event):
+            bandwidth_iterable = \
+                self.photon_generator.generate_events(bandwidth_iterable)
+        for ev in bandwidth_iterable:
+            gpu_photons = gpu.GPUPhotons(ev.photons_beg, ncopies=nreps)
+            gpu_photons.propagate(self.gpu_geometry, self.rng_states,
+                                  nthreads_per_block=self.nthreads_per_block,
+                                  max_blocks=self.max_blocks)
+            for gpu_photon_slice in gpu_photons.iterate_copies():
+                for idaq in xrange(ndaq):
+                    gpu_channels = self.gpu_daq.acquire(gpu_photon_slice, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
+                    self.gpu_pdf_kernel.accumulate_moments(gpu_channels)
+            
+        self.gpu_pdf_kernel.compute_bandwidth(scale_factor=scale_factor)
+
+    def eval_kernel(self, event_channels, 
+                    kernel_iterable,
+                    trange, qrange, 
+                    nreps=1, ndaq=1, naverage=1, time_only=True):
+        """Returns tuple: 1D array of channel hit counts, 1D array of PDF
+        probability densities."""
+
+        self.gpu_pdf_kernel.setup_kernel(event_channels.hit,
+                                         event_channels.t,
+                                         event_channels.q)
+        first_element, kernel_iterable = itertoolset.peek(kernel_iterable)
+        if isinstance(first_element, event.Event):
+            kernel_iterable = \
+                self.photon_generator.generate_events(kernel_iterable)
+
+        # Evaluate likelihood using this bandwidth
+        for ev in kernel_iterable:
+            gpu_photons = gpu.GPUPhotons(ev.photons_beg, ncopies=nreps)
+            gpu_photons.propagate(self.gpu_geometry, self.rng_states,
+                                  nthreads_per_block=self.nthreads_per_block,
+                                  max_blocks=self.max_blocks)
+            for gpu_photon_slice in gpu_photons.iterate_copies():
+                for idaq in xrange(ndaq):
+                    gpu_channels = self.gpu_daq.acquire(gpu_photon_slice, self.rng_states, nthreads_per_block=self.nthreads_per_block, max_blocks=self.max_blocks)
+                    self.gpu_pdf_kernel.accumulate_kernel(gpu_channels)
+        
+        return self.gpu_pdf_kernel.get_kernel_eval()
+
     def __del__(self):
         self.context.pop()