4 files changed, 178 insertions, 64 deletions
diff --git a/benchmark.py b/benchmark.py
new file mode 100644
index 0000000..0f72811
--- /dev/null
+++ b/benchmark.py
@@ -0,0 +1,99 @@
+import numpy as np
+from pycuda import gpuarray as ga
+import time
+from uncertainties import ufloat
+import sys
+
+from chroma.gpu import GPU, to_float3
+from chroma.camera import get_rays
+from chroma.event import Photons
+from chroma.sample import uniform_sphere
+
+def progress(seq):
+    "Print progress while iterating over `seq`."
+    n = len(seq)
+    print '[' + ' '*21 + ']\r[',
+    sys.stdout.flush()
+    for i, item in enumerate(seq):
+        if i % (n//10) == 0:
+            print '.',
+            sys.stdout.flush()
+        yield item
+    print ']'
+    sys.stdout.flush()
+
+def ray_trace(gpu, number=1000):
+    """
+    Return the number of mean and standard deviation of the number of ray
+    intersections per second as a ufloat for the geometry loaded onto `gpu`.
+
+    .. note::
+        The rays are thrown from a camera sitting *outside* of the geometry.
+
+    Args:
+        - gpu, chroma.gpu.GPU
+            The GPU object with a geometry already loaded.
+        - number, int
+            The number of kernel calls to average.
+    """
+    lb, ub = gpu.geometry.mesh.get_bounds()
+    scale = np.linalg.norm(ub-lb)
+    point = [0,scale,(lb[2]+ub[2])/2]
+
+    size = (800,600)
+    width, height = size
+
+    origins, directions = get_rays(point, size, 0.035, focal_length=0.018)
+
+    origins_gpu = ga.to_gpu(to_float3(origins))
+    directions_gpu = ga.to_gpu(to_float3(directions))
+    pixels_gpu = ga.zeros(width*height, dtype=np.int32)
+
+    run_times = []
+    for i in progress(range(number)):
+        t0 = time.time()
+        gpu.kernels.ray_trace(np.int32(pixels_gpu.size), origins_gpu, directions_gpu, pixels_gpu, block=(gpu.nthreads_per_block,1,1), grid=(pixels_gpu.size//gpu.nthreads_per_block+1,1))
+        gpu.context.synchronize()
+        elapsed = time.time() - t0
+        run_times.append(elapsed)
+
+    return pixels_gpu.size/ufloat((np.mean(run_times),np.std(run_times)))
+
+def propagate(gpu, number=10, nphotons=500000):
+    """
+    Return the mean and standard deviation of the number of photons propagated
+    per second as a ufloat for the geometry loaded onto `gpu`.
+
+    Args:
+        - gpu, chroma.gpu.GPU
+            The GPU object with a geometry already loaded.
+        - number, int
+            The number of kernel calls to average.
+        - nphotons, int
+            The number of photons to propagate per kernel call.
+    """
+    gpu.setup_propagate()
+
+    run_times = []
+    for i in progress(range(number)):
+        photons = Photons(np.zeros((nphotons,3)), uniform_sphere(nphotons), np.random.uniform(400,800,size=nphotons))
+        gpu.load_photons(photons)
+        t0 = time.time()
+        gpu.propagate()
+        gpu.context.synchronize()
+        elapsed = time.time() - t0
+        run_times.append(elapsed)
+
+    return nphotons/ufloat((np.mean(run_times),np.std(run_times)))
+
+if __name__ == '__main__':
+    from chroma.detectors import build_lbne_200kton, build_minilbne
+
+    lbne = build_lbne_200kton()
+    lbne.build(bits=11)
+
+    gpu = GPU()
+    gpu.load_geometry(lbne, print_usage=False)
+
+    print '%s track steps/s' % ray_trace(gpu)
+    print '%s photons/s' % propagate(gpu)
diff --git a/camera.py b/camera.py
index 609d9f0..03b07f5 100755
--- a/camera.py
+++ b/camera.py
@@ -136,8 +136,6 @@ class Camera(Thread):
         self.gpu = GPU(self.device_id)
         self.gpu.load_geometry(geometry)
 
-        self.kernels = CUDAFuncs(self.gpu.module, ['ray_trace', 'ray_trace_alpha', 'rotate', 'rotate_around_point', 'translate', 'update_xyz_lookup', 'update_xyz_image', 'process_image', 'init_rng'])
-
         self.width, self.height = size
 
         pygame.init()
@@ -180,7 +178,7 @@ class Camera(Thread):
     @timeit
     def initialize_render(self):
         self.rng_states_gpu = cuda.mem_alloc(self.width*self.height*sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
-        self.kernels.init_rng(np.int32(self.width*self.height), self.rng_states_gpu, np.int32(0), np.int32(0), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1))
+        self.gpu.kernels.init_rng(np.int32(self.width*self.height), self.rng_states_gpu, np.int32(0), np.int32(0), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1))
         self.xyz_lookup1_gpu = gpuarray.zeros(len(self.geometry.mesh.triangles), dtype=gpuarray.vec.float3)
         self.xyz_lookup2_gpu = gpuarray.zeros(len(self.geometry.mesh.triangles), dtype=gpuarray.vec.float3)
         self.image_gpu = gpuarray.zeros(self.width*self.height, dtype=gpuarray.vec.float3)
@@ -200,13 +198,13 @@ class Camera(Thread):
 
     def update_xyz_lookup(self, source_position):
         for i in range(self.xyz_lookup1_gpu.size//(self.width*self.height)+1):
-            self.kernels.update_xyz_lookup(np.int32(self.width*self.height), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.width*self.height), gpuarray.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(685.0), gpuarray.vec.make_float3(1.0,0.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1))
+            self.gpu.kernels.update_xyz_lookup(np.int32(self.width*self.height), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.width*self.height), gpuarray.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(685.0), gpuarray.vec.make_float3(1.0,0.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1))
 
         for i in range(self.xyz_lookup1_gpu.size//(self.width*self.height)+1):
-            self.kernels.update_xyz_lookup(np.int32(self.width*self.height), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.width*self.height), gpuarray.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(545.0), gpuarray.vec.make_float3(0.0,1.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1))
+            self.gpu.kernels.update_xyz_lookup(np.int32(self.width*self.height), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.width*self.height), gpuarray.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(545.0), gpuarray.vec.make_float3(0.0,1.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1))
 
         for i in range(self.xyz_lookup1_gpu.size//(self.width*self.height)+1):
-            self.kernels.update_xyz_lookup(np.int32(self.width*self.height), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.width*self.height), gpuarray.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(445.0), gpuarray.vec.make_float3(0.0,0.0,1.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1))
+            self.gpu.kernels.update_xyz_lookup(np.int32(self.width*self.height), np.int32(self.xyz_lookup1_gpu.size), np.int32(i*self.width*self.height), gpuarray.vec.make_float3(*source_position), self.rng_states_gpu, np.float32(445.0), gpuarray.vec.make_float3(0.0,0.0,1.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1))
 
         self.nlookup_calls += 1
 
@@ -216,16 +214,16 @@ class Camera(Thread):
         self.nimages = 0
 
     def update_image(self):
-        self.kernels.update_xyz_image(np.int32(self.width*self.height), self.rng_states_gpu, self.origins_gpu, self.directions_gpu, np.float32(685.0), gpuarray.vec.make_float3(1.0,0.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, self.image_gpu, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1))
+        self.gpu.kernels.update_xyz_image(np.int32(self.width*self.height), self.rng_states_gpu, self.origins_gpu, self.directions_gpu, np.float32(685.0), gpuarray.vec.make_float3(1.0,0.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, self.image_gpu, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1))
 
-        self.kernels.update_xyz_image(np.int32(self.width*self.height), self.rng_states_gpu, self.origins_gpu, self.directions_gpu, np.float32(545.0), gpuarray.vec.make_float3(0.0,1.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, self.image_gpu, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1))
+        self.gpu.kernels.update_xyz_image(np.int32(self.width*self.height), self.rng_states_gpu, self.origins_gpu, self.directions_gpu, np.float32(545.0), gpuarray.vec.make_float3(0.0,1.0,0.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, self.image_gpu, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1))
 
-        self.kernels.update_xyz_image(np.int32(self.width*self.height), self.rng_states_gpu, self.origins_gpu, self.directions_gpu, np.float32(445.0), gpuarray.vec.make_float3(0.0,0.0,1.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, self.image_gpu, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1))
+        self.gpu.kernels.update_xyz_image(np.int32(self.width*self.height), self.rng_states_gpu, self.origins_gpu, self.directions_gpu, np.float32(445.0), gpuarray.vec.make_float3(0.0,0.0,1.0), self.xyz_lookup1_gpu, self.xyz_lookup2_gpu, self.image_gpu, np.int32(self.nlookup_calls), np.int32(self.max_steps), block=(self.nblocks,1,1), grid=(self.width*self.height//self.nblocks+1,1))
 
         self.nimages += 1
 
     def process_image(self):
-        self.kernels.process_image(np.int32(self.width*self.height), self.image_gpu, self.pixels_gpu, np.int32(self.nimages), block=(self.nblocks,1,1), grid=((self.width*self.height)//self.nblocks+1,1))
+        self.gpu.kernels.process_image(np.int32(self.width*self.height), self.image_gpu, self.pixels_gpu, np.int32(self.nimages), block=(self.nblocks,1,1), grid=((self.width*self.height)//self.nblocks+1,1))
 
     def screenshot(self, dir='', start=0):
         root, ext = 'screenshot', 'png'
@@ -240,8 +238,8 @@ class Camera(Thread):
         print 'image saved to %s' % filename
 
     def rotate(self, phi, n):
-        self.kernels.rotate(np.int32(self.pixels_gpu.size), self.origins_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1))
-        self.kernels.rotate(np.int32(self.pixels_gpu.size), self.directions_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1))
+        self.gpu.kernels.rotate(np.int32(self.pixels_gpu.size), self.origins_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1))
+        self.gpu.kernels.rotate(np.int32(self.pixels_gpu.size), self.directions_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1))
 
         self.point = rotate(self.point, phi, n)
         self.axis1 = rotate(self.axis1, phi, n)
@@ -253,8 +251,8 @@ class Camera(Thread):
         self.update()
 
     def rotate_around_point(self, phi, n, point, redraw=True):
-        self.kernels.rotate_around_point(np.int32(self.origins_gpu.size), self.origins_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), gpuarray.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.origins_gpu.size//self.nblocks+1,1))
-        self.kernels.rotate(np.int32(self.directions_gpu.size), self.directions_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.directions_gpu.size//self.nblocks+1,1))
+        self.gpu.kernels.rotate_around_point(np.int32(self.origins_gpu.size), self.origins_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), gpuarray.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.origins_gpu.size//self.nblocks+1,1))
+        self.gpu.kernels.rotate(np.int32(self.directions_gpu.size), self.directions_gpu, np.float32(phi), gpuarray.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.directions_gpu.size//self.nblocks+1,1))
 
         self.axis1 = rotate(self.axis1, phi, n)
         self.axis2 = rotate(self.axis2, phi, n)
@@ -266,7 +264,7 @@ class Camera(Thread):
             self.update()
 
     def translate(self, v, redraw=True):
-        self.kernels.translate(np.int32(self.pixels_gpu.size), self.origins_gpu, gpuarray.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks,1))
+        self.gpu.kernels.translate(np.int32(self.pixels_gpu.size), self.origins_gpu, gpuarray.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks,1))
 
         self.point += v
 
@@ -284,9 +282,9 @@ class Camera(Thread):
             self.process_image()
         else:
             if self.alpha:
-                self.kernels.ray_trace_alpha(np.int32(self.pixels_gpu.size), self.origins_gpu, self.directions_gpu, self.pixels_gpu, block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1))
+                self.gpu.kernels.ray_trace_alpha(np.int32(self.pixels_gpu.size), self.origins_gpu, self.directions_gpu, self.pixels_gpu, block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1))
             else:
-                self.kernels.ray_trace(np.int32(self.pixels_gpu.size), self.origins_gpu, self.directions_gpu, self.pixels_gpu, block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1))
+                self.gpu.kernels.ray_trace(np.int32(self.pixels_gpu.size), self.origins_gpu, self.directions_gpu, self.pixels_gpu, block=(self.nblocks,1,1), grid=(self.pixels_gpu.size//self.nblocks+1,1))
 
         pygame.surfarray.blit_array(self.screen, self.pixels_gpu.get().reshape(self.size))
         if self.doom_mode:
@@ -624,4 +622,6 @@ if __name__ == '__main__':
         camera = EventViewer(geometry, options.io_file, size=size)
     else:
         camera = Camera(geometry, size)
+
     camera.start()
+    camera.join()
diff --git a/event.py b/event.py
index 5b93f17..657f414 100644
--- a/event.py
+++ b/event.py
@@ -1,13 +1,30 @@
 import numpy as np
 
+from chroma.sample import uniform_sphere
+
 class Photons(object):
-    def __init__(self, positions, directions, polarizations, times, wavelengths,
-                 last_hit_triangles=None, histories=None):
+    def __init__(self, positions, directions, wavelengths, polarizations=None, times=None, last_hit_triangles=None, histories=None):
         self.positions = positions
+
+        nphotons = len(positions)
+
+        assert len(directions) == nphotons
         self.directions = directions
-        self.polarizations = polarizations
-        self.times = times
+
+        assert len(wavelengths) == nphotons
         self.wavelengths = wavelengths
+
+        if polarizations is not None:
+            self.polarizations = polarizations
+        else:
+            # polarizations are isotropic in plane perpendicular
+            # to the direction vectors
+            polarizations = np.cross(directions, uniform_sphere(nphotons))
+            # normalize polarization vectors
+            polarizations /= np.tile(np.apply_along_axis(np.linalg.norm,1,polarizations),[3,1]).transpose()
+            self.polarizations = np.asarray(polarizations, order='C')
+
+        self.times = times
         self.last_hit_triangles = last_hit_triangles
         self.histories = histories
 
diff --git a/gpu.py b/gpu.py
index 4eb2fab..eeb759e 100644
--- a/gpu.py
+++ b/gpu.py
@@ -18,6 +18,9 @@ import event
 
 cuda.init()
 
+def to_float3(arr):
+    return arr.astype(np.float32).view(gpuarray.vec.float3)
+
 def boolean_argsort(condition):
     '''Returns two arrays of indicies indicating which elements of the
     boolean array `condition` should be exchanged so that all of the
@@ -84,28 +87,31 @@ class CUDAFuncs(object):
             setattr(self, name, cuda_module.get_function(name))
 
 class GPU(object):
-    def __init__(self, device_id=None):
+    def __init__(self, device_id=None, nthreads_per_block=64, max_blocks=1024):
         '''Initialize a GPU context on the specified device.  If device_id==None,
         the default device is used.'''
 
         if device_id is None:
             self.context = make_default_context()
         else:
-            print 'device_id = %s' % device_id
             device = cuda.Device(device_id)
             self.context = device.make_context()
+
         print 'device %s' % self.context.get_device().name()
 
+        self.nthreads_per_block = nthreads_per_block
+        self.max_blocks = max_blocks
+
         self.context.set_cache_config(cuda.func_cache.PREFER_L1)
 
         cuda_options = ['-I' + dirname(chroma.src.__file__), '--use_fast_math', '--ptxas-options=-v']
 
         self.module = SourceModule(chroma.src.kernel, options=cuda_options, no_extern_c=True)
+        self.kernels = CUDAFuncs(self.module, ['ray_trace', 'ray_trace_alpha', 'rotate', 'rotate_around_point', 'translate', 'update_xyz_lookup', 'update_xyz_image', 'process_image', 'init_rng'])
+
         self.geo_funcs = CUDAFuncs(self.module, ['set_wavelength_range', 'set_material', 'set_surface', 'set_global_mesh_variables', 'color_solids'])
 
         self.prop_funcs = CUDAFuncs(self.module, ['init_rng', 'propagate', 'swap'])
-        self.nthread_per_block = 64
-        self.max_blocks = 1024
         self.daq_module = SourceModule(chroma.src.daq, options=cuda_options, no_extern_c=True)
         self.daq_funcs = CUDAFuncs(self.daq_module, 
                               ['reset_earliest_time_int', 'run_daq',
@@ -121,7 +127,7 @@ class GPU(object):
         print format_array('node_map_end', self.node_map_end_gpu)
         print '%-15s %6s %6s' % ('total', '', format_size(self.vertices_gpu.nbytes + self.triangles_gpu.nbytes + self.lower_bounds_gpu.nbytes + self.upper_bounds_gpu.nbytes + self.node_map_gpu.nbytes + self.node_map_end_gpu.nbytes))
 
-    def load_geometry(self, geometry):
+    def load_geometry(self, geometry, print_usage=True):
         if not hasattr(geometry, 'mesh'):
             print 'WARNING: building geometry with 8-bits'
             geometry.build(bits=8)
@@ -159,7 +165,7 @@ class GPU(object):
 
             self.surfaces.append(surface)
 
-        self.vertices_gpu = gpuarray.to_gpu(geometry.mesh.vertices.astype(np.float32).view(gpuarray.vec.float3))
+        self.vertices_gpu = gpuarray.to_gpu(to_float3(geometry.mesh.vertices))
 
         triangles = \
             np.empty(len(geometry.mesh.triangles), dtype=gpuarray.vec.uint4)
@@ -169,17 +175,9 @@ class GPU(object):
         triangles['w'] = ((geometry.material1_index & 0xff) << 24) | ((geometry.material2_index & 0xff) << 16) | ((geometry.surface_index & 0xff) << 8)
         self.triangles_gpu = gpuarray.to_gpu(triangles)
 
-        lower_bounds_float3 = np.empty(geometry.lower_bounds.shape[0], dtype=gpuarray.vec.float3)
-        lower_bounds_float3['x'] = geometry.lower_bounds[:,0]
-        lower_bounds_float3['y'] = geometry.lower_bounds[:,1]
-        lower_bounds_float3['z'] = geometry.lower_bounds[:,2]
-        self.lower_bounds_gpu = gpuarray.to_gpu(lower_bounds_float3)
+        self.lower_bounds_gpu = gpuarray.to_gpu(to_float3(geometry.lower_bounds))
 
-        upper_bounds_float3 = np.empty(geometry.upper_bounds.shape[0], dtype=gpuarray.vec.float3)
-        upper_bounds_float3['x'] = geometry.upper_bounds[:,0]
-        upper_bounds_float3['y'] = geometry.upper_bounds[:,1]
-        upper_bounds_float3['z'] = geometry.upper_bounds[:,2]
-        self.upper_bounds_gpu = gpuarray.to_gpu(upper_bounds_float3)
+        self.upper_bounds_gpu = gpuarray.to_gpu(to_float3(geometry.upper_bounds))
 
         self.colors_gpu = gpuarray.to_gpu(geometry.colors.astype(np.uint32))
         self.node_map_gpu = gpuarray.to_gpu(geometry.node_map.astype(np.uint32))
@@ -199,14 +197,11 @@ class GPU(object):
 
         self.geometry = geometry
 
-        print 'average of %f child nodes per node' % (np.mean(geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]))
-
-        print '%i nodes with one child' % (np.count_nonzero((geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]) == 1))
-
-        print '%i leaf nodes with one child' % (np.count_nonzero((geometry.node_map_end[:geometry.first_node] - geometry.node_map[:geometry.first_node]) == 1))
-
-
-        self.print_device_usage()
+        if print_usage:
+            print 'average of %f child nodes per node' % (np.mean(geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]))
+            print '%i nodes with one child' % (np.count_nonzero((geometry.node_map_end[geometry.first_node:] - geometry.node_map[geometry.first_node:]) == 1))
+            print '%i leaf nodes with one child' % (np.count_nonzero((geometry.node_map_end[:geometry.first_node] - geometry.node_map[:geometry.first_node]) == 1))
+            self.print_device_usage()
 
     def reset_colors(self):
         self.colors_gpu.set_async(self.geometry.colors.astype(np.uint32))
@@ -215,22 +210,22 @@ class GPU(object):
         solid_hit_gpu = gpuarray.to_gpu(np.array(solid_hit, dtype=np.bool))
         solid_colors_gpu = gpuarray.to_gpu(np.array(colors, dtype=np.uint32))
 
-        for first_triangle, triangles_this_round, blocks in chunk_iterator(self.triangles_gpu.size, self.nthread_per_block, self.max_blocks):
+        for first_triangle, triangles_this_round, blocks in chunk_iterator(self.triangles_gpu.size, self.nthreads_per_block, self.max_blocks):
             self.geo_funcs.color_solids(np.int32(first_triangle),
                                         np.int32(triangles_this_round),
                                         self.solid_id_map_gpu, 
                                         solid_hit_gpu, 
                                         solid_colors_gpu, 
-                                        block=(self.nthread_per_block,1,1), 
+                                        block=(self.nthreads_per_block,1,1), 
                                         grid=(blocks,1))
         self.context.synchronize()
 
     def setup_propagate(self, seed=1):
-        self.rng_states_gpu = cuda.mem_alloc(self.nthread_per_block * self.max_blocks
+        self.rng_states_gpu = cuda.mem_alloc(self.nthreads_per_block * self.max_blocks
                                              * sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
-        self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthread_per_block), self.rng_states_gpu, 
+        self.prop_funcs.init_rng(np.int32(self.max_blocks * self.nthreads_per_block), self.rng_states_gpu, 
                                  np.uint64(seed), np.uint64(0), 
-                                 block=(self.nthread_per_block,1,1), 
+                                 block=(self.nthreads_per_block,1,1), 
                                  grid=(self.max_blocks,1))
         #self.context.synchronize()
 
@@ -244,13 +239,16 @@ class GPU(object):
         assert len(photons.directions) == self.nphotons
         assert len(photons.polarizations) == self.nphotons
         assert len(photons.wavelengths) == self.nphotons
-        assert len(photons.times) == self.nphotons
 
-        self.positions_gpu = gpuarray.to_gpu(photons.positions.astype(np.float32).view(gpuarray.vec.float3))
-        self.directions_gpu = gpuarray.to_gpu(photons.directions.astype(np.float32).view(gpuarray.vec.float3))
-        self.polarizations_gpu = gpuarray.to_gpu(photons.polarizations.astype(np.float32).view(gpuarray.vec.float3))
+        self.positions_gpu = gpuarray.to_gpu(to_float3(photons.positions))#.astype(np.float32).view(gpuarray.vec.float3))
+        self.directions_gpu = gpuarray.to_gpu(to_float3(photons.directions))#.astype(np.float32).view(gpuarray.vec.float3))
+        self.polarizations_gpu = gpuarray.to_gpu(to_float3(photons.polarizations))#.astype(np.float32).view(gpuarray.vec.float3))
         self.wavelengths_gpu = gpuarray.to_gpu(photons.wavelengths.astype(np.float32))
-        self.times_gpu = gpuarray.to_gpu(photons.times.astype(np.float32))
+
+        if photons.times is not None:
+            self.times_gpu = gpuarray.to_gpu(photons.times.astype(np.float32))
+        else:
+            self.times_gpu = gpuarray.zeros(self.nphotons, dtype=np.float32)
 
         if photons.histories is not None:
             self.histories_gpu = gpuarray.to_gpu(photons.histories.astype(np.uint32))
@@ -282,12 +280,12 @@ class GPU(object):
 
         while step < max_steps:
             ## Just finish the rest of the steps if the # of photons is low
-            if nphotons < self.nthread_per_block * 16 * 8:
+            if nphotons < self.nthreads_per_block * 16 * 8:
                 nsteps = max_steps - step
             else:
                 nsteps = 1
 
-            for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthread_per_block, self.max_blocks):
+            for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, self.nthreads_per_block, self.max_blocks):
                 self.prop_funcs.propagate(np.int32(first_photon),
                                           np.int32(photons_this_round),
                                           input_queue_gpu[1:],
@@ -298,7 +296,7 @@ class GPU(object):
                                           self.times_gpu, self.histories_gpu, 
                                           self.last_hit_triangles_gpu, 
                                           np.int32(nsteps), 
-                                          block=(self.nthread_per_block,1,1),
+                                          block=(self.nthreads_per_block,1,1),
                                           grid=(blocks, 1))
         
             step += nsteps
@@ -341,13 +339,13 @@ class GPU(object):
         self.daq_funcs.reset_earliest_time_int(np.float32(1e9), 
                                                np.int32(len(self.earliest_time_int_gpu)), 
                                                self.earliest_time_int_gpu, 
-                                               block=(self.nthread_per_block,1,1), 
-                                               grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1))
+                                               block=(self.nthreads_per_block,1,1), 
+                                               grid=(len(self.earliest_time_int_gpu)//self.nthreads_per_block+1,1))
         self.channel_q_gpu.fill(0)
         self.channel_history_gpu.fill(0)
 
         #self.context.synchronize()
-        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthread_per_block, self.max_blocks):
+        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, self.nthreads_per_block, self.max_blocks):
             self.daq_funcs.run_daq(self.rng_states_gpu, np.uint32(0x1 << 2), 
                                    np.float32(self.daq_pmt_rms), 
                                    np.int32(first_photon),
@@ -359,13 +357,13 @@ class GPU(object):
                                    self.earliest_time_int_gpu,
                                    self.channel_q_gpu,
                                    self.channel_history_gpu,
-                                   block=(self.nthread_per_block,1,1), grid=(blocks,1))
+                                   block=(self.nthreads_per_block,1,1), grid=(blocks,1))
         #self.context.synchronize()
         self.daq_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), 
                                                      self.earliest_time_int_gpu, 
                                                      self.earliest_time_gpu, 
-                                                     block=(self.nthread_per_block,1,1), 
-                                                     grid=(len(self.earliest_time_int_gpu)//self.nthread_per_block+1,1))
+                                                     block=(self.nthreads_per_block,1,1), 
+                                                     grid=(len(self.earliest_time_int_gpu)//self.nthreads_per_block+1,1))
         if 'profile' in __builtins__:
             self.context.synchronize()