move photon initialization to the gpu. it's unclear if this is a speed improvement.

author: Anthony LaTorre <tlatorre9@gmail.com> 2011-06-23 19:26:24 -0400
committer: Anthony LaTorre <tlatorre9@gmail.com> 2011-06-23 19:26:24 -0400
commit: ac9f568017e9a7cf8da04acdacc342b4db6576fa (patch)
tree: a0978984be2d045781fa3603376b7d52e065b5a6
parent: a7b94c352cc70fbc79d78e3ca6cec334aec2e258 (diff)
download: chroma-ac9f568017e9a7cf8da04acdacc342b4db6576fa.tar.gz
chroma-ac9f568017e9a7cf8da04acdacc342b4db6576fa.tar.bz2
chroma-ac9f568017e9a7cf8da04acdacc342b4db6576fa.zip
5 files changed, 169 insertions, 180 deletions
diff --git a/gputhread.py b/gputhread.py
index 8258f63..ba0bb4b 100644
--- a/gputhread.py
+++ b/gputhread.py
@@ -1,60 +1,15 @@
 import numpy as np
 import time
 import pycuda.driver as cuda
+from pycuda.characterize import sizeof
 from pycuda.compiler import SourceModule
 from pycuda import gpuarray
 import threading
 import Queue
 import src
 
-class Job(object):
-    def __init__(self, positions, directions, wavelengths, polarizations,
-                 times, states, last_hit_triangles, max_steps=100):
-        if positions.dtype is not gpuarray.vec.float3:
-            if len(positions.shape) != 2 or positions.shape[-1] != 3:
-                raise Exception('shape mismatch')
-
-            self.positions = np.empty(positions.shape[0], gpuarray.vec.float3)
-            self.positions['x'] = positions[:,0]
-            self.positions['y'] = positions[:,1]
-            self.positions['z'] = positions[:,2]
-        else:
-            self.positions = positions
-
-        if directions.dtype is not gpuarray.vec.float3:
-            if len(directions.shape) != 2 or directions.shape[-1] != 3:
-                raise Exception('shape mismatch')
-
-            self.directions = \
-                np.empty(directions.shape[0], gpuarray.vec.float3)
-            self.directions['x'] = directions[:,0]
-            self.directions['y'] = directions[:,1]
-            self.directions['z'] = directions[:,2]
-        else:
-            self.directions = directions
-
-        if polarizations.dtype is not gpuarray.vec.float3:
-            if len(polarizations.shape) != 2 or polarizations.shape[-1] != 3:
-                raise Exception('shape mismatch')
-
-            self.polarizations = \
-                np.empty(polarizations.shape[0], gpuarray.vec.float3)
-            self.polarizations['x'] = polarizations[:,0]
-            self.polarizations['y'] = polarizations[:,1]
-            self.polarizations['z'] = polarizations[:,2]
-        else:
-            self.polarizations = polarizations
-
-        self.wavelengths = np.asarray(wavelengths, dtype=np.float32)
-        self.times = np.asarray(times, dtype=np.float32)
-        self.states = np.asarray(states, dtype=np.int32)
-        self.last_hit_triangles = \
-            np.asarray(last_hit_triangles, dtype=np.int32)
-
-        self.max_steps = max_steps
-
 class GPUThread(threading.Thread):
-    def __init__(self, device_id, geometry, jobs, output, nblocks=64):
+    def __init__(self, device_id, geometry, jobs, output, nblocks=64, max_nthreads=100000):
         threading.Thread.__init__(self)
 
         self.device_id = device_id
@@ -62,6 +17,7 @@ class GPUThread(threading.Thread):
         self.jobs = jobs
         self.output = output
         self.nblocks = nblocks
+        self.max_nthreads = max_nthreads
         self._stop = threading.Event()
 
     def stop(self):
@@ -74,14 +30,20 @@ class GPUThread(threading.Thread):
         device = cuda.Device(self.device_id)
         context = device.make_context()
         module = SourceModule(src.kernel, options=['-I' + src.dir], no_extern_c=True, cache_dir=False)
+
         propagate = module.get_function('propagate')
+        fill_float = module.get_function('fill_float')
+        fill_float3 = module.get_function('fill_float3')
+        fill_uniform = module.get_function('fill_uniform')
+        fill_uniform_sphere = module.get_function('fill_uniform_sphere')
+
         init_rng = module.get_function('init_rng')
+        rng_states_gpu = cuda.mem_alloc(self.max_nthreads*sizeof('curandStateXORWOW', '#include <curand_kernel.h>'))
+        init_rng(np.int32(self.max_nthreads), rng_states_gpu, np.int32(self.device_id), np.int32(0), block=(self.nblocks,1,1), grid=(self.max_nthreads//self.nblocks+1,1))
+
         texrefs = self.geometry.load(module)
-        
 
-        daq_module = SourceModule(src.daq, options=['-I' + src.dir],
-                                  no_extern_c=True, cache_dir=False)
-        init_daq_rng = daq_module.get_function('init_daq_rng')
+        daq_module = SourceModule(src.daq, options=['-I' + src.dir], no_extern_c=True, cache_dir=False)
         reset_earliest_time_int = daq_module.get_function('reset_earliest_time_int')
         run_daq = daq_module.get_function('run_daq')
         convert_sortable_int_to_float = daq_module.get_function('convert_sortable_int_to_float')
@@ -91,54 +53,46 @@ class GPUThread(threading.Thread):
 
         solid_map_gpu = gpuarray.to_gpu(self.geometry.solid_id.astype(np.int32))
         
-        init_rng(np.int32(100000), np.int32(self.device_id), np.int32(0), block=(self.nblocks,1,1), grid=(100000//self.nblocks+1,1))
-        init_daq_rng(np.int32(100000), np.int32(10000+self.device_id), np.int32(0), block=(self.nblocks,1,1), grid=(100000//self.nblocks+1,1))
-
         while not self.stopped():
             try:
-                job = self.jobs.get(block=False, timeout=0.01)
+                position, nphotons = self.jobs.get(block=False, timeout=0.1)
             except Queue.Empty:
                 continue
 
-            positions_gpu = cuda.to_device(job.positions)
-            directions_gpu = cuda.to_device(job.directions)
-            polarizations_gpu = cuda.to_device(job.polarizations)
-            wavelengths_gpu = cuda.to_device(job.wavelengths)
-            times_gpu = cuda.to_device(job.times)
-            states_gpu = cuda.to_device(job.states)
-            last_hit_triangles_gpu = cuda.to_device(job.last_hit_triangles)
-            
-            nphotons = len(job.positions)
-
             t0 = time.time()
-            propagate(np.int32(nphotons), positions_gpu, directions_gpu, wavelengths_gpu, polarizations_gpu, times_gpu, states_gpu, last_hit_triangles_gpu, np.int32(self.geometry.node_map.size-1), np.int32(self.geometry.first_node), np.int32(job.max_steps), block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1))
-
-            reset_earliest_time_int(np.float32(1e9), np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu,
-                                    block=(self.nblocks,1,1), 
-                                    grid=(len(earliest_time_int_gpu)//self.nblocks+1,1))
-            run_daq(np.int32(2), np.float32(1.2e-9), np.int32(nphotons), times_gpu, states_gpu,
-                    last_hit_triangles_gpu, solid_map_gpu,
-                    np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu,
-                    block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1))
-            convert_sortable_int_to_float(np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu,
-                                          earliest_time_gpu,
-                                          block=(self.nblocks,1,1), 
-                                          grid=(len(earliest_time_int_gpu)//self.nblocks+1,1))
 
+            gpu_kwargs = {'block': (self.nblocks,1,1), 'grid': (nphotons//self.nblocks+1,1)}
+
+            positions_gpu = cuda.mem_alloc(np.dtype(gpuarray.vec.float3).itemsize*nphotons)
+            fill_float3(np.int32(nphotons), positions_gpu, position, **gpu_kwargs)
+            directions_gpu = cuda.mem_alloc(np.dtype(gpuarray.vec.float3).itemsize*nphotons)
+            fill_uniform_sphere(np.int32(nphotons), rng_states_gpu, directions_gpu, **gpu_kwargs)
+            wavelengths_gpu = cuda.mem_alloc(np.dtype(np.float32).itemsize*nphotons)
+            fill_uniform(np.int32(nphotons), rng_states_gpu, wavelengths_gpu, np.float32(200), np.float32(800), **gpu_kwargs)
+            polarizations_gpu = cuda.mem_alloc(np.dtype(gpuarray.vec.float3).itemsize*nphotons)
+            fill_uniform_sphere(np.int32(nphotons), rng_states_gpu, polarizations_gpu, **gpu_kwargs)
+            times_gpu = cuda.mem_alloc(np.dtype(np.float32).itemsize*nphotons)
+            fill_float(np.int32(nphotons), times_gpu, np.float32(0), **gpu_kwargs)
+            states_gpu = cuda.mem_alloc(np.dtype(np.int32).itemsize*nphotons)
+            fill_float(np.int32(nphotons), states_gpu, np.int32(-1), **gpu_kwargs)
+            last_hit_triangles_gpu = cuda.mem_alloc(np.dtype(np.int32).itemsize*nphotons)
+            fill_float(np.int32(nphotons), last_hit_triangles_gpu, np.int32(-1), **gpu_kwargs)
+
+            max_steps = 10
+
+            propagate(np.int32(nphotons), rng_states_gpu, positions_gpu, directions_gpu, wavelengths_gpu, polarizations_gpu, times_gpu, states_gpu, last_hit_triangles_gpu, np.int32(self.geometry.node_map.size-1), np.int32(self.geometry.first_node), np.int32(max_steps), block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1))
+
+            reset_earliest_time_int(np.float32(1e9), np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, block=(self.nblocks,1,1), grid=(len(earliest_time_int_gpu)//self.nblocks+1,1))
+
+            run_daq(rng_states_gpu, np.int32(2), np.float32(1.2e-9), np.int32(nphotons), times_gpu, states_gpu, last_hit_triangles_gpu, solid_map_gpu, np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1))
+
+            convert_sortable_int_to_float(np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, earliest_time_gpu, block=(self.nblocks,1,1), grid=(len(earliest_time_int_gpu)//self.nblocks+1,1))
             cuda.Context.synchronize()
             elapsed = time.time() - t0
             
             #print 'device %i; elapsed %f sec' % (self.device_id, elapsed)
-            cuda.memcpy_dtoh(job.positions, positions_gpu)
-            cuda.memcpy_dtoh(job.directions, directions_gpu)
-            cuda.memcpy_dtoh(job.wavelengths, wavelengths_gpu)
-            cuda.memcpy_dtoh(job.polarizations, polarizations_gpu)
-            cuda.memcpy_dtoh(job.times, times_gpu)
-            cuda.memcpy_dtoh(job.states, states_gpu)
-            cuda.memcpy_dtoh(job.last_hit_triangles, last_hit_triangles_gpu)
-            job.earliest_times = earliest_time_gpu.get()
-
-            self.output.put(job)
+
+            self.output.put(earliest_time_gpu.get())
             self.jobs.task_done()
 
         context.pop()
diff --git a/src/daq.cu b/src/daq.cu
index 99490bd..a7f5120 100644
--- a/src/daq.cu
+++ b/src/daq.cu
@@ -16,65 +16,70 @@ __device__ float sortable_int_to_float(unsigned int i)
   //return __int_as_float(i ^ mask);
 }
 
+extern "C" {
 
-__device__ curandState daq_rng_states[100000];
+__global__ void reset_earliest_time_int(float maxtime, int ntime_ints, unsigned int *time_ints)
+{
+	int id = threadIdx.x + blockDim.x * blockIdx.x;
+	if (id < ntime_ints) {
+		unsigned int maxtime_int = float_to_sortable_int(maxtime);
+		time_ints[id] = maxtime_int;
+	}
+}
+
+__global__ void run_daq(curandState *s, int detection_state, float time_rms,
+			int nphotons, float *photon_times, int *photon_states,
+			int *last_hit_triangles, int *solid_map,
+			int nsolids, unsigned int *earliest_time_int)
+{
+
+	int id = threadIdx.x + blockDim.x * blockIdx.x;
+
+	if (id < nphotons)
+	{
+		curandState rng = s[id];
+
+
+
+		int triangle_id = last_hit_triangles[id];
+		
+		if (triangle_id > -1)
+		{
+			int solid_id = solid_map[triangle_id];
+			int state = photon_states[id];
+			float time = photon_times[id] + curand_normal(&rng) * time_rms;
+			unsigned int time_int = float_to_sortable_int(time);
+
+
+
+			if (solid_id < nsolids && state == detection_state) {
+			    atomicMin(earliest_time_int + solid_id, time_int);
+			}
 
-extern "C" {
 
-  __global__ void init_daq_rng(int nthreads, 
-			  unsigned long long seed, unsigned long long offset)
-  {
-    int id = blockIdx.x*blockDim.x + threadIdx.x;
-    
-    if (id >= nthreads)
-      return;
-    
-    curand_init(seed, id, offset, daq_rng_states+id);
-  }
-
-  __global__ void reset_earliest_time_int(float maxtime, 
-				     int ntime_ints, unsigned int *time_ints)
-  {
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-    if (id < ntime_ints) {
-      unsigned int maxtime_int = float_to_sortable_int(maxtime);
-      time_ints[id] = maxtime_int;
-    }
-  }
-
-  __global__ void run_daq(int detection_state, float time_rms,
-		     int nphotons, float *photon_times, int *photon_states,
-		     int *last_hit_triangles, int *solid_map,
-		     int nsolids, unsigned int *earliest_time_int)
-  {
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-    
-    curandState_t rng = daq_rng_states[id];
-
-    if (id < nphotons) {
-      int triangle_id = last_hit_triangles[id];
-      
-      if (triangle_id > -1) {
-	int solid_id = solid_map[triangle_id];
-	int state = photon_states[id];
-	float time = photon_times[id] + curand_normal(&rng) * time_rms;
-	unsigned int time_int = float_to_sortable_int(time);
-	if (solid_id < nsolids && state == detection_state)
-	  atomicMin(earliest_time_int + solid_id, time_int);
-      }
-    }
-
-    daq_rng_states[id] = rng;
-  }
-
-  __global__ void convert_sortable_int_to_float(int n, 
-						unsigned int *sortable_ints, 
-						float *float_output)
-  {
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-    
-    if (id < n)
-      float_output[id] = sortable_int_to_float(sortable_ints[id]);
-  }
+		}
+
+
+
+		s[id] = rng;
+
+
+
+	}
+
+
+
+
+}
+
+__global__ void convert_sortable_int_to_float(int n, 
+					      unsigned int *sortable_ints, 
+					      float *float_output)
+{
+	int id = threadIdx.x + blockDim.x * blockIdx.x;
+	
+	if (id < n)
+		float_output[id] = sortable_int_to_float(sortable_ints[id]);
+}
 
 } // extern "C"
diff --git a/src/kernel.cu b/src/kernel.cu
index 584c9e4..55fde70 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -141,26 +141,33 @@ __device__ int intersect_mesh(const float3 &origin, const float3& direction, con
 	return triangle_index;
 }
 
-__device__ curandState rng_states[100000];
-
 extern "C"
 {
 
-  __global__ void set_pointer(uint4 *triangle_ptr, float3 *vertex_ptr)
+__global__ void fill_float(int nthreads, float *a, float value)
 {
-	triangles = triangle_ptr;
-	vertices = vertex_ptr;
+	int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (id >= nthreads)
+		return;
+
+	a[id] = value;
 }
 
-/* Initialize random number states */
-__global__ void init_rng(int nthreads, unsigned long long seed, unsigned long long offset)
+__global__ void fill_float3(int nthreads, float3 *a, float3 value)
 {
 	int id = blockIdx.x*blockDim.x + threadIdx.x;
 
 	if (id >= nthreads)
 		return;
 
-	curand_init(seed, id, offset, rng_states+id);
+	a[id] = value;
+}
+
+__global__ void set_pointer(uint4 *triangle_ptr, float3 *vertex_ptr)
+{
+	triangles = triangle_ptr;
+	vertices = vertex_ptr;
 }
 
 /* Translate `points` by the vector `v` */
@@ -223,7 +230,7 @@ __global__ void ray_trace(int nthreads, float3 *positions, float3 *directions, i
 
 } // ray_trace
 
-__global__ void propagate(int nthreads, float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, int *states, int *last_hit_triangles, int start_node, int first_node, int max_steps)
+__global__ void propagate(int nthreads, curandState *rng_states, float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, int *states, int *last_hit_triangles, int start_node, int first_node, int max_steps)
 {
 	int id = blockIdx.x*blockDim.x + threadIdx.x;
 
@@ -445,8 +452,8 @@ __global__ void propagate(int nthreads, float3 *positions, float3 *directions, f
 
 	} // while(nsteps < max_steps)
 
-	states[id] = state;
 	rng_states[id] = rng;
+	states[id] = state;
 	positions[id] = position;
 	directions[id] = direction;
 	polarizations[id] = polarization;
diff --git a/src/random.h b/src/random.h
index 74329da..b084c19 100644
--- a/src/random.h
+++ b/src/random.h
@@ -5,18 +5,54 @@
 
 #include "physical_constants.h"
 
-__device__ float uniform(curandState *rng, float a=0.0, float b=1.0)
+__device__ float uniform(curandState *s, const float &low, const float &high)
 {
-	return a + curand_uniform(rng)*(b-a);
+	return low + curand_uniform(s)*(high-low);
 }
 
-__device__ float3 uniform_sphere(curandState *rng)
+__device__ float3 uniform_sphere(curandState *s)
 {
-	float theta = uniform(rng, 0, 2*PI);
-	float u = uniform(rng, -1, 1);
-	float c = sqrtf(1-u*u);
+	float theta = uniform(s, 0.0f, 2*PI);
+	float u = uniform(s, -1.0f, 1.0f);
+	float c = sqrtf(1.0f-u*u);
 
 	return make_float3(c*cosf(theta), c*sinf(theta), u);
 }
 
+extern "C"
+{
+
+__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
+{
+	int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (id >= nthreads)
+		return;
+
+	curand_init(seed, id, offset, &s[id]);
+}
+
+__global__ void fill_uniform(int nthreads, curandState *s, float *a, float low, float high)
+{
+	int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (id >= nthreads)
+		return;
+
+	a[id] = uniform(&s[id], low, high);
+
+}
+
+__global__ void fill_uniform_sphere(int nthreads, curandState *s, float3 *a)
+{
+	int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (id >= nthreads)
+		return;
+
+	a[id] = uniform_sphere(&s[id]);
+}
+
+} // extern "c"
+
 #endif
diff --git a/threadtest.py b/threadtest.py
index c9854a7..6cc9c07 100644
--- a/threadtest.py
+++ b/threadtest.py
@@ -11,44 +11,31 @@ import math
 jobs = Queue()
 output = Queue()
 
-def create_job(position=(0,0,0), nphotons=5000, max_steps=10):
-    positions = np.tile(position, nphotons).reshape((nphotons, 3))
-    directions = uniform_sphere(nphotons)
-    polarizations = uniform_sphere(nphotons)
-    wavelengths = np.random.uniform(200, 800, size=nphotons)
-    times = np.zeros(nphotons)
-    states = -np.ones(nphotons)
-    last_hit_triangles = -np.ones(nphotons)
-
-    return Job(positions, directions, wavelengths, polarizations, times,
-               states, last_hit_triangles, max_steps)
-
 def generate_event(detector, position=(0,0,0), nphotons=5000):
-    jobs.put(create_job(position, nphotons))
+    jobs.put((gpuarray.vec.make_float3(*position), nphotons))
     jobs.join()
 
-    job = output.get()
+    earliest_times = output.get()
 
-    pmt_times = job.earliest_times[detector.pmtids]
+    pmt_times = earliest_times[detector.pmtids]
     event_times = [ (i, t) for i, t in zip(detector.pmtids, pmt_times) if t < 1e8 ]
     print '%i hit pmts' % len(event_times)
     return event_times
 
 def likelihood(detector, event_times, position=(0,0,0), nphotons=5000, neval=100):
     for i in range(neval):
-        jobs.put(create_job(position, nphotons))
+        jobs.put((gpuarray.vec.make_float3(*position), nphotons))
     jobs.join()
 
     t = np.zeros(shape=(neval, max(detector.pmtids)+1), dtype=np.float32)
     for i in range(neval):
-        job = output.get()
-        t[i] = job.earliest_times
+        t[i] = output.get()
 
     log_likelihood = 0.0
     log_likelihood_variance = 0.0
     for i, time in event_times:
         h = Histogram(500, (-0.5e-9, 99.5e-9))
-        h.fill(t[:,i])
+        h.fill(t[:,i][t[:,i] < 1e8])
 
         if h.nentries > 0:
             h.normalize()
author	Anthony LaTorre <tlatorre9@gmail.com>	2011-06-23 19:26:24 -0400
committer	Anthony LaTorre <tlatorre9@gmail.com>	2011-06-23 19:26:24 -0400
commit	ac9f568017e9a7cf8da04acdacc342b4db6576fa (patch)
tree	a0978984be2d045781fa3603376b7d52e065b5a6
parent	a7b94c352cc70fbc79d78e3ca6cec334aec2e258 (diff)
download	chroma-ac9f568017e9a7cf8da04acdacc342b4db6576fa.tar.gz chroma-ac9f568017e9a7cf8da04acdacc342b4db6576fa.tar.bz2 chroma-ac9f568017e9a7cf8da04acdacc342b4db6576fa.zip