diff options
author | Anthony LaTorre <tlatorre9@gmail.com> | 2011-06-23 19:26:24 -0400 |
---|---|---|
committer | Anthony LaTorre <tlatorre9@gmail.com> | 2011-06-23 19:26:24 -0400 |
commit | ac9f568017e9a7cf8da04acdacc342b4db6576fa (patch) | |
tree | a0978984be2d045781fa3603376b7d52e065b5a6 | |
parent | a7b94c352cc70fbc79d78e3ca6cec334aec2e258 (diff) | |
download | chroma-ac9f568017e9a7cf8da04acdacc342b4db6576fa.tar.gz chroma-ac9f568017e9a7cf8da04acdacc342b4db6576fa.tar.bz2 chroma-ac9f568017e9a7cf8da04acdacc342b4db6576fa.zip |
move photon initialization to the gpu. it's unclear if this is a speed improvement.
-rw-r--r-- | gputhread.py | 130 | ||||
-rw-r--r-- | src/daq.cu | 119 | ||||
-rw-r--r-- | src/kernel.cu | 27 | ||||
-rw-r--r-- | src/random.h | 48 | ||||
-rw-r--r-- | threadtest.py | 25 |
5 files changed, 169 insertions, 180 deletions
diff --git a/gputhread.py b/gputhread.py index 8258f63..ba0bb4b 100644 --- a/gputhread.py +++ b/gputhread.py @@ -1,60 +1,15 @@ import numpy as np import time import pycuda.driver as cuda +from pycuda.characterize import sizeof from pycuda.compiler import SourceModule from pycuda import gpuarray import threading import Queue import src -class Job(object): - def __init__(self, positions, directions, wavelengths, polarizations, - times, states, last_hit_triangles, max_steps=100): - if positions.dtype is not gpuarray.vec.float3: - if len(positions.shape) != 2 or positions.shape[-1] != 3: - raise Exception('shape mismatch') - - self.positions = np.empty(positions.shape[0], gpuarray.vec.float3) - self.positions['x'] = positions[:,0] - self.positions['y'] = positions[:,1] - self.positions['z'] = positions[:,2] - else: - self.positions = positions - - if directions.dtype is not gpuarray.vec.float3: - if len(directions.shape) != 2 or directions.shape[-1] != 3: - raise Exception('shape mismatch') - - self.directions = \ - np.empty(directions.shape[0], gpuarray.vec.float3) - self.directions['x'] = directions[:,0] - self.directions['y'] = directions[:,1] - self.directions['z'] = directions[:,2] - else: - self.directions = directions - - if polarizations.dtype is not gpuarray.vec.float3: - if len(polarizations.shape) != 2 or polarizations.shape[-1] != 3: - raise Exception('shape mismatch') - - self.polarizations = \ - np.empty(polarizations.shape[0], gpuarray.vec.float3) - self.polarizations['x'] = polarizations[:,0] - self.polarizations['y'] = polarizations[:,1] - self.polarizations['z'] = polarizations[:,2] - else: - self.polarizations = polarizations - - self.wavelengths = np.asarray(wavelengths, dtype=np.float32) - self.times = np.asarray(times, dtype=np.float32) - self.states = np.asarray(states, dtype=np.int32) - self.last_hit_triangles = \ - np.asarray(last_hit_triangles, dtype=np.int32) - - self.max_steps = max_steps - class GPUThread(threading.Thread): - def __init__(self, device_id, geometry, jobs, output, nblocks=64): + def __init__(self, device_id, geometry, jobs, output, nblocks=64, max_nthreads=100000): threading.Thread.__init__(self) self.device_id = device_id @@ -62,6 +17,7 @@ class GPUThread(threading.Thread): self.jobs = jobs self.output = output self.nblocks = nblocks + self.max_nthreads = max_nthreads self._stop = threading.Event() def stop(self): @@ -74,14 +30,20 @@ class GPUThread(threading.Thread): device = cuda.Device(self.device_id) context = device.make_context() module = SourceModule(src.kernel, options=['-I' + src.dir], no_extern_c=True, cache_dir=False) + propagate = module.get_function('propagate') + fill_float = module.get_function('fill_float') + fill_float3 = module.get_function('fill_float3') + fill_uniform = module.get_function('fill_uniform') + fill_uniform_sphere = module.get_function('fill_uniform_sphere') + init_rng = module.get_function('init_rng') + rng_states_gpu = cuda.mem_alloc(self.max_nthreads*sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) + init_rng(np.int32(self.max_nthreads), rng_states_gpu, np.int32(self.device_id), np.int32(0), block=(self.nblocks,1,1), grid=(self.max_nthreads//self.nblocks+1,1)) + texrefs = self.geometry.load(module) - - daq_module = SourceModule(src.daq, options=['-I' + src.dir], - no_extern_c=True, cache_dir=False) - init_daq_rng = daq_module.get_function('init_daq_rng') + daq_module = SourceModule(src.daq, options=['-I' + src.dir], no_extern_c=True, cache_dir=False) reset_earliest_time_int = daq_module.get_function('reset_earliest_time_int') run_daq = daq_module.get_function('run_daq') convert_sortable_int_to_float = daq_module.get_function('convert_sortable_int_to_float') @@ -91,54 +53,46 @@ class GPUThread(threading.Thread): solid_map_gpu = gpuarray.to_gpu(self.geometry.solid_id.astype(np.int32)) - init_rng(np.int32(100000), np.int32(self.device_id), np.int32(0), block=(self.nblocks,1,1), grid=(100000//self.nblocks+1,1)) - init_daq_rng(np.int32(100000), np.int32(10000+self.device_id), np.int32(0), block=(self.nblocks,1,1), grid=(100000//self.nblocks+1,1)) - while not self.stopped(): try: - job = self.jobs.get(block=False, timeout=0.01) + position, nphotons = self.jobs.get(block=False, timeout=0.1) except Queue.Empty: continue - positions_gpu = cuda.to_device(job.positions) - directions_gpu = cuda.to_device(job.directions) - polarizations_gpu = cuda.to_device(job.polarizations) - wavelengths_gpu = cuda.to_device(job.wavelengths) - times_gpu = cuda.to_device(job.times) - states_gpu = cuda.to_device(job.states) - last_hit_triangles_gpu = cuda.to_device(job.last_hit_triangles) - - nphotons = len(job.positions) - t0 = time.time() - propagate(np.int32(nphotons), positions_gpu, directions_gpu, wavelengths_gpu, polarizations_gpu, times_gpu, states_gpu, last_hit_triangles_gpu, np.int32(self.geometry.node_map.size-1), np.int32(self.geometry.first_node), np.int32(job.max_steps), block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1)) - - reset_earliest_time_int(np.float32(1e9), np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, - block=(self.nblocks,1,1), - grid=(len(earliest_time_int_gpu)//self.nblocks+1,1)) - run_daq(np.int32(2), np.float32(1.2e-9), np.int32(nphotons), times_gpu, states_gpu, - last_hit_triangles_gpu, solid_map_gpu, - np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, - block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1)) - convert_sortable_int_to_float(np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, - earliest_time_gpu, - block=(self.nblocks,1,1), - grid=(len(earliest_time_int_gpu)//self.nblocks+1,1)) + gpu_kwargs = {'block': (self.nblocks,1,1), 'grid': (nphotons//self.nblocks+1,1)} + + positions_gpu = cuda.mem_alloc(np.dtype(gpuarray.vec.float3).itemsize*nphotons) + fill_float3(np.int32(nphotons), positions_gpu, position, **gpu_kwargs) + directions_gpu = cuda.mem_alloc(np.dtype(gpuarray.vec.float3).itemsize*nphotons) + fill_uniform_sphere(np.int32(nphotons), rng_states_gpu, directions_gpu, **gpu_kwargs) + wavelengths_gpu = cuda.mem_alloc(np.dtype(np.float32).itemsize*nphotons) + fill_uniform(np.int32(nphotons), rng_states_gpu, wavelengths_gpu, np.float32(200), np.float32(800), **gpu_kwargs) + polarizations_gpu = cuda.mem_alloc(np.dtype(gpuarray.vec.float3).itemsize*nphotons) + fill_uniform_sphere(np.int32(nphotons), rng_states_gpu, polarizations_gpu, **gpu_kwargs) + times_gpu = cuda.mem_alloc(np.dtype(np.float32).itemsize*nphotons) + fill_float(np.int32(nphotons), times_gpu, np.float32(0), **gpu_kwargs) + states_gpu = cuda.mem_alloc(np.dtype(np.int32).itemsize*nphotons) + fill_float(np.int32(nphotons), states_gpu, np.int32(-1), **gpu_kwargs) + last_hit_triangles_gpu = cuda.mem_alloc(np.dtype(np.int32).itemsize*nphotons) + fill_float(np.int32(nphotons), last_hit_triangles_gpu, np.int32(-1), **gpu_kwargs) + + max_steps = 10 + + propagate(np.int32(nphotons), rng_states_gpu, positions_gpu, directions_gpu, wavelengths_gpu, polarizations_gpu, times_gpu, states_gpu, last_hit_triangles_gpu, np.int32(self.geometry.node_map.size-1), np.int32(self.geometry.first_node), np.int32(max_steps), block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1)) + + reset_earliest_time_int(np.float32(1e9), np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, block=(self.nblocks,1,1), grid=(len(earliest_time_int_gpu)//self.nblocks+1,1)) + + run_daq(rng_states_gpu, np.int32(2), np.float32(1.2e-9), np.int32(nphotons), times_gpu, states_gpu, last_hit_triangles_gpu, solid_map_gpu, np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, block=(self.nblocks,1,1), grid=(nphotons//self.nblocks+1,1)) + + convert_sortable_int_to_float(np.int32(len(earliest_time_int_gpu)), earliest_time_int_gpu, earliest_time_gpu, block=(self.nblocks,1,1), grid=(len(earliest_time_int_gpu)//self.nblocks+1,1)) cuda.Context.synchronize() elapsed = time.time() - t0 #print 'device %i; elapsed %f sec' % (self.device_id, elapsed) - cuda.memcpy_dtoh(job.positions, positions_gpu) - cuda.memcpy_dtoh(job.directions, directions_gpu) - cuda.memcpy_dtoh(job.wavelengths, wavelengths_gpu) - cuda.memcpy_dtoh(job.polarizations, polarizations_gpu) - cuda.memcpy_dtoh(job.times, times_gpu) - cuda.memcpy_dtoh(job.states, states_gpu) - cuda.memcpy_dtoh(job.last_hit_triangles, last_hit_triangles_gpu) - job.earliest_times = earliest_time_gpu.get() - - self.output.put(job) + + self.output.put(earliest_time_gpu.get()) self.jobs.task_done() context.pop() @@ -16,65 +16,70 @@ __device__ float sortable_int_to_float(unsigned int i) //return __int_as_float(i ^ mask); } +extern "C" { -__device__ curandState daq_rng_states[100000]; +__global__ void reset_earliest_time_int(float maxtime, int ntime_ints, unsigned int *time_ints) +{ + int id = threadIdx.x + blockDim.x * blockIdx.x; + if (id < ntime_ints) { + unsigned int maxtime_int = float_to_sortable_int(maxtime); + time_ints[id] = maxtime_int; + } +} + +__global__ void run_daq(curandState *s, int detection_state, float time_rms, + int nphotons, float *photon_times, int *photon_states, + int *last_hit_triangles, int *solid_map, + int nsolids, unsigned int *earliest_time_int) +{ + + int id = threadIdx.x + blockDim.x * blockIdx.x; + + if (id < nphotons) + { + curandState rng = s[id]; + + + + int triangle_id = last_hit_triangles[id]; + + if (triangle_id > -1) + { + int solid_id = solid_map[triangle_id]; + int state = photon_states[id]; + float time = photon_times[id] + curand_normal(&rng) * time_rms; + unsigned int time_int = float_to_sortable_int(time); + + + + if (solid_id < nsolids && state == detection_state) { + atomicMin(earliest_time_int + solid_id, time_int); + } -extern "C" { - __global__ void init_daq_rng(int nthreads, - unsigned long long seed, unsigned long long offset) - { - int id = blockIdx.x*blockDim.x + threadIdx.x; - - if (id >= nthreads) - return; - - curand_init(seed, id, offset, daq_rng_states+id); - } - - __global__ void reset_earliest_time_int(float maxtime, - int ntime_ints, unsigned int *time_ints) - { - int id = threadIdx.x + blockDim.x * blockIdx.x; - if (id < ntime_ints) { - unsigned int maxtime_int = float_to_sortable_int(maxtime); - time_ints[id] = maxtime_int; - } - } - - __global__ void run_daq(int detection_state, float time_rms, - int nphotons, float *photon_times, int *photon_states, - int *last_hit_triangles, int *solid_map, - int nsolids, unsigned int *earliest_time_int) - { - int id = threadIdx.x + blockDim.x * blockIdx.x; - - curandState_t rng = daq_rng_states[id]; - - if (id < nphotons) { - int triangle_id = last_hit_triangles[id]; - - if (triangle_id > -1) { - int solid_id = solid_map[triangle_id]; - int state = photon_states[id]; - float time = photon_times[id] + curand_normal(&rng) * time_rms; - unsigned int time_int = float_to_sortable_int(time); - if (solid_id < nsolids && state == detection_state) - atomicMin(earliest_time_int + solid_id, time_int); - } - } - - daq_rng_states[id] = rng; - } - - __global__ void convert_sortable_int_to_float(int n, - unsigned int *sortable_ints, - float *float_output) - { - int id = threadIdx.x + blockDim.x * blockIdx.x; - - if (id < n) - float_output[id] = sortable_int_to_float(sortable_ints[id]); - } + } + + + + s[id] = rng; + + + + } + + + + +} + +__global__ void convert_sortable_int_to_float(int n, + unsigned int *sortable_ints, + float *float_output) +{ + int id = threadIdx.x + blockDim.x * blockIdx.x; + + if (id < n) + float_output[id] = sortable_int_to_float(sortable_ints[id]); +} } // extern "C" diff --git a/src/kernel.cu b/src/kernel.cu index 584c9e4..55fde70 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -141,26 +141,33 @@ __device__ int intersect_mesh(const float3 &origin, const float3& direction, con return triangle_index; } -__device__ curandState rng_states[100000]; - extern "C" { - __global__ void set_pointer(uint4 *triangle_ptr, float3 *vertex_ptr) +__global__ void fill_float(int nthreads, float *a, float value) { - triangles = triangle_ptr; - vertices = vertex_ptr; + int id = blockIdx.x*blockDim.x + threadIdx.x; + + if (id >= nthreads) + return; + + a[id] = value; } -/* Initialize random number states */ -__global__ void init_rng(int nthreads, unsigned long long seed, unsigned long long offset) +__global__ void fill_float3(int nthreads, float3 *a, float3 value) { int id = blockIdx.x*blockDim.x + threadIdx.x; if (id >= nthreads) return; - curand_init(seed, id, offset, rng_states+id); + a[id] = value; +} + +__global__ void set_pointer(uint4 *triangle_ptr, float3 *vertex_ptr) +{ + triangles = triangle_ptr; + vertices = vertex_ptr; } /* Translate `points` by the vector `v` */ @@ -223,7 +230,7 @@ __global__ void ray_trace(int nthreads, float3 *positions, float3 *directions, i } // ray_trace -__global__ void propagate(int nthreads, float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, int *states, int *last_hit_triangles, int start_node, int first_node, int max_steps) +__global__ void propagate(int nthreads, curandState *rng_states, float3 *positions, float3 *directions, float *wavelengths, float3 *polarizations, float *times, int *states, int *last_hit_triangles, int start_node, int first_node, int max_steps) { int id = blockIdx.x*blockDim.x + threadIdx.x; @@ -445,8 +452,8 @@ __global__ void propagate(int nthreads, float3 *positions, float3 *directions, f } // while(nsteps < max_steps) - states[id] = state; rng_states[id] = rng; + states[id] = state; positions[id] = position; directions[id] = direction; polarizations[id] = polarization; diff --git a/src/random.h b/src/random.h index 74329da..b084c19 100644 --- a/src/random.h +++ b/src/random.h @@ -5,18 +5,54 @@ #include "physical_constants.h" -__device__ float uniform(curandState *rng, float a=0.0, float b=1.0) +__device__ float uniform(curandState *s, const float &low, const float &high) { - return a + curand_uniform(rng)*(b-a); + return low + curand_uniform(s)*(high-low); } -__device__ float3 uniform_sphere(curandState *rng) +__device__ float3 uniform_sphere(curandState *s) { - float theta = uniform(rng, 0, 2*PI); - float u = uniform(rng, -1, 1); - float c = sqrtf(1-u*u); + float theta = uniform(s, 0.0f, 2*PI); + float u = uniform(s, -1.0f, 1.0f); + float c = sqrtf(1.0f-u*u); return make_float3(c*cosf(theta), c*sinf(theta), u); } +extern "C" +{ + +__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x; + + if (id >= nthreads) + return; + + curand_init(seed, id, offset, &s[id]); +} + +__global__ void fill_uniform(int nthreads, curandState *s, float *a, float low, float high) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x; + + if (id >= nthreads) + return; + + a[id] = uniform(&s[id], low, high); + +} + +__global__ void fill_uniform_sphere(int nthreads, curandState *s, float3 *a) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x; + + if (id >= nthreads) + return; + + a[id] = uniform_sphere(&s[id]); +} + +} // extern "c" + #endif diff --git a/threadtest.py b/threadtest.py index c9854a7..6cc9c07 100644 --- a/threadtest.py +++ b/threadtest.py @@ -11,44 +11,31 @@ import math jobs = Queue() output = Queue() -def create_job(position=(0,0,0), nphotons=5000, max_steps=10): - positions = np.tile(position, nphotons).reshape((nphotons, 3)) - directions = uniform_sphere(nphotons) - polarizations = uniform_sphere(nphotons) - wavelengths = np.random.uniform(200, 800, size=nphotons) - times = np.zeros(nphotons) - states = -np.ones(nphotons) - last_hit_triangles = -np.ones(nphotons) - - return Job(positions, directions, wavelengths, polarizations, times, - states, last_hit_triangles, max_steps) - def generate_event(detector, position=(0,0,0), nphotons=5000): - jobs.put(create_job(position, nphotons)) + jobs.put((gpuarray.vec.make_float3(*position), nphotons)) jobs.join() - job = output.get() + earliest_times = output.get() - pmt_times = job.earliest_times[detector.pmtids] + pmt_times = earliest_times[detector.pmtids] event_times = [ (i, t) for i, t in zip(detector.pmtids, pmt_times) if t < 1e8 ] print '%i hit pmts' % len(event_times) return event_times def likelihood(detector, event_times, position=(0,0,0), nphotons=5000, neval=100): for i in range(neval): - jobs.put(create_job(position, nphotons)) + jobs.put((gpuarray.vec.make_float3(*position), nphotons)) jobs.join() t = np.zeros(shape=(neval, max(detector.pmtids)+1), dtype=np.float32) for i in range(neval): - job = output.get() - t[i] = job.earliest_times + t[i] = output.get() log_likelihood = 0.0 log_likelihood_variance = 0.0 for i, time in event_times: h = Histogram(500, (-0.5e-9, 99.5e-9)) - h.fill(t[:,i]) + h.fill(t[:,i][t[:,i] < 1e8]) if h.nentries > 0: h.normalize() |