import numpy as np
import pytools
import pycuda.tools
from pycuda import characterize
import pycuda.driver as cuda
import pycuda.compiler
from pycuda import gpuarray as ga

from chroma.cuda import srcdir

# standard nvcc options
cuda_options = ('--use_fast_math',) #, '--ptxas-options=-v'

@pycuda.tools.context_dependent_memoize
def get_cu_module(name, options=None, include_source_directory=True):
    """Returns a pycuda.compiler.SourceModule object from a CUDA source file
    located in the chroma cuda directory at cuda/[name]."""
    if options is None:
        options = []
    elif isinstance(options, tuple):
        options = list(options)
    else:
        raise TypeError('`options` must be a tuple.')

    if include_source_directory:
        options += ['-I' + srcdir]

    with open('%s/%s' % (srcdir, name)) as f:
        source = f.read()

    return pycuda.compiler.SourceModule(source, options=options,
                                        no_extern_c=True)

@pytools.memoize
def get_cu_source(name):
    """Get the source code for a CUDA source file located in the chroma cuda
    directory at cuda/[name]."""
    with open('%s/%s' % (srcdir, name)) as f:
        source = f.read()
    return source

class GPUFuncs(object):
    """Simple container class for GPU functions as attributes."""
    def __init__(self, module):
        self.module = module
        self.funcs = {}

    def __getattr__(self, name):
        try:
            return self.funcs[name]
        except KeyError:
            f = self.module.get_function(name)
            self.funcs[name] = f
            return f

init_rng_src = """
#include <curand_kernel.h>

extern "C"
{

__global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset)
{
    int id = blockIdx.x*blockDim.x + threadIdx.x;

    if (id >= nthreads)
        return;

    curand_init(seed, id, offset, &s[id]);
}

} // extern "C"
"""

def get_rng_states(size, seed=1):
    "Return `size` number of CUDA random number generator states."
    rng_states = cuda.mem_alloc(size*characterize.sizeof('curandStateXORWOW',
                                                         '#include <curand_kernel.h>'))

    module = pycuda.compiler.SourceModule(init_rng_src, no_extern_c=True)
    init_rng = module.get_function('init_rng')

    init_rng(np.int32(size), rng_states, np.uint64(seed), np.uint64(0),
             block=(64,1,1), grid=(size//64+1,1))

    return rng_states

def to_float3(arr):
    "Returns a pycuda.gpuarray.vec.float3 array from an (N,3) array."
    if not arr.flags['C_CONTIGUOUS']:
        arr = np.asarray(arr, order='c')
    return arr.astype(np.float32).view(ga.vec.float3)[:,0]

def to_uint3(arr):
    "Returns a pycuda.gpuarray.vec.uint3 array from an (N,3) array."
    if not arr.flags['C_CONTIGUOUS']:
        arr = np.asarray(arr, order='c')
    return arr.astype(np.uint32).view(ga.vec.uint3)[:,0]

def chunk_iterator(nelements, nthreads_per_block=64, max_blocks=1024):
    """Iterator that yields tuples with the values required to process
    a long array in multiple kernel passes on the GPU.

    Each yielded value is of the form,
        (first_index, elements_this_iteration, nblocks_this_iteration)

    Example:
        >>> list(chunk_iterator(300, 32, 2))
        [(0, 64, 2), (64, 64, 2), (128, 64, 2), (192, 64, 2), (256, 44, 2)]
    """
    first = 0
    while first < nelements:
        elements_left = nelements - first
        blocks = int(elements_left // nthreads_per_block)
        if elements_left % nthreads_per_block != 0:
            blocks += 1 # Round up only if needed
        blocks = min(max_blocks, blocks)
        elements_this_round = min(elements_left, blocks * nthreads_per_block)

        yield (first, elements_this_round, blocks)
        first += elements_this_round
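# Illustrative usage sketch (an addition, not part of the original module):
# the typical pattern for combining GPUFuncs and chunk_iterator to drive a
# kernel over a long device array in bounded-size passes. The file name
# 'example.cu', the kernel name 'example_kernel', and the array 'data_gpu'
# are hypothetical placeholders.
#
#   module = get_cu_module('example.cu', options=cuda_options)
#   funcs = GPUFuncs(module)
#   for first, nelements, nblocks in chunk_iterator(data_gpu.size, 64, 1024):
#       funcs.example_kernel(np.int32(first), np.int32(nelements), data_gpu,
#                            block=(64,1,1), grid=(nblocks,1))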
def create_cuda_context(device_id=None):
    """Initialize and return a CUDA context on the specified device.

    If device_id is None, the default device is used."""
    if device_id is None:
        try:
            context = pycuda.tools.make_default_context()
        except cuda.LogicError:
            # initialize cuda
            cuda.init()
            context = pycuda.tools.make_default_context()
    else:
        try:
            device = cuda.Device(device_id)
        except cuda.LogicError:
            # initialize cuda
            cuda.init()
            device = cuda.Device(device_id)
        context = device.make_context()

    context.set_cache_config(cuda.func_cache.PREFER_L1)

    return context

vec_dtypes = set([x for x in ga.vec.__dict__.values() if type(x) == np.dtype])

def make_gpu_struct(size, members):
    """Allocate `size` bytes on the device and fill them with the packed
    contents of `members` (device pointers and/or numpy scalars), returning
    the device allocation."""
    struct = cuda.mem_alloc(size)

    i = 0
    for member in members:
        if isinstance(member, ga.GPUArray):
            member = member.gpudata

        if isinstance(member, cuda.DeviceAllocation):
            if i % 8:
                raise Exception('cannot align 64-bit pointer. '
                                'arrange struct member variables in order of '
                                'decreasing size.')

            cuda.memcpy_htod(int(struct)+i, np.asarray(np.intp(int(member))))
            i += 8
        elif np.isscalar(member) or (hasattr(member, 'dtype') and
                                     member.dtype in vec_dtypes and
                                     member.shape == ()):
            cuda.memcpy_htod(int(struct)+i, np.asarray(member))
            i += member.nbytes
        else:
            raise TypeError('expected a GPU device pointer or scalar type.')

    return struct

def format_size(size):
    if size < 1e3:
        return '%.1f%s' % (size, ' ')
    elif size < 1e6:
        return '%.1f%s' % (size/1e3, 'K')
    elif size < 1e9:
        return '%.1f%s' % (size/1e6, 'M')
    else:
        return '%.1f%s' % (size/1e9, 'G')

def format_array(name, array):
    return '%-15s %6s %6s' % \
        (name, format_size(len(array)), format_size(array.nbytes))

def Mapped(array):
    '''Analog to pycuda.driver.InOut(), but indicates this array is memory
    mapped to the device space and should not be copied.

    To simplify coding, Mapped() will pass anything with a gpudata member,
    like a gpuarray, through unchanged.
    '''
    if hasattr(array, 'gpudata'):
        return array
    else:
        return np.intp(array.base.get_device_pointer())

def mapped_alloc(pagelocked_alloc_func, shape, dtype, write_combined):
    '''Returns a pagelocked host array mapped into the CUDA device address
    space, with a gpudata field set so it just works with CUDA functions.'''
    flags = cuda.host_alloc_flags.DEVICEMAP
    if write_combined:
        flags |= cuda.host_alloc_flags.WRITECOMBINED
    array = pagelocked_alloc_func(shape=shape, dtype=dtype, mem_flags=flags)
    return array

def mapped_empty(shape, dtype, write_combined=False):
    '''See mapped_alloc()'''
    return mapped_alloc(cuda.pagelocked_empty, shape, dtype, write_combined)

def mapped_empty_like(other, write_combined=False):
    '''See mapped_alloc()'''
    return mapped_alloc(cuda.pagelocked_empty, other.shape, other.dtype,
                        write_combined)

def mapped_zeros(shape, dtype, write_combined=False):
    '''See mapped_alloc()'''
    return mapped_alloc(cuda.pagelocked_zeros, shape, dtype, write_combined)

def mapped_zeros_like(other, write_combined=False):
    '''See mapped_alloc()'''
    return mapped_alloc(cuda.pagelocked_zeros, other.shape, other.dtype,
                        write_combined)
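# Minimal smoke test (an addition for illustration, not part of the original
# module's API): create a context, allocate a pool of RNG states, and print
# the chunking example from the chunk_iterator docstring. Requires a
# CUDA-capable device and only runs when this file is executed directly.
if __name__ == '__main__':
    context = create_cuda_context()
    try:
        rng_states = get_rng_states(1024, seed=1)  # device-side curand states
        print(list(chunk_iterator(300, 32, 2)))
    finally:
        context.pop()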