diff options
Diffstat (limited to 'test.py')
-rw-r--r-- | test.py | 44 |
1 files changed, 24 insertions, 20 deletions
@@ -6,11 +6,11 @@ from pycuda.compiler import SourceModule
 import pycuda.driver as cuda
 from pycuda import gpuarray
-def array2float3(arr):
+def array2vector(arr, dtype=gpuarray.vec.float3):
     if len(arr.shape) != 2 or arr.shape[-1] != 3:
         raise Exception('shape mismatch')
-    x = np.empty(arr.shape[0], dtype=gpuarray.vec.float3)
+    x = np.empty(arr.shape[0], dtype=dtype)
     x['x'] = arr[:,0]
     x['y'] = arr[:,1]
     x['z'] = arr[:,2]
@@ -23,12 +23,11 @@ source = open('src/linalg.h').read() + open('src/matrix.h').read() + \
     open('src/rotate.h').read() + open('src/intersect.cu').read()
 mod = SourceModule(source, no_extern_c=True, arch='sm_13')
+
 intersect = mod.get_function('intersect_triangle_mesh')
 rotate = mod.get_function('rotate')
 translate = mod.get_function('translate')
-mesh = array2float3(read_stl('models/tie_interceptor6.stl'))
-
 import pygame
 size = width, height = 800, 600
 screen = pygame.display.set_mode(size)
@@ -44,34 +43,39 @@ grid = np.array(grid)
 grid += (0,focal_length,0)
 grid += (0,300,0)
-x = array2float3(grid)
-p = array2float3(((0,300,0)-grid))
+x = array2vector(grid)
+x_gpu = cuda.to_device(x)
-x_gpu = cuda.mem_alloc(x.nbytes)
-cuda.memcpy_htod(x_gpu,x)
+p = array2vector(((0,300,0)-grid))
+p_gpu = cuda.to_device(p)
-p_gpu = cuda.mem_alloc(p.nbytes)
-cuda.memcpy_htod(p_gpu,p)
+mesh3 = array2vector(read_stl('models/tie_interceptor6.stl'))
-mesh_gpu = cuda.mem_alloc(mesh.nbytes)
-cuda.memcpy_htod(mesh_gpu,mesh)
+rotate(np.int32(mesh3.size), cuda.InOut(mesh3), np.float32(-np.pi/2), gpuarray.vec.make_float3(1,0,0), block=(256,1,1), grid=(mesh3.size//256+1,1))
-pixel = np.empty(size, dtype=np.int32).flatten()
-pixel_gpu = cuda.mem_alloc(pixel.nbytes)
-cuda.memcpy_htod(pixel_gpu,pixel)
+translate(np.int32(mesh3.size), cuda.InOut(mesh3), gpuarray.vec.make_float3(0,30,0), block=(256,1,1), grid=(mesh3.size//256+1,1))
-rotate(np.int32(mesh.size), mesh_gpu, np.float32(-np.pi/2), gpuarray.vec.make_float3(1,0,0), block=(256,1,1), grid=(mesh.size//256+1,1))
+mesh = np.empty(mesh3.size, dtype=gpuarray.vec.float4)
+mesh['x'] = mesh3['x']
+mesh['y'] = mesh3['y']
+mesh['z'] = mesh3['z']
-translate(np.int32(mesh.size), mesh_gpu, gpuarray.vec.make_float3(0,30,0), block=(256,1,1), grid=(mesh.size//256+1,1))
+mesh_gpu = cuda.to_device(mesh)
+mesh_tex = mod.get_texref('mesh')
+mesh_tex.set_address(mesh_gpu, mesh.nbytes)
+mesh_tex.set_format(cuda.array_format.FLOAT, 4)
+pixel = np.empty(size, dtype=np.int32).flatten()
+pixel_gpu = cuda.to_device(pixel)
+block_size = 64
 for i in range(100):
-    rotate(np.int32(x.size), x_gpu, np.float32(np.pi/100), gpuarray.vec.make_float3(0,0,1), block=(256,1,1), grid=(width*height//256+1,1))
+    rotate(np.int32(x.size), x_gpu, np.float32(np.pi/100), gpuarray.vec.make_float3(0,0,1), block=(block_size,1,1), grid=(width*height//block_size+1,1))
-    rotate(np.int32(p.size), p_gpu, np.float32(np.pi/100), gpuarray.vec.make_float3(0,0,1), block=(256,1,1), grid=(width*height//256+1,1))
+    rotate(np.int32(p.size), p_gpu, np.float32(np.pi/100), gpuarray.vec.make_float3(0,0,1), block=(block_size,1,1), grid=(width*height//block_size+1,1))
     t0 = time.time()
-    intersect(np.int32(x.size), x_gpu, p_gpu, np.int32(mesh.size//3), mesh_gpu, pixel_gpu, block=(64,1,1), grid=(width*height//64+1,1))
+    intersect(np.int32(x.size), x_gpu, p_gpu, np.int32(mesh.size//3), pixel_gpu, block=(block_size,1,1), grid=(width*height//block_size+1,1), texrefs=[mesh_tex])
     cuda.Context.synchronize()
     elapsed = time.time() - t0