3 files changed, 35 insertions, 25 deletions
diff --git a/src/intersect.cu b/src/intersect.cu
index 0bb73a0..c78350e 100644
--- a/src/intersect.cu
+++ b/src/intersect.cu
@@ -1,5 +1,7 @@
 //-*-c-*-
 
+texture<float4, 1, cudaReadModeElementType> mesh; // 1D float4 texture; read with tex1Dfetch in intersect_triangle_mesh, bound by the host under the name 'mesh'
+
 __device__ bool intersect_triangle(const float3 &x, const float3 &p, const float3 &v0, const float3 &v1, const float3 &v2, float3 &intersection)
 {
 	Matrix m = make_matrix(v1-v0, v2-v0, -p);
@@ -45,6 +47,11 @@ __device__ int get_color(const float3 &p, const float3 &v0, const float3& v1, co
 	return rgb*65536 + rgb*256 + rgb;
 }
 
+__device__ float3 make_float3(const float4 &a) // float4 -> float3 overload: keeps x, y, z and discards w
+{
+	return make_float3(a.x, a.y, a.z);
+}
+
 extern "C"
 {
 
@@ -68,7 +75,7 @@ __global__ void rotate(int max_idx, float3 *x, float phi, float3 axis)
 	x[idx] = rotate(x[idx], phi, axis);
 }
 
-__global__ void intersect_triangle_mesh(int max_idx, float3 *xarr, float3 *parr, int n, float3* mesh, int *pixelarr)
+__global__ void intersect_triangle_mesh(int max_idx, float3 *xarr, float3 *parr, int n, int *pixelarr)
 {
 	int idx = blockIdx.x*blockDim.x + threadIdx.x;
 
@@ -87,9 +94,9 @@ __global__ void intersect_triangle_mesh(int max_idx, float3 *xarr, float3 *parr,
 	int i;
 	for (i=0; i < n; i++)
 	{
-		float3 v0 = *(mesh+3*i);
-		float3 v1 = *(mesh+3*i+1);
-		float3 v2 = *(mesh+3*i+2);
+		float3 v0 = make_float3(tex1Dfetch(mesh, 3*i));
+		float3 v1 = make_float3(tex1Dfetch(mesh, 3*i+1));
+		float3 v2 = make_float3(tex1Dfetch(mesh, 3*i+2));
 			
 		if (intersect_triangle(x, p, v0, v1, v2, intersection))
 		{
diff --git a/src/rotate.h b/src/rotate.h
index 52d6d6a..fec76a8 100644
--- a/src/rotate.h
+++ b/src/rotate.h
@@ -2,7 +2,6 @@
 #define __ROTATE_H__
 
 __device__ const Matrix IDENTITY_MATRIX = {1,0,0,0,1,0,0,0,1};
-__device__ const Matrix ZERO_MATRIX = {0,0,0,0,0,0,0,0,0};
 
 __device__ __host__ Matrix make_rotation_matrix(float phi, const float3 &n)
 {
diff --git a/test.py b/test.py
index 267b266..4390c50 100644
--- a/test.py
+++ b/test.py
@@ -6,11 +6,11 @@ from pycuda.compiler import SourceModule
 import pycuda.driver as cuda
 from pycuda import gpuarray
 
-def array2float3(arr):
+def array2vector(arr, dtype=gpuarray.vec.float3):
     if len(arr.shape) != 2 or arr.shape[-1] != 3:
         raise Exception('shape mismatch')
 
-    x = np.empty(arr.shape[0], dtype=gpuarray.vec.float3)
+    x = np.empty(arr.shape[0], dtype=dtype)
     x['x'] = arr[:,0]
     x['y'] = arr[:,1]
     x['z'] = arr[:,2]
@@ -23,12 +23,11 @@ source = open('src/linalg.h').read() + open('src/matrix.h').read() + \
     open('src/rotate.h').read() + open('src/intersect.cu').read()
 
 mod = SourceModule(source, no_extern_c=True, arch='sm_13')
+
 intersect = mod.get_function('intersect_triangle_mesh')
 rotate = mod.get_function('rotate')
 translate = mod.get_function('translate')
 
-mesh = array2float3(read_stl('models/tie_interceptor6.stl'))
-
 import pygame
 size = width, height = 800, 600
 screen = pygame.display.set_mode(size)
@@ -44,34 +43,39 @@ grid = np.array(grid)
 grid += (0,focal_length,0)
 grid += (0,300,0)
 
-x = array2float3(grid)
-p = array2float3(((0,300,0)-grid))
+x = array2vector(grid)
+x_gpu = cuda.to_device(x)
 
-x_gpu = cuda.mem_alloc(x.nbytes)
-cuda.memcpy_htod(x_gpu,x)
+p = array2vector(((0,300,0)-grid))
+p_gpu = cuda.to_device(p)
 
-p_gpu = cuda.mem_alloc(p.nbytes)
-cuda.memcpy_htod(p_gpu,p)
+mesh3 = array2vector(read_stl('models/tie_interceptor6.stl'))
 
-mesh_gpu = cuda.mem_alloc(mesh.nbytes)
-cuda.memcpy_htod(mesh_gpu,mesh)
+rotate(np.int32(mesh3.size), cuda.InOut(mesh3), np.float32(-np.pi/2), gpuarray.vec.make_float3(1,0,0), block=(256,1,1), grid=(mesh3.size//256+1,1))
 
-pixel = np.empty(size, dtype=np.int32).flatten()
-pixel_gpu = cuda.mem_alloc(pixel.nbytes)
-cuda.memcpy_htod(pixel_gpu,pixel)
+translate(np.int32(mesh3.size), cuda.InOut(mesh3), gpuarray.vec.make_float3(0,30,0), block=(256,1,1), grid=(mesh3.size//256+1,1))
 
-rotate(np.int32(mesh.size), mesh_gpu, np.float32(-np.pi/2), gpuarray.vec.make_float3(1,0,0), block=(256,1,1), grid=(mesh.size//256+1,1))
+mesh = np.empty(mesh3.size, dtype=gpuarray.vec.float4)  # float4 copy of the float3 vertices; 'w' is never assigned and stays uninitialized
+mesh['x'] = mesh3['x']
+mesh['y'] = mesh3['y']
+mesh['z'] = mesh3['z']
 
-translate(np.int32(mesh.size), mesh_gpu, gpuarray.vec.make_float3(0,30,0), block=(256,1,1), grid=(mesh.size//256+1,1))
+mesh_gpu = cuda.to_device(mesh)  # copy the float4 vertex array to the device
+mesh_tex = mod.get_texref('mesh')  # look up the texture reference named 'mesh' in the compiled module
+mesh_tex.set_address(mesh_gpu, mesh.nbytes)  # point the texture at the uploaded buffer, covering all of its bytes
+mesh_tex.set_format(cuda.array_format.FLOAT, 4)  # FLOAT format with 4 components per element
 
+pixel = np.empty(size, dtype=np.int32).flatten()
+pixel_gpu = cuda.to_device(pixel)
 
+block_size = 64
 for i in range(100):
-    rotate(np.int32(x.size), x_gpu, np.float32(np.pi/100), gpuarray.vec.make_float3(0,0,1), block=(256,1,1), grid=(width*height//256+1,1))
+    rotate(np.int32(x.size), x_gpu, np.float32(np.pi/100), gpuarray.vec.make_float3(0,0,1), block=(block_size,1,1), grid=(width*height//block_size+1,1))
 
-    rotate(np.int32(p.size), p_gpu, np.float32(np.pi/100), gpuarray.vec.make_float3(0,0,1), block=(256,1,1), grid=(width*height//256+1,1))
+    rotate(np.int32(p.size), p_gpu, np.float32(np.pi/100), gpuarray.vec.make_float3(0,0,1), block=(block_size,1,1), grid=(width*height//block_size+1,1))
 
     t0 = time.time()
-    intersect(np.int32(x.size), x_gpu, p_gpu, np.int32(mesh.size//3), mesh_gpu, pixel_gpu, block=(64,1,1), grid=(width*height//64+1,1))
+    intersect(np.int32(x.size), x_gpu, p_gpu, np.int32(mesh.size//3), pixel_gpu, block=(block_size,1,1), grid=(width*height//block_size+1,1), texrefs=[mesh_tex])
     cuda.Context.synchronize()
 
     elapsed = time.time() - t0