import time
from stl import *
import numpy as np
from pycuda import autoinit
from pycuda.compiler import SourceModule
import pycuda.driver as cuda
from pycuda import gpuarray
from string import Template

def array2vector(arr, dtype=gpuarray.vec.float3):
    """Pack an (N, 3) array into a 1-D structured array with x/y/z fields.

    Parameters
    ----------
    arr : array-like
        Two-dimensional data with exactly three columns; column 0, 1 and 2
        are copied into the 'x', 'y' and 'z' fields respectively.
    dtype : numpy structured dtype
        Record type of the result.  It must provide 'x', 'y' and 'z'
        fields.  Defaults to ``gpuarray.vec.float3``.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of length N with the requested dtype.

    Raises
    ------
    ValueError
        If `arr` is not two-dimensional with three columns.
    """
    arr = np.asarray(arr)

    if arr.ndim != 2 or arr.shape[-1] != 3:
        raise ValueError('shape mismatch')

    # zeros rather than empty: only x/y/z are assigned below, so any extra
    # field of a wider dtype (this file also passes gpuarray.vec.float4)
    # would otherwise carry uninitialized memory.
    x = np.zeros(arr.shape[0], dtype=dtype)
    x['x'] = arr[:,0]
    x['y'] = arr[:,1]
    x['z'] = arr[:,2]

    return x

print('device %s' % autoinit.device.name())

# Read the kernel source through a context manager so the file handle is
# closed as soon as the text has been loaded.
with open('src/intersect.cu') as source_file:
    source = source_file.read()

# NOTE(review): the include directory is an absolute path under one user's
# home directory and arch is pinned to sm_13; both are kept as they were and
# will need adjusting on other machines/devices.
mod = SourceModule(source, options=['-I /home/tlatorre/projects/chroma/src'], no_extern_c=True, arch='sm_13')

# Kernel entry points looked up by name in the compiled module.
rotate = mod.get_function('rotate')
translate = mod.get_function('translate')
intersect_mesh = mod.get_function('intersect_mesh')

import pygame
size = width, height = 800, 600
screen = pygame.display.set_mode(size, (pygame.NOFRAME | pygame.DOUBLEBUF))

film_size = (0.035, 0.024)
focal_length = 0.05

# The point every ray direction is aimed at.  The film samples are placed
# relative to it and the directions are computed from it, so it is named
# once instead of repeating the literal.
pinhole = np.array((0.0, 200.0, 30.0))

# Sample positions across the film: `width` samples along x and `height`
# samples along z, both centred on zero.
film_x = np.linspace(-film_size[0]/2, film_size[0]/2, width)
film_z = np.linspace(-film_size[1]/2, film_size[1]/2, height)

# One row per pixel, x varying slowest and z fastest (row = ix*height + iz),
# built without a Python-level loop over all width*height pixels.
grid = np.zeros((width*height, 3))
grid[:,0] = np.repeat(film_x, height)
grid[:,2] = np.tile(film_z, width)
grid += (0,focal_length,0)
grid += pinhole

# Ray origins: the film sample positions.
x = array2vector(grid)
x_gpu = cuda.to_device(x)

# Ray directions: unit vectors from each film sample toward the pinhole,
# normalized for all rows at once.
p = pinhole - grid
p /= np.sqrt((p*p).sum(axis=1))[:,np.newaxis]

p = array2vector(p)
p_gpu = cuda.to_device(p)


from zcurve import *

#mesh = read_stl('models/tie_interceptor6.stl')
#mesh = read_stl('models/sphere.stl')
#mesh = read_stl('models/IV.stl')

#from lbne import build_lbne
#mesh = build_lbne()

# Geometry to render.
# NOTE(review): the reshapes below assume read_stl returns one vertex per
# row with three consecutive rows per triangle -- confirm against stl.py.
mesh = read_stl('models/lbne_sphere_only.stl')

# Group the rows three at a time so morton_order receives one (3, 3) entry
# per triangle, then flatten its result back to one vertex per row.
triangle_count = mesh.shape[0]//3
mesh = morton_order(mesh.reshape(triangle_count, 3, 3))
vertex_count = mesh.shape[0]*3
mesh = mesh.reshape(vertex_count, 3)

# Structured (x, y, z) copy of the vertices.
mesh3 = array2vector(mesh)

from build import Graph


#rotate(np.int32(mesh3.size), cuda.InOut(mesh3), np.float32(-np.pi/2), gpuarray.vec.make_float3(1,0,0), block=(256,1,1), grid=(mesh3.size//256+1,1))

#translate(np.int32(mesh3.size), cuda.InOut(mesh3), gpuarray.vec.make_float3(0,30,0), block=(256,1,1), grid=(mesh3.size//256+1,1))

def _bind_linear_texture(name, host_array, channel_format, channels):
    """Upload `host_array` and bind it to the module texture reference `name`.

    Returns the device allocation together with the texture reference so
    that both can stay bound to module-level names.
    """
    device_mem = cuda.to_device(host_array)
    texref = mod.get_texref(name)
    texref.set_address(device_mem, host_array.nbytes)
    texref.set_format(channel_format, channels)
    return device_mem, texref


graph = Graph(mesh3)

# Bounds taken from the graph, packed as float4 records and exposed to the
# kernels as four-channel float textures.
lower = array2vector(graph.lower, dtype=gpuarray.vec.float4)
upper = array2vector(graph.upper, dtype=gpuarray.vec.float4)
lower_gpu, lower_tex = _bind_linear_texture(
    'lower_bound_arr', lower, cuda.array_format.FLOAT, 4)
upper_gpu, upper_tex = _bind_linear_texture(
    'upper_bound_arr', upper, cuda.array_format.FLOAT, 4)

# graph.start / graph.count as unsigned 32-bit, exposed as single-channel
# integer textures.
start = graph.start.astype(np.uint32)
count = graph.count.astype(np.uint32)
start_gpu, child_map_tex = _bind_linear_texture(
    'child_map_arr', start, cuda.array_format.UNSIGNED_INT32, 1)
count_gpu, child_len_tex = _bind_linear_texture(
    'child_len_arr', count, cuda.array_format.UNSIGNED_INT32, 1)

# Zero-filled int32 buffer with one entry per bound record, copied to the
# device.  NOTE(review): stack_gpu is not passed to any of the kernel
# launches visible in this script.
stack = np.zeros(lower.size, dtype=np.int32)
stack_gpu = cuda.mem_alloc(stack.nbytes)
cuda.memcpy_htod(stack_gpu, stack)

# Repack the vertices as float4 records for the four-channel 'mesh' texture.
# zeros rather than empty: only x/y/z are copied from mesh3, so the fourth
# component would otherwise be uploaded as uninitialized memory.
mesh = np.zeros(mesh3.size, dtype=gpuarray.vec.float4)
for field in ('x', 'y', 'z'):
    mesh[field] = mesh3[field]

mesh_gpu = cuda.to_device(mesh)
mesh_tex = mod.get_texref('mesh')
mesh_tex.set_address(mesh_gpu, mesh.nbytes)
mesh_tex.set_format(cuda.array_format.FLOAT, 4)

# Flat host-side frame buffer with one int32 per screen pixel.  Its initial
# contents are uninitialized (np.empty); a device buffer of the same size is
# allocated and filled from it.
pixel = np.empty(width*height, dtype=np.int32)

pixel_gpu = cuda.to_device(pixel)

# Per-frame statistics for the intersect_mesh kernel only.
speed = []
elapsed = []

t0total = time.time()

block_size = 64
# Every kernel in the loop is launched with the same configuration: one
# thread per pixel, rounded up to a whole number of blocks.
launch = dict(block=(block_size,1,1), grid=(width*height//block_size+1,1))
step = np.float32(np.pi/100)
z_axis = gpuarray.vec.make_float3(0,0,1)

for i in range(1000):
    # Service the window system's event queue once per frame so the display
    # stays responsive while the loop runs.
    pygame.event.pump()

    # Move the camera: ray origins and directions both go through `rotate`
    # with the same step and axis, then the origins go through `translate`.
    rotate(np.int32(x.size), x_gpu, step, z_axis, **launch)

    rotate(np.int32(p.size), p_gpu, step, z_axis, **launch)

    translate(np.int32(x.size), x_gpu,
              gpuarray.vec.make_float3(-np.sin(i*np.pi/100),-np.cos(i*np.pi/100),0),
              **launch)

    # Kernel launches return before the kernels finish; wait for the camera
    # kernels here so their run time is not counted in the measurement below.
    cuda.Context.synchronize()

    t0 = time.time()
    intersect_mesh(np.int32(x.size), x_gpu, p_gpu, pixel_gpu, np.int32(graph.first_leaf),
                   texrefs=[mesh_tex, upper_tex, lower_tex, child_map_tex, child_len_tex],
                   **launch)
    cuda.Context.synchronize()

    elapsed.append(time.time() - t0)

    print('%i triangles, %i photons, %f sec; (%f photons/sec)' %
          (mesh.size//3, pixel.size, elapsed[-1], pixel.size/elapsed[-1]))

    speed.append(pixel.size/elapsed[-1])

    # Fetch the rendered frame and show it.
    cuda.memcpy_dtoh(pixel, pixel_gpu)
    pygame.surfarray.blit_array(screen, pixel.reshape(size))
    pygame.display.flip()

print('average time = %f sec' % np.mean(elapsed))
print('average speed = %f photons/sec' % np.mean(speed))
print('total time = %f sec' % (time.time() - t0total))

raw_input('press enter to exit')