#include "poisson.hh"

#include "cuda_helpers.hh"

#include <cutil_math.h>
#include <cutil_inline.h>
#include <assert.h>

// Symbolic values for the foreground/background selection mask.
// NOTE(review): neither macro is referenced in the visible part of this file.
#define POISSON_MASK_FOREGROUND 1
#define POISSON_MASK_BACKGROUND 0

// File-local state used for poisson blending.
// All pointers start out null; the device buffers and PBOs are created by
// poissonAllocateMemory() / computeGradients() and are never released in the
// visible part of this file.
static PixelBufferObject * s_blend = 0;      ///< poisson blended image (created in poissonAllocateMemory, drawn in poissonDisplay)
static float3 * s_poisson_buffer_0 = 0;      ///< work buffer 0: filled by initWorkBuffer in poissonInit
static float3 * s_poisson_buffer_1 = 0;      ///< work buffer 1: copy of buffer 0 after poissonInit; converted to the blend image in runPoissonKernel
static float3 * s_x_gradients = 0;           ///< gradients in the x-direction (allocated in computeGradients)
static float3 * s_y_gradients = 0;           ///< gradients in the y-direction (allocated in computeGradients)
static char * s_mask = 0;                    ///< fg/bg selection mask; declared char but filled from a bool* in poissonInit
static PixelBufferObject * s_x_grad_img = 0; ///< x gradients img (for debugging)
static PixelBufferObject * s_y_grad_img = 0; ///< y gradients img (for debugging)

static const dim3 s_block_size(16, 32, 1);	 ///< block size for computation (16 x 32 = 512 threads per block)
static dim3 s_n_blocks(0, 0, 0);             ///< grid size; derived from the fg size in poissonAllocateMemory, invalid for launches until then


/// Writes a float3 array out as a uchar4 image, adding a constant offset to
/// every element before the conversion.
///
/// Each thread obtains its pixel coordinate from get_coord() and passes it
/// through CHECK_OUT_OF_BOUNDS before touching memory (both helpers are
/// defined outside this file; presumably one thread maps to one pixel).
__global__
void arrayToImgKernel(
	float3 * array,    ///< [in]  source values, one float3 per pixel
	uint2 dim,         ///< [in]  image dimensions
	float3 offset,     ///< [in]  amount added to each value before conversion
	uchar4 * img       ///< [out] the converted image
	)
{
	// locate this thread's pixel and validate it against the image bounds
	uint2 pixel = get_coord();
	CHECK_OUT_OF_BOUNDS(pixel, dim);

	// shift the stored value, then narrow it to the 8-bit image format
	unsigned int flat = to_indx(pixel, dim);
	const float3 shifted = array[flat] + offset;
	img[flat] = make_uchar4(shifted);
}


// TODO: write a kernel function that computes the initial gradients
// of the foreground image, in a MAC-style grid




/// Compute the gradients of the foreground image.
///
/// Allocates the x/y gradient device buffers (one float3 per foreground
/// pixel), zero-fills them, and renders each buffer into a debug
/// PixelBufferObject with a +128 offset so that a zero gradient shows up as
/// mid-grey.
///
/// The gradient kernel itself is still a TODO. Until it exists the buffers
/// hold zeros rather than whatever happened to be in freshly allocated device
/// memory, so the debug images are well defined.
///
/// Uses s_n_blocks for the launches, so it must run after the grid size has
/// been computed (poissonAllocateMemory does this before calling here).
///
/// \param fg  the foreground image; only its size and device mapping are used
void computeGradients(PixelBufferObject & fg)
{
	// create the gradient arrays
	printf("Initializing the gradient arrays.\n");
	const unsigned int ww = fg.getWidth();
	const unsigned int hh = fg.getHeight();
	// do the size arithmetic in size_t so large images cannot wrap 32 bits
	const size_t gradient_buffer_size =
		static_cast<size_t>(ww) * hh * sizeof(float3);
	cudaMalloc((void **) &s_x_gradients, gradient_buffer_size);
	CHECK_FOR_CUDA_ERROR();
	cudaMalloc((void **) &s_y_gradients, gradient_buffer_size);
	CHECK_FOR_CUDA_ERROR();

	// cudaMalloc does not clear memory; start from all-zero gradients so
	// the debug conversion below never reads uninitialized values
	// (all-zero bytes are 0.0f for IEEE floats)
	cudaMemset(s_x_gradients, 0, gradient_buffer_size);
	cudaMemset(s_y_gradients, 0, gradient_buffer_size);
	CHECK_FOR_CUDA_ERROR();
	
	// compute the gradients
	printf("Computing gradients.\n");
	{
		// allocate this object in the {} block to ensure
		// that the map destructor is called when the
		// block ends

		// create a memory map so that CUDA can see the PBO
		PixelBufferObject::MemoryMap map(fg.mapDeviceMemory());
		
		// TODO: call the gradient kernel function to compute the x and y
		// gradients for the initial foreground image
		
	}
	
	// for debugging purposes, look at these as images
	printf("Initializing the gradient image arrays.\n");
	s_x_grad_img = new PixelBufferObject(ww, hh);
	s_y_grad_img = new PixelBufferObject(ww, hh);
	
	// Convert to images for debugging purposes.
	printf("Computing gradient images.\n");
	{
		PixelBufferObject::MemoryMap map(s_x_grad_img->mapDeviceMemory());
		arrayToImgKernel<<<s_n_blocks, s_block_size>>>(
			s_x_gradients, make_uint2(ww, hh), make_float3(128.0f), map.getPointer());
		// kernel launches report configuration errors only through the
		// runtime's last-error state, so check right after the launch
		CHECK_FOR_CUDA_ERROR();
	}
	{
		PixelBufferObject::MemoryMap map(s_y_grad_img->mapDeviceMemory());
		arrayToImgKernel<<<s_n_blocks, s_block_size>>>(
			s_y_gradients, make_uint2(ww, hh), make_float3(128.0f), map.getPointer());
		CHECK_FOR_CUDA_ERROR();
	}	
}

/// Sets up all state needed for blending against the given foreground image.
///
/// Computes the launch grid from the foreground size, builds the gradient
/// buffers via computeGradients(), allocates the mask and the two Poisson
/// work buffers on the device, and creates the output blend PBO.
///
/// \param fg  the foreground image; its size determines every buffer size
void poissonAllocateMemory(
	PixelBufferObject & fg
	)
{	
	// compute number of blocks
	uint2 size = fg.getSize();
	const unsigned int ww = size.x;
	const unsigned int hh = size.y;
	// Round up so that images whose width/height are not multiples of the
	// block size still get their right/bottom edges processed. The surplus
	// threads in the last row/column of blocks fall outside the image; the
	// kernels in this file run CHECK_OUT_OF_BOUNDS before any memory access
	// (NOTE(review): macro is defined elsewhere - confirm it returns early),
	// and any kernel added later must do the same.
	s_n_blocks = dim3(
		(ww + s_block_size.x - 1) / s_block_size.x,
		(hh + s_block_size.y - 1) / s_block_size.y,
		1);

	CHECK_FOR_CUDA_ERROR();
	
	// compute gradients
	computeGradients(fg);
	
	// init the intermediate buffers for working
	// (sizes computed in size_t so large images cannot wrap 32 bits)
	const size_t n_pixels = static_cast<size_t>(ww) * hh;
	cudaMalloc((void **) &s_mask,             n_pixels * sizeof(char));
	CHECK_FOR_CUDA_ERROR();
	cudaMalloc((void **) &s_poisson_buffer_0, n_pixels * sizeof(float3));
	CHECK_FOR_CUDA_ERROR();
	cudaMalloc((void **) &s_poisson_buffer_1, n_pixels * sizeof(float3));
	CHECK_FOR_CUDA_ERROR();

	// init the output blend buffer
	s_blend = new PixelBufferObject(ww, hh);
}

/// Seeds the Poisson work buffer from the two source images.
///
/// For every pixel the mask decides which image supplies the starting value:
/// a true mask entry takes the foreground pixel, a false entry takes the
/// background pixel. The chosen uchar4 is converted to float3 on the way in.
__global__
void initWorkBuffer(
	uchar4 * bg,         ///< [in]  background image
	uchar4 * fg,         ///< [in]  foreground image
	bool * mask,         ///< [in]  fg/bg selection mask (true selects fg)
	uint2 size,          ///< [in]  the array dimensions
	float3 * array       ///< [out] the output array
	)
{
	// locate this thread's pixel and validate it against the array bounds
	uint2 pixel = get_coord();
	CHECK_OUT_OF_BOUNDS(pixel, size);

	unsigned int flat = to_indx(pixel, size);

	// pick the source image according to the mask, then convert its pixel
	uchar4 * source = mask[flat] ? fg : bg;
	array[flat] = make_float3(source[flat]);
}

/// Loads a new blending problem into the work buffers.
///
/// Copies the selection mask into s_mask, fills work buffer 0 with the
/// foreground pixel where the mask is set and the background pixel elsewhere,
/// and duplicates the result into work buffer 1.
///
/// Must be called after poissonAllocateMemory(), which creates s_blend, the
/// device buffers and the launch grid used here.
///
/// \param bg    background image; passed straight to a kernel and used with
///              device-to-device copies, so it must be a device pointer
/// \param fg    foreground image (device pointer, same size as the blend)
/// \param mask  fg/bg selection mask (device pointer, one entry per pixel)
void poissonInit(
	uchar4 * bg,
	uchar4 * fg,
	bool * mask
	)
{
	// s_blend only exists once poissonAllocateMemory() has run
	assert(s_blend != 0);

	uint2 size = s_blend->getSize();
	// do the size arithmetic in size_t so large images cannot wrap 32 bits
	const size_t n_pixels = static_cast<size_t>(size.x) * size.y;
	
	// copy the mask data over
	// (s_mask is allocated with sizeof(char) per pixel, so the copy uses the
	// same element size; this assumes sizeof(bool) == 1 for the source)
	cudaMemcpy(s_mask, mask, n_pixels * sizeof(char),
		cudaMemcpyDeviceToDevice);
	CHECK_FOR_CUDA_ERROR();
	
	// initialize the work buffer, which will contain the current
	// best guess of the Poisson blend
	initWorkBuffer<<<s_n_blocks, s_block_size>>>(
		bg, fg, mask, size, s_poisson_buffer_0);
	// launches report bad configurations only via the runtime's last-error
	// state, so check immediately
	CHECK_FOR_CUDA_ERROR();
	
	// copy the data from one work buffer to the other
	cudaMemcpy(s_poisson_buffer_1, s_poisson_buffer_0, 
		n_pixels * sizeof(float3), cudaMemcpyDeviceToDevice);
	CHECK_FOR_CUDA_ERROR();
}

// TODO: write kernel function for a single Poisson iteration



/// Advances the Poisson blend and publishes the current result to the
/// blend PBO so it can be rendered.
///
/// The iteration step itself is not implemented yet (see the TODO); at
/// present this only converts work buffer 1 into the blend image.
void runPoissonKernel()
{
	const uint2 blend_dims = s_blend->getSize();

	// TODO: run the Poisson iteration kernel and swap the buffers



	// Map the blend PBO for CUDA and convert the float3 work buffer into
	// it with no offset. The mapping object lives until the function
	// returns, at which point its destructor runs.
	PixelBufferObject::MemoryMap blend_map(s_blend->mapDeviceMemory());
	arrayToImgKernel<<<s_n_blocks, s_block_size>>>(
		s_poisson_buffer_1, blend_dims, make_float3(0.0f), blend_map.getPointer());
}

/// Refreshes the blend image and draws it as a single textured quad.
///
/// The quad spans (0, 0) to (width, height) of the blend image, with texture
/// coordinates running from (0, 0) to (1, 1) across it.
void poissonDisplay()
{
	// bring the blend PBO up to date before drawing
	runPoissonKernel();

	const int right = static_cast<int>(s_blend->getWidth());
	const int top   = static_cast<int>(s_blend->getHeight());

	// quad corners, listed in the order they are submitted
	const float tex_u[4] = { 0.0f, 1.0f, 1.0f, 0.0f };
	const float tex_v[4] = { 0.0f, 0.0f, 1.0f, 1.0f };
	const int   pos_x[4] = { 0, right, right, 0 };
	const int   pos_y[4] = { 0, 0, top, top };

	// draw the blended texture
	s_blend->bindTexture();
	glBegin(GL_QUADS);
	for (int corner = 0; corner < 4; ++corner) {
		glTexCoord2f(tex_u[corner], tex_v[corner]);
		glVertex2i(pos_x[corner], pos_y[corner]);
	}
	glEnd();
}
