#include <stdio.h>
#include <math.h>

#include "cuda_err.hh"
#include <cutil.h>
#include <cutil_inline.h>


// Simple kernel that writes a colour gradient into a uchar4 array.
//
// Each thread handles the pixel whose flat, row-major index equals its
// global thread index (blockIdx.x * blockDim.x + threadIdx.x), so the
// kernel expects a 1D grid of 1D blocks. The grid may contain more threads
// than pixels; the surplus threads return without touching memory.
//
//   pos    - output pixels; must hold at least width * height elements
//   width  - image width in pixels
//   height - image height in pixels
__global__
void kernel(uchar4* pos, unsigned int width, unsigned int height)
{
	// An empty row length would make the modulo/division below divide by zero.
	if (width == 0)
		return;

	// calculate the unique thread index (kept unsigned, like the built-in
	// index variables it is derived from)
	const unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;

	const unsigned int x = index % width;
	const unsigned int y = index / width;

	// The grid is rounded up to whole blocks, so threads in the last block
	// can map to rows past the end of the image. Testing the row (rather
	// than index < width * height) avoids overflowing a 32-bit product.
	if (y >= height)
		return;

	// TODO: make each thread output a meaningful color
	//       according to the thread pixel's membership
	//       of the Mandelbrot set
	const unsigned char r = int((x / float(width)) * 255.0f);
	const unsigned char g = int((y / float(height)) * 255.0f);
	const unsigned char b = 128;

	// Each thread writes one pixel location in the texture (texel),
	// as a single 4-byte store instead of four separate byte stores.
	pos[index] = make_uchar4(r, g, b, 0);
}

// Host-side wrapper that configures and launches `kernel` with one thread
// per pixel, then blocks until the device has finished.
//
//   pos          - output pixel buffer, passed through to the kernel
//   image_width  - image width in pixels
//   image_height - image height in pixels
//
// Errors from the launch and from the synchronisation are both passed to
// exit_on_err. An image with zero width or height is a no-op.
void launch_kernel(uchar4* pos, unsigned int image_width,
				  unsigned int image_height)
{
	// Nothing to draw. A launch with zero blocks would be rejected as an
	// invalid configuration, so return before attempting it.
	if (image_width == 0 || image_height == 0)
		return;

	const unsigned int threadsPerBlock = 256;

	// 64-bit product: width * height of two 32-bit values cannot wrap here.
	const unsigned long long totalThreads =
		(unsigned long long)image_width * image_height;

	// Ceil-division so a partially filled last block is still launched.
	const unsigned long long blocksNeeded =
		(totalThreads + threadsPerBlock - 1) / threadsPerBlock;

	// Clamp instead of truncating: a block count that does not fit in 32 bits
	// then stays too large for the device and is reported by the launch check
	// below, rather than silently wrapping to a small grid.
	const unsigned int nBlocks = (blocksNeeded > 0xFFFFFFFFull)
		? 0xFFFFFFFFu
		: (unsigned int)blocksNeeded;

	// launch kernel
	kernel<<<nBlocks, threadsPerBlock>>>(pos, image_width, image_height);

	// A kernel launch returns no status itself; configuration errors
	// (e.g. a grid that is too large) are only visible via cudaGetLastError.
	exit_on_err(cudaGetLastError());

	// make certain the kernel has completed; this also surfaces errors
	// raised while the kernel was executing.
	// NOTE(review): cudaThreadSynchronize is deprecated in favour of
	// cudaDeviceSynchronize; kept because this file still includes the
	// legacy cutil headers -- switch once the toolkit version is confirmed.
	exit_on_err(cudaThreadSynchronize());
}
