/*----------------------------------------------------------------------
  qpole.c:  Q-learning to solve the cart-pole problem

  Compile with:  cc -o qpole qpole.c -lm

  The routines for simulating the cart and pole dynamic system were
  written by Rich Sutton and Chuck Anderson.  Claude Sammut translated
  parts from Fortran to C.

  The following routines are included:
  
       main:              controls simulation iterations and implements
                          the learning system

       reset_simulator:   resets the cart and pole to a random state
			  
       cart_pole:         the cart and pole dynamics; given action and
                          current state, estimates next state

       get_box:           The cart-pole's state space is divided into 162
                          boxes.  get_box returns the index of the box into
                          which the current state is mapped.

       peek_into_policy:  Displays a portion of the policy specified by the
                          current Q-table (as described in hw4.ps)

       ---------------------------------------*/

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

/* Smaller / larger of two values.  Every argument is parenthesized so that
   expressions such as min(a & b, c) group as intended.  NOTE: an argument
   may be evaluated twice, so do not pass expressions with side effects. */
#define min(x, y)               (((x) <= (y)) ? (x) : (y))
#define max(x, y)               (((x) >= (y)) ? (x) : (y))

/* Pseudo-random float in [0, 1].  Scaled by RAND_MAX (from <stdlib.h>)
   rather than a hard-coded 2^31 - 1, which overflowed a signed int when
   written as (1 << 31) - 1 and was wrong wherever RAND_MAX differs. */
#define random01()              ((float) rand() / (float) RAND_MAX)
/* Pseudo-random value between a and b (inclusive bounds). */
#define randomab(a,b)           ((a) + random01() * ((b) - (a)))

/* get_box() partitions the state into 3 (x) * 3 (x_dot) * 6 (theta) *
   3 (theta_dot) = 162 regions, numbered 0..161. */
#define N_BOXES         162     /* Number of disjoint boxes of state space. */

/* A successful trial is one that lasts for MAX_STEPS timesteps. */
#define MAX_STEPS       100000
/* Learning is considered done after a # of consecutive successful trials */
#define GOAL_CONSEC     3
/* Or, we give up if we've gone this many trials and not learned the task. */
#define MAX_TRIALS      10000

/* Q[box][action]: one value per state box and per action, where action 0
   pushes the cart left and action 1 pushes it right. */
float Q[N_BOXES][2];            /* table of Q-values */

/* Declarations for the simulator routines defined later in this file, so
   that main does not rely on implicit function declarations.  They are
   intentionally non-prototype declarations: get_box takes old-style float
   parameters, which receive their arguments after default promotion. */
int cart_pole();
int get_box();
int reset_simulator();

/*----------------------------------------------------------------------
   main:  Reads a random number seed, then runs the act/observe/learn
   loop.  A trial ends when get_box reports failure (negative box) or
   when the pole has been balanced for MAX_STEPS steps.  The loop stops
   after GOAL_CONSEC consecutive successful trials or after MAX_TRIALS
   trials, whichever comes first.

   Returns 0 normally, 1 if no seed could be read.

   The three "YOUR CODE HERE" markers are the parts of the exercise that
   are still to be written; the file does not compile until they are
   replaced.
----------------------------------------------------------------------*/
int main(void)
{
  float x,			/* cart position, meters */
        x_dot,			/* cart velocity */
        theta,			/* pole angle, radians */
        theta_dot;		/* pole angular velocity */

  float reward;
  int box, newbox, i, a, steps, trials=0, consecutive_successes=0;

  printf("Random number seed? ");
  /* Without this check a failed read would seed with an uninitialized int. */
  if (scanf("%d",&i) != 1) {
    fprintf(stderr, "Could not read an integer random number seed.\n");
    return 1;
  }
  srand((unsigned) i);

  /*--- Initialize Q-table ---*/
  YOUR CODE HERE ;
  
  /*--- Initialize the first trial. ---*/
  reset_simulator(&x, &x_dot, &theta, &theta_dot, &box);
  steps = 0;
  
  /*--- Iterate through the action-learn loop. ---*/
  while (consecutive_successes < GOAL_CONSEC && trials < MAX_TRIALS) {

    /*--- Choose action ---*/
    a = YOUR CODE HERE ;  /* 0 for left or 1 for right */

    /*--- Apply action to the simulated cart-pole ---*/
    cart_pole(a, &x, &x_dot, &theta, &theta_dot);

    /*--- Get box of state space containing the resulting state. ---*/
    newbox = get_box(x, x_dot, theta, theta_dot);

    /*--- Generate the reward signal to be used for reinforcement learning---*/
    if (newbox < 0) {
      reward = -1.0;  /* pole fell, or cart moved out of range */
    }
    else {
      reward = 0.0;   /* cart & pole are ok so far */
    }
      
    /*--- Do Q-learning! ---*/
    YOUR CODE HERE ;

    /*--- Prepare for next simulation step, restarting simulation if nec. ---*/
    if (newbox < 0) {
      /*--- Failure occurred. ---*/
      consecutive_successes = 0;
      trials++;
      printf("Trial %d lasted %d steps.\n", trials, steps);
      steps = 0;
      reset_simulator(&x, &x_dot, &theta, &theta_dot, &box);
    }
    else if (steps == MAX_STEPS) {
      /*--- Survived a full-length trial. ---*/
      consecutive_successes++;
      trials++;
      printf("Trial %d successfully balanced the pole for %d steps.\n",
	     trials, steps);
      steps = 0;
      reset_simulator(&x, &x_dot, &theta, &theta_dot, &box);
    }      
    else {
      /*--- Not a failure, continue this trial! ---*/
      box = newbox;
      steps++;
    }
  } /* end of while loop */

  /* Test the success criterion first: the goal can be reached on the very
     trial that also exhausts MAX_TRIALS, and that run counts as a success. */
  if (consecutive_successes >= GOAL_CONSEC)
    printf("Pole balanced successfully %d consecutive trials!\n",
	   consecutive_successes);
  else
    printf("Pole not balanced. Stopping after %d trials.\n",trials);

  return 0;
}


/*----------------------------------------------------------------------
   cart_pole:  Takes an action (0 or 1) and the current values of the
 four state variables and updates their values by estimating the state
 TAU seconds later.
----------------------------------------------------------------------*/

/*** Parameters for simulation ***/

#define GRAVITY 9.8
#define MASSCART 1.0
#define MASSPOLE 0.1
#define TOTAL_MASS (MASSPOLE + MASSCART)
#define LENGTH 0.5		  /* actually half the pole's length */
#define POLEMASS_LENGTH (MASSPOLE * LENGTH)
#define FORCE_MAG 10.0
#define TAU 0.02		  /* seconds between state updates */
#define FOURTHIRDS 1.3333333333333


cart_pole(action, x, x_dot, theta, theta_dot)
int action;
float *x, *x_dot, *theta, *theta_dot;
{
    float xacc,thetaacc,force,costheta,sintheta,temp;

    force = (action>0)? FORCE_MAG : -FORCE_MAG;
    costheta = cos(*theta);
    sintheta = sin(*theta);

    temp = (force + POLEMASS_LENGTH * *theta_dot * *theta_dot * sintheta)
		         / TOTAL_MASS;

    thetaacc = (GRAVITY * sintheta - costheta* temp)
	       / (LENGTH * (FOURTHIRDS - MASSPOLE * costheta * costheta
                                              / TOTAL_MASS));

    xacc  = temp - POLEMASS_LENGTH * thetaacc* costheta / TOTAL_MASS;

/*** Update the four state variables, using Euler's method. ***/

    *x  += TAU * *x_dot;
    *x_dot += TAU * xacc;
    *theta += TAU * *theta_dot;
    *theta_dot += TAU * thetaacc;
}


/*----------------------------------------------------------------------
   get_box:  Given the current state, returns a number from 0 to 161
  designating the region of the state space encompassing the current state.
  Returns a value of -1 if a failure state is encountered.
----------------------------------------------------------------------*/

#define one_degree 0.0174532	/* 2pi/360 */
#define six_degrees 0.1047192
#define twelve_degrees 0.2094384
#define fifty_degrees 0.87266

/*
 * Map a state onto its box index.
 *
 * Returns -1 when |x| > 2.4 or |theta| > twelve_degrees (failure);
 * otherwise returns
 *
 *     xi + 3*vi + 9*ti + 54*wi        (0..161)
 *
 * where xi (0..2), vi (0..2), ti (0..5) and wi (0..2) are the sub-range
 * indices of x, x_dot, theta and theta_dot respectively.
 *
 * The old-style parameter declaration is kept on purpose: callers earlier
 * in this file call get_box with no prototype in scope, so the arguments
 * arrive after default promotion.  A prototype with float parameters
 * would not be compatible with those calls.
 */
int get_box(x,x_dot,theta,theta_dot)
float x,x_dot,theta,theta_dot;
{
  int box;

  /* Cart off the track or pole past the failure angle. */
  if (x < -2.4 || x > 2.4 ||
      theta < -twelve_degrees || theta > twelve_degrees) {
    return -1;
  }

  /* Cart position: 3 ranges, weight 1. */
  if (x < -0.8) {
    box = 0;
  } else if (x < 0.8) {
    box = 1;
  } else {
    box = 2;
  }

  /* Cart velocity: 3 ranges, weight 3. */
  if (x_dot < -0.5) {
    box += 0;
  } else if (x_dot < 0.5) {
    box += 3;
  } else {
    box += 6;
  }

  /* Pole angle: 6 ranges, weight 9. */
  if (theta < -six_degrees) {
    box += 0;
  } else if (theta < -one_degree) {
    box += 9;
  } else if (theta < 0) {
    box += 18;
  } else if (theta < one_degree) {
    box += 27;
  } else if (theta < six_degrees) {
    box += 36;
  } else {
    box += 45;
  }

  /* Pole angular velocity: 3 ranges, weight 54. */
  if (theta_dot < -fifty_degrees) {
    box += 0;
  } else if (theta_dot < fifty_degrees) {
    box += 54;
  } else {
    box += 108;
  }

  return box;
}

/*----------------------------------------------------------------------
   reset_simulator:  Sets the cart and pole to a random state.
   Also calls get_box to set the box # of this new state.
----------------------------------------------------------------------*/
/*
 * Start a new trial from a random state.
 *
 *   x, x_dot, theta, theta_dot
 *          out: receive a random value between -0.8 and 0.8,
 *          -0.5 and 0.5, -six_degrees and six_degrees, and
 *          -0.87 and 0.87 respectively.
 *   box    out: receives get_box() of the new state.
 *
 * Always returns 0.  The return type stays int because callers earlier in
 * this file invoke reset_simulator before this definition is seen.
 */
int reset_simulator(float *x, float *x_dot, float *theta, float *theta_dot,
                    int *box)
{
  /* One random draw per variable, in this order, so that a given seed
     keeps producing the same sequence of start states. */
  *x = randomab(-0.8,0.8);
  *x_dot = randomab(-0.5,0.5);
  *theta = randomab(-six_degrees,six_degrees);
  *theta_dot = randomab(-0.87,0.87);

  *box = get_box(*x, *x_dot, *theta, *theta_dot);

  return 0;
}


/* Greedy action for box i as a character: '0' when Q[i][0] is strictly
   larger than Q[i][1], otherwise '1' (ties therefore print as '1'). */
#define Winner(i) ((Q[(i)][0] > Q[(i)][1]) ? '0' : '1')

/*
 * Print one line showing the greedy action for two slices of the Q-table.
 *
 * Using get_box's numbering (x index + 3 * x_dot index + 9 * theta index
 * + 54 * theta_dot index), the first bracket covers the boxes with x index
 * 1 and x_dot index 1 (base 4) and the second those with x index 2 and
 * x_dot index 2 (base 8).  Inside a bracket each group of six characters
 * steps through the theta ranges (+9 per step) and the three groups step
 * through the theta_dot ranges (+54 per group).
 *
 * Always returns 0.
 */
int peek_into_policy(void) {
  printf("0,- [%c%c%c%c%c%c %c%c%c%c%c%c %c%c%c%c%c%c]       +2,===> [%c%c%c%c%c%c %c%c%c%c%c%c %c%c%c%c%c%c]\n",

    /* base 4: boxes 4 + 9*k + 54*j */
    Winner(4),Winner(13),Winner(22),Winner(31),Winner(40),Winner(49),
    Winner(58),Winner(67),Winner(76),Winner(85),Winner(94),Winner(103),
    /* The last entry is 148 + 9 = 157; it was previously 159, which
       belongs to a different (x, x_dot) slice. */
    Winner(112),Winner(121),Winner(130),Winner(139),Winner(148),Winner(157),

    /* base 8: boxes 8 + 9*k + 54*j */
    Winner(8),Winner(17),Winner(26),Winner(35),Winner(44),Winner(53),
    Winner(62),Winner(71),Winner(80),Winner(89),Winner(98),Winner(107),
    Winner(116),Winner(125),Winner(134),Winner(143),Winner(152),Winner(161) );

  return 0;
}
