/* rl.h
 * CMUnited-97 (soccer client for Robocup-97)
 * Peter Stone <pstone@cs.cmu.edu>
 * Computer Science Department
 * Carnegie Mellon University
 * Copyright (C) 1997 Peter Stone
 *
 * CMUnited-97 was created by Peter Stone and Manuela Veloso
 *
 * You may copy and distribute this program freely as long as you retain this notice.
 * If you make any changes or have any comments we would appreciate a message.
 */

#ifndef _RL_H_
#define _RL_H_

#include "position.h"

/* NOTE(review): presumably a compile-time switch that enables the
   reinforcement-learning code paths -- confirm where it is tested */
#define USE_RL 1
/* NOTE(review): presumably an upper bound on Q-values -- confirm against
   its uses in the implementation */
#define QMAX 100
/* max time before closing rewards
   NOTE(review): units are not shown in this header (presumably simulator
   cycles) -- confirm */
#define MAX_REWARD_TIME 300
/* confidence of knock/dribble/clear action from knockorpass.c */
#define KNOCK_CONF 0

/****************************************************************************/

/* FutureValueInterval stores an interval of future values reported by
   the receiver, together with the associated Q-value.  It is the second
   (inner) dimension of the two-dimensional Q-value function.
   ActionValueInterval is the top level; each one points to a list of
   FutureValueIntervals that form a disjoint covering.
*/
class FutureValueInterval{
public:
   /* NOTE(review): presumably creates a node covering the future-value
      range min..max with initial Q-value q -- the constructor body is not
      in this header, confirm there */
   FutureValueInterval(float min, float max, float q);
   ~FutureValueInterval();

  /* Insert a new entry keyed by min; returns a pointer to the new entry */
  FutureValueInterval *Insert(float min);     

  /* NOTE(review): presumably folds Reward into the Q-value of the interval
     that contains FutureValue, using ActionConf in the weighting -- confirm
     in the implementation */
  void  UpdateQValue(float ActionConf, float FutureValue, float Reward);
  /* NOTE(review): presumably returns the Q-value stored for the interval
     that contains FutureValue -- confirm in the implementation */
  float GetQValue(float FutureValue);

  /* Stream output / input of the interval data.
     NOTE(review): Load's parameter is named oStream although it is
     presumably an input stream.  FILE is not declared by any include
     visible in this header other than "position.h" -- presumably it is
     provided through that header; confirm. */
  void  Print(FILE *oStream);
  void  Load (FILE *oStream);

  float FutureMin;            /* Lower bound of this future-value interval */
  float FutureMax;            /* Upper bound of this future-value interval */
  FutureValueInterval *Next;  /* Next interval in the list */
  int   num;  /* Number of times executed (with reward received) */
  float weight;  /* Amount of weight in weighted average of Qvalues */
private:
  /* Q-value for this interval; outside code goes through
     GetQValue / UpdateQValue */
  float QValue;
};

/****************************************************************************/

/* Bounds of the action-value range.  The negative bound is parenthesized
   so the macro always expands to a single primary expression, whatever
   operators surround it at the point of use. */
#define MIN_ACTION_VALUE (-1)
#define MAX_ACTION_VALUE 1

/* ActionValueInterval should also be a disjoint covering.  Each entry 
   points to a FutureValueInterval list with the associated Q values
*/
class ActionValueInterval{
public:
   /* NOTE(review): presumably creates a node covering the action-value
      range min..max whose future-value list starts with Q-value q -- the
      constructor body is not in this header, confirm there */
   ActionValueInterval(float min, float max, float q);
   ~ActionValueInterval();

  /* Insert a new entry keyed by min; returns a pointer to the new entry */
  ActionValueInterval *Insert(float min, float q);     

  /* NOTE(review): presumably locates the interval containing ActionValue
     and forwards the update to its FutureValueInterval list -- confirm in
     the implementation */
  void  UpdateQValue(float ActionValue, float ActionConf, float FutureValue, float Reward);
  /* NOTE(review): presumably looks up the Q-value for the
     (ActionValue, FutureValue) pair -- confirm in the implementation */
  float GetQValue(float ActionValue, float FutureValue);
  int   GetNum();  /* Total number of examples in interval */

  /* Stream output / input of the interval data.
     NOTE(review): both Load overloads name their parameter oStream although
     it is presumably an input stream; the second overload additionally
     takes the min/max bounds from the caller -- confirm how they are used */
  void  Print(FILE *oStream);
  void  Load (FILE *oStream);
  void  Load (FILE *oStream,float min,float max);

  float ActionMin;            /* Lower bound of this action-value interval */
  float ActionMax;            /* Upper bound of this action-value interval */
  ActionValueInterval *Next;  /* Next interval in the list */
private:
  /* FutureValueInterval list holding the Q values for this action interval */
  FutureValueInterval *FutureValueList;
};

/****************************************************************************/

/* Head[i] points to an ordered disjoint covering of the interval [-1,1)
   which represents the range of possible DT outputs.  For each range
   (possibly treated as single values), there is another disjoint covering
   of the possible future values that might have been communicated back 
   from the player playing position i.  In the simplest case, it will be a 
   single interval holding the Q value for the DT confidence.
*/

/* Players can pass to any player (one pass action per team member) or
   knock the ball (one knock action per edge marker).  TEAM_SIZE and
   NUM_EDGE_MARKERS are defined outside this header. */
/* Action-type code for a pass */
#define RL_PASS_ACTION  0
#define NUM_RL_PASS_ACTIONS TEAM_SIZE
/* Action-type code for a knock */
#define RL_KNOCK_ACTION 1
#define NUM_RL_KNOCK_ACTIONS NUM_EDGE_MARKERS
/* Total number of actions: all passes plus all knocks */
#define NUM_RL_ACTIONS (NUM_RL_PASS_ACTIONS + NUM_RL_KNOCK_ACTIONS)

/* QTable holds one ActionValueInterval list per RL action (Head[]),
   together with the formation/position it was constructed for and the
   bookkeeping needed to write it to / load it from a data file.
   NOTE(review): FILE and stdout are used here but no standard I/O header is
   included directly by this file -- presumably they arrive through
   "position.h"; confirm. */
class QTable{
public:
  /* NOTE(review): form / pos presumably initialize Formation / Position --
     the constructor body is not in this header, confirm there */
  QTable(int form, int pos);
  ~QTable();

  /* Update current entry, or add new entry */
  void  UpdateQTable(int position, float FeatureVal, float DTConf, float FutureValue, float Reward);
  /* NOTE(review): presumably looks up the Q-value stored for the given
     action/position index, DT confidence and future value -- confirm */
  float GetQValue(int position, float DTConf, float FutureValue);
  /* NOTE(review): presumably the number of examples recorded for the given
     action -- confirm in the implementation */
  int   GetNum(int action);

  /* Write the table to oStream */
  void  Write(FILE *oStream);
  /* Convenience wrapper: write the table to standard output */
  void  Print() { Write(stdout); }
  /* NOTE(review): the parameter is named oStream although it is presumably
     an input stream; the meaning of the int result is defined in the
     implementation -- confirm */
  int   Load (FILE *oStream);

  /* Returns the Loaded flag.  const-qualified so that it can also be
     queried through a const QTable; it only reads state. */
  int IsLoaded() const { return Loaded; }

  int                 Formation;
  int                 Position;
  /* NOTE(review): fixed 30-byte buffer -- whoever fills it must bound the
     write to sizeof(dataFileName) */
  char                dataFileName[30];
  int                 MightExist;
private:
  int                 NumActions;
  int                 Loaded;
  /* One interval list per RL action */
  ActionValueInterval *Head[NUM_RL_ACTIONS];
};

/****************************************************************************/

/* RewardInfo owns the table of QTable pointers (one per formation and
   position) and remembers the last Q action taken together with the game
   state at the moment it was taken. */
class RewardInfo{
public:
  RewardInfo();
  ~RewardInfo();
  
  /* NOTE(review): presumably the QTable for the formation/position this
     player is currently in -- confirm in the implementation */
  QTable *GetMyQTable();
  /* NOTE(review): presumably the entry of QTables[][] for the given
     formation and position -- confirm in the implementation */
  QTable *GetQTable(int formation, int position);

  /* NOTE(review): presumably records the action just taken (target,
     feature value, confidence, future value) into the QLast* fields --
     confirm in the implementation */
  void   SetActionState(int to, float val, float conf, float future);
  /* NOTE(review): reward bookkeeping for the recorded action; see
     MAX_REWARD_TIME ("max time before closing rewards").  Exact behavior is
     defined in the implementation. */
  void   CloseRewards();
  void   LookForRewards();

  int    QActionTaken;    /* TRUE/FALSE -- look for rewards? */
  /* NOTE(review): presumably a flag telling whether the Q-tables are still
     being updated -- confirm */
  int    KeepLearning;
private:
  /* One QTable pointer per (formation, position) pair */
  QTable           *QTables[NUM_FORMATIONS][TEAM_SIZE];

  int    QLastFormation;  /* Formation we were in            */
  int    QLastActionFrom; /* Position I was playing          */
  int    QLastActionTo;   /* Action Taken                    */
  float  QLastActionVal;  /* Feature value for that action   */
  float  QLastActionConf; /* Confidence value for that action*/
  float  QLastFutureVal;  /* Future returned from receiver   */

  int   MyScore;     /* Values when the action is taken */
  int   TheirScore;
  float BallX,BallY;
  int   Time;

  /* NOTE(review): presumably a running average of the ball's X position
     and the time it was last updated -- confirm */
  float AvgBallX;
  int   AvgBallUpdateTime;
};

/****************************************************************************/

/* NOTE(review): the implementation is not in this header.  Judging by the
   signature, the caller supplies NumActions candidate actions in the
   parallel arrays actions / action_types / Confidences (action_types
   presumably holding RL_PASS_ACTION or RL_KNOCK_ACTION) and an int is
   returned -- confirm in the implementation what the return value denotes
   (e.g. chosen index vs. action id). */
int RLforReceiver(int NumActions, int *actions, int *action_types, float *Confidences);

#endif
