/* rl.c
 * CMUnited-97 (soccer client for Robocup-97)
 * Peter Stone <pstone@cs.cmu.edu>
 * Computer Science Department
 * Carnegie Mellon University
 * Copyright (C) 1997 Peter Stone
 *
 * CMUnited-97 was created by Peter Stone and Manuela Veloso
 *
 * You may copy and distribute this program freely as long as you retain this notice.
 * If you make any changes or have any comments we would appreciate a message.
 */

/* -*- Mode: C -*- */
#include "global.h"

/* Compile-time switches and constants for the RL receiver-choice code below. */

/* Future-value feature passed to every Q-table lookup and stored with every
   action taken in this file. */
#define DEFAULT_FUTURE 0
/* Nonzero: the receiver choosers skip pass actions and consider knocks only. */
#define ONLY_KNOCKS 1
/* Nonzero: the receiver choosers skip knock actions unless the player's
   location is of type FORWARD.  To get passes only, knocks also need to be
   inhibited in knockorpass.c. */
#define ONLY_PASSES 0
/* Nonzero: Q-tables are updated only while the player is in (and acted from)
   position 5; other players are treated as being past the learning games. */
#define POS_5_LEARN 0
/* Nonzero: CloseRewards() only hands out rewards in BEFORE_KICK_OFF mode
   (scaled by elapsed time), and LookForRewards() never times an action out. */
#define GOAL_ONLY_REWARDS 0
/* Nonzero: Q-values are stored and read back as multipliers of the action
   confidence instead of as plain averaged rewards. */
#define TRUST_CONF 0
/* Nonzero: the choosers only keep actions whose feature value ties the best
   one seen so far (the best value is capped at DT_SUCCESS_CUTOFF). */
#define BREAK_TIES_MAXQ 1
/* Threshold used by RLtest() to turn a confidence into a success/failure
   feature value, and as the cap described for BREAK_TIES_MAXQ. */
#define DT_SUCCESS_CUTOFF .734

/* 
 USE RL on teammates with confidences 

 not like real RL--don't get to keep acting after your action
 so it's really MA RL -- put the TEAM in a new state
 reinforcement comes indirectly 
 
 Assumes confidences of other actions are independent of success of this one
 Action (pass to position), conf.  ==> value 
 value depends on success of action and subsequent actions
 so mixes success, value, safety

 could add:  players, w/ DT info, give own estimate of their best value
 ==> 2 dim. state.  Train first dim. first?
*/

/* Create one list node covering the future-value range [min, max) with
 * Q-value q.  A q of exactly 0 means "nothing observed yet", so the sample
 * count and weight start at 0; any other q is counted as one observation of
 * weight 1.  The node starts out unlinked. */
FutureValueInterval::FutureValueInterval(float min, float max, float q){
  const int seeded = (q == 0) ? 0 : 1;

  FutureMin = min;
  FutureMax = max;
  QValue    = q;
  num       = seeded;
  weight    = seeded;
  Next      = NULL;
}

/* Free the rest of the list hanging off this node.
 *
 * `delete` runs the destructor of Next itself (which in turn deletes its own
 * successor), so the destructor must not also be invoked by hand: doing both
 * destroys every following node twice and double-frees the tail of the list.
 * Deleting a NULL pointer is a no-op, so no explicit check is needed. */
FutureValueInterval::~FutureValueInterval(){
  delete Next;
}

/* Split the interval that contains `min` so that a new interval starts there.
 *
 * If `min` lies below this node's upper bound, this node is shrunk to end at
 * `min` and a new node [min, old FutureMax) carrying the same Q-value is
 * linked in right after it.  Otherwise the request is forwarded down the list.
 *
 * Returns the newly created node, or NULL (after reporting via my_error) when
 * no interval in the list can hold `min`.  Every path now returns a value;
 * previously the forwarding and error paths fell off the end of a non-void
 * function. */
FutureValueInterval *FutureValueInterval::Insert(float min){
  if (min < FutureMax){
    FutureValueInterval *NewInterval = new FutureValueInterval(min,FutureMax,QValue);
    FutureMax = min; /* Shrink the current interval */
    NewInterval->Next = Next;
    Next = NewInterval;
    return NewInterval;
  }

  if (Next == NULL){
    my_error("Should be able to insert");
    return NULL;
  }

  return Next->Insert(min);
}

/* Fold a new Reward into the Q-value of the interval containing FutureValue.
 *
 * Nodes that do not contain FutureValue pass the request on to their
 * successor; running off the end of the list is reported through my_error.
 *
 * The environment is noisy, so the Q-value is a running average rather than
 * the latest reward.  Once enough samples have been seen the old value's
 * weight is capped, so the newest reward always keeps a minimum share and
 * the estimate can follow a shifting concept. */
void  FutureValueInterval::UpdateQValue(float ActionConf, float FutureValue, float Reward){

  if ( !(FutureValue >= FutureMin && FutureValue < FutureMax) ){
    /* Not ours: hand it down the list, or complain if there is no one left */
    if (Next == NULL)
      my_error("Can't fit FutureValue into an interval (UpdateQValue)");
    else
      Next->UpdateQValue(ActionConf,FutureValue,Reward);
    return;
  }

#if TRUST_CONF
  /* Rescale the reward to 0-2, then divide out the confidence factor
     (which ranges 1-2).  Q is the multiplier that would have produced the
     observed reward from that confidence. */
  Reward /= QMAX;
  Reward += 1;
  float Q = Reward/((ActionConf+1)/2 + 1);
  /* Old value weighs at most 19 samples against the new one */
  QValue = weighted_avg(QValue,Q,(num >= 20) ? 19 : num,1);
  weight += (ActionConf - MIN_ACTION_VALUE)/2;
#else
  /* Old value weighs at most 49 samples against the new one */
  QValue = weighted_avg(QValue,Reward,(num >= 50) ? 49 : num,1);
#endif

  num++;
}

/* Look up the Q-value stored for the interval containing FutureValue.
 *
 * Walks down the list until an interval [FutureMin, FutureMax) holds the
 * value.  If none does, the problem is reported through my_error and 0 is
 * returned.
 *
 * The recursive result is now actually returned; before, the forwarding and
 * error paths ended without a return statement, so callers received an
 * indeterminate value. */
float FutureValueInterval::GetQValue(float FutureValue){

  if (FutureValue >= FutureMin && FutureValue < FutureMax)
    return QValue;

  if (Next == NULL){
    my_error("Can't fit FutureValue into an interval (GetQValue)");
    return 0;
  }

  return Next->GetQValue(FutureValue);
}

/* Write this interval and all following ones to oStream on a single line.
 * Each interval appears as "{[min, max) : q (num weight)}"; intervals are
 * separated by ", " and the last one is terminated by ".\n". */
void FutureValueInterval::Print(FILE *oStream){
  fprintf(oStream,"{[%.1f, %.1f) : %.3f (%d %.1f)}",FutureMin,FutureMax,QValue,num,weight);

  if (Next == NULL){
    /* end of the list */
    fprintf(oStream,".\n");
    return;
  }

  fprintf(oStream,", ");
  Next->Print(oStream);
}

void FutureValueInterval::Load(FILE *oStream){
  float min, max, q, w;
  int n;
  fscanf(oStream,"{[%f, %f) : %f (%d %f)}",&min,&max,&q,&n,&w);

  QValue = q;
  num = n;
  weight = w;
  if ( max != FutureMax )
    Insert(max);
  int chr1;
  if ( (char)(chr1 = getc(oStream)) != '.' ){
    ungetc(chr1,oStream);
    Next->Load(oStream);
  }
  else 
    fscanf(oStream,"\n");
}

/****************************************************************************/

/* Create a node covering the action-value range [min, max) whose future-value
 * list is one interval [-QMAX, QMAX) holding q.
 *
 * When the node spans the complete range MIN_ACTION_VALUE..MAX_ACTION_VALUE
 * it is immediately chopped into steps of .1 by repeated Insert() calls; the
 * nodes Insert() creates cover sub-ranges and therefore do not subdivide
 * again. */
ActionValueInterval::ActionValueInterval(float min, float max, float q){
  ActionMin       = min;
  ActionMax       = max;
  Next            = NULL;
  FutureValueList = new FutureValueInterval(-QMAX,QMAX,q);

  if (min != MIN_ACTION_VALUE || max != MAX_ACTION_VALUE)
    return;

  float boundary = min+.1;
  while (boundary < max){
    Insert(boundary,q);
    boundary += .1;
  }
}

/* Destroy the following nodes first, then this node's own future-value list.
 * Deleting a NULL successor is a no-op. */
ActionValueInterval::~ActionValueInterval(){
  delete Next;
  delete FutureValueList;
}

/* Start a new action-value interval at `min`, seeded with Q-value q.
 *
 * If `min` lies below this node's upper bound, this node is shrunk to end at
 * `min` and a new node [min, old ActionMax) is linked in right after it.
 * Otherwise the request is forwarded down the list.
 *
 * Returns the newly created node, or NULL (after reporting via my_error) when
 * no interval in the list can hold `min`.  Every path now returns a value;
 * previously the forwarding and error paths fell off the end of a non-void
 * function. */
ActionValueInterval *ActionValueInterval::Insert(float min, float q){
  if (min < ActionMax){
    ActionValueInterval *NewInterval = new ActionValueInterval(min,ActionMax,q);
    ActionMax = min; /* Shrink the current interval */
    NewInterval->Next = Next;
    Next = NewInterval;
    return NewInterval;
  }

  if (Next == NULL){
    my_error("Should be able to insert");
    return NULL;
  }

  return Next->Insert(min,q);
}

/* Record Reward for an action whose feature value is ActionValue.
 *
 * - ActionValue inside this node and within .1 of its lower bound: update
 *   this node's future-value list.
 * - ActionValue inside this node but further up: split the node at
 *   ActionValue, seeding the new interval with Reward (observed values always
 *   become interval bottoms).
 * - ActionValue at or above this node's upper bound: pass it on.  The last
 *   node additionally accepts ActionValue == MAX_ACTION_VALUE; anything else
 *   that reaches the end is reported through my_error. */
void  ActionValueInterval::UpdateQValue(float ActionValue, float ActionConf, float FutureValue, float Reward){

  if (ActionValue < ActionMax){
    if (ActionValue < ActionMin+.1)
      FutureValueList->UpdateQValue(ActionConf, FutureValue, Reward);
    else
      Insert(ActionValue,Reward);
    return;
  }

  if (Next != NULL){
    Next->UpdateQValue(ActionValue,ActionConf,FutureValue,Reward);
    return;
  }

  /* Last node: its upper bound is closed for the maximum value only */
  if (ActionValue == MAX_ACTION_VALUE)
    FutureValueList->UpdateQValue(ActionConf, FutureValue, Reward);
  else
    my_error("Can't fit ActionValue into an interval (UpdateQValue)");
}

/* Look up the Q-value for (ActionValue, FutureValue).
 *
 * Walks down the list to the node whose range [ActionMin, ActionMax) holds
 * ActionValue and asks its future-value list.  The last node also accepts
 * ActionValue == MAX_ACTION_VALUE.  If no node fits, the problem is reported
 * through my_error and 0 is returned.
 *
 * Fixes over the previous version: the recursive result is returned (the
 * forwarding and error paths used to end without a return statement), and the
 * diagnostic is built with snprintf so that large or garbage float values
 * cannot overrun the 100-byte message buffer. */
float ActionValueInterval::GetQValue(float ActionValue, float FutureValue){

  if (ActionValue >= ActionMin && ActionValue < ActionMax)
    return FutureValueList->GetQValue(FutureValue);

  if (Next != NULL)
    return Next->GetQValue(ActionValue,FutureValue);

  /* Last node: its upper bound is closed for the maximum value only */
  if (ActionValue == MAX_ACTION_VALUE)
    return FutureValueList->GetQValue(FutureValue);

  char tmp[100];
  snprintf(tmp,sizeof(tmp),
	   "Can't fit ActionValue %.3f into an interval (GetQValue): last (%.1f %.1f)",
	   ActionValue,ActionMin,ActionMax);
  my_error(tmp);
  return 0;
}

int ActionValueInterval::GetNum(){
  int num = 0;
  FutureValueInterval *FVL = FutureValueList;
  while (FVL != NULL){
    num += FVL->num;
    FVL = FVL->Next;
  }
  return num;
}

/* Write this node and all following ones to oStream.  Nodes without any
 * recorded sample are left out; each printed node is "[min, max) ::: "
 * followed by its future-value list.  A line holding only "." closes the
 * list. */
void ActionValueInterval::Print(FILE *oStream){
  if ( GetNum() != 0 ){
    fprintf(oStream,"[%.3f, %.3f) ::: ",ActionMin,ActionMax);
    FutureValueList->Print(oStream);
  }

  if (Next == NULL){
    /* end of the list */
    fprintf(oStream,".\n");
    return;
  }

  Next->Print(oStream);
}

void ActionValueInterval::Load(FILE *oStream){
  float min, max;
  fscanf(oStream,"[%f, %f) ::: ",&min,&max);
  Load(oStream,min,max);
}

/* Load the stored interval [min, max) whose header has already been read.
 *
 * The list is walked and, where needed, split until a node lines up with the
 * stored bounds:
 *  - min at or beyond this node's upper bound: the interval belongs further
 *    down the list;
 *  - min strictly inside this node: split at min and continue with the new
 *    node;
 *  - min at this node's lower bound but a different max: split at max.
 * The node's future-value list is then read, followed by either the closing
 * '.' or the next interval.
 *
 * Fixes over the previous version: forwarding to a successor that does not
 * exist, and hitting end-of-file where the next interval or the closing '.'
 * should be, are reported through my_error instead of dereferencing a NULL
 * Next. */
void ActionValueInterval::Load(FILE *oStream,float min,float max){

  if ( min + .00001 >= ActionMax ){
    if ( Next == NULL ){
      my_error("Loaded interval starts beyond the last ActionValue interval (Load)");
      return;
    }
    Next->Load(oStream,min,max);
    return;
  }
  else if ( min > ActionMin ){
    /* min < ActionMax here, so Insert() always links a node right after us */
    Insert(min,0);
    Next->Load(oStream,min,max);
    return;
  }
  else if ( (int)(max*10000) != (int)(ActionMax*10000) )
    Insert(max,0);

  if ( fabs(min-ActionMin)>.00001 || fabs(max-ActionMax)>.00001 ) {
    printf("(2) %f min, %f ActionMin, %f max, %f ActionMax\n",min,ActionMin,max,ActionMax);
    my_error("Mins and maxs should line up at this point");
  }

  FutureValueList->Load(oStream);

  int chr1 = getc(oStream);
  if ( chr1 == '.' ){
    /* closing marker of the interval list: swallow the end of line */
    fscanf(oStream,"\n");
    return;
  }
  if ( chr1 == EOF ){
    my_error("Unexpected end of file in action-value list (Load)");
    return;
  }

  ungetc(chr1,oStream);
  if ( Next == NULL ){
    my_error("More action-value intervals in file than in list (Load)");
    return;
  }
  Next->Load(oStream);
}

/****************************************************************************/

/* Build an empty Q-table for the given formation and position: one
 * full-range action-value list (all Q-values 0) per RL action.  The table is
 * marked as not yet loaded, and its data file name is derived from the
 * formation and position numbers. */
QTable::QTable(int form, int pos){
  sprintf(dataFileName,"rlDat/rlDat%d-%d.dat",form,pos);

  Formation  = form;
  Position   = pos;
  Loaded     = FALSE;
  MightExist = TRUE;
  NumActions = NUM_RL_ACTIONS;

  int action = 0;
  while (action < NumActions){
    Head[action] = new ActionValueInterval(MIN_ACTION_VALUE,MAX_ACTION_VALUE,0);
    action++;
  }
}

/* Release the action-value list of every action. */
QTable::~QTable(){
  int action = 0;
  while (action < NumActions){
    delete Head[action];
    action++;
  }
}

void  QTable::UpdateQTable(int position, float FeatureVal, float DTConf, float FutureValue, float Reward){
  Head[position]->UpdateQValue(FeatureVal,DTConf,FutureValue,Reward);
}

float QTable::GetQValue(int position, float DTConf, float FutureValue){
  return Head[position]->GetQValue(DTConf,FutureValue);
}

int QTable::GetNum(int action){
  int num = 0;
  ActionValueInterval *AVI = Head[action];
  while (AVI != NULL){
    num += AVI->GetNum();
    AVI = AVI->Next;
  }
  return num;
}

/* Dump the table to oStream: a formation/position header, then for every
 * action that has data a marker line (the action index repeated 16 times)
 * followed by its interval list.  The last action is always written, even
 * when it has no data. */
void QTable::Write(FILE *oStream){
  fprintf(oStream,"Formation %d\nPosition %d\n\n",Formation,Position);

  for (int i=0; i<NumActions; i++){
    if ( !(GetNum(i) || i == NUM_RL_ACTIONS-1) )
      continue;  /* nothing recorded for this action */

    fprintf(oStream,"%2d %2d %2d %2d %2d %2d %2d %2d ",i,i,i,i,i,i,i,i);
    fprintf(oStream,"%2d %2d %2d %2d %2d %2d %2d %2d\n",i,i,i,i,i,i,i,i);
    Head[i]->Print(oStream);
  }
}

/* Read a table previously produced by Write() from oStream.
 *
 * Returns TRUE and marks the table as loaded on success.  Returns FALSE when
 * the file is empty or cut short (for example while it is being rewritten),
 * when it belongs to another formation/position, or when it is malformed.
 *
 * Fixes over the previous version:
 *  - both header numbers must be read (a partial match used to leave
 *    form/pos uninitialized);
 *  - the rest of each marker line is read with a width limit so it cannot
 *    overflow junk[];
 *  - the action index read from the file is range-checked before it is used
 *    to index Head[];
 *  - a failed read inside the loop returns FALSE instead of looping forever
 *    with an unchanged index. */
int QTable::Load(FILE *oStream){

  int form,pos;
  if ( fscanf(oStream,"Formation %d\nPosition %d\n\n",&form,&pos) != 2 ){
    /* printf(".");
       fflush(stdout); */
    return FALSE;
  }

  if (Formation != form || Position  != pos ){
    printf("loading wrong form/pos: got Formation %d Position %d\n",form,pos);
    return FALSE;
  }

  int to=0;
  char junk[100];
  while ( to < NUM_RL_ACTIONS-1 ){
    /* Marker line: the action index, then the same index repeated */
    if ( fscanf(oStream,"%2d %99[^\n]\n",&to,junk) < 1 )
      return FALSE;
    if ( to < 0 || to >= NUM_RL_ACTIONS )
      return FALSE;

    int chr1 = getc(oStream);
    if ( chr1 == EOF )
      return FALSE;
    if ( chr1 != '.' ){
      /* The action has data: start from a fresh list and fill it */
      ungetc(chr1,oStream);
      delete Head[to];
      Head[to] = new ActionValueInterval(MIN_ACTION_VALUE,MAX_ACTION_VALUE,0);
      Head[to]->Load(oStream);
    }
  }

  Loaded = TRUE;
  return TRUE;
}

/****************************************************************************/

/* Allocate one empty Q-table per (formation, position) pair and reset the
 * record of the last action: nothing is pending and learning is off. */
RewardInfo::RewardInfo(){
  QActionTaken    = FALSE;
  KeepLearning    = FALSE;

  QLastFormation  = UNKNOWN_FORMATION_TYPE;
  QLastActionFrom = UNKNOWN_POSITION;
  QLastActionTo   = UNKNOWN_POSITION;
  QLastActionVal  = 0;
  QLastActionConf = 0;
  QLastFutureVal  = 0;

  for (int form=0; form<NUM_FORMATIONS; form++){
    for (int pos=0; pos<TEAM_SIZE; pos++){
      QTables[form][pos] = new QTable(form,pos);
    }
  }
}

/* Free every Q-table allocated by the constructor. */
RewardInfo::~RewardInfo(){
  for (int form=0; form<NUM_FORMATIONS; form++){
    for (int pos=0; pos<TEAM_SIZE; pos++){
      delete QTables[form][pos];
    }
  }
}

/* Q-table for the current formation and the position belonging to the
 * player's present location (not the player's assigned position). */
QTable *RewardInfo::GetMyQTable(){
  QTable *mine = GetQTable(Mem->GetCurrentFormationType(),
			   Mem->GetPositionOfMyLocation());
  return mine;
}

/* Return the Q-table for (formation, position), refreshing it from its data
 * file first when appropriate.
 *
 * The file is (re)read whenever learning is on, or when the table has not
 * been loaded yet and its file might still exist.  A file that cannot be
 * opened marks the table as having no file.  A load that fails is retried
 * from the start of the file until it succeeds.
 *
 * Fixes over the previous version: fclose() is only called on a stream that
 * was actually opened (it used to be called with NULL when the file did not
 * exist), and the retry loop stops if the file can no longer be reopened
 * instead of passing a NULL stream to Load(). */
QTable *RewardInfo::GetQTable(int formation, int position){

  QTable *TheQTable = QTables[formation][position];

  if ( Mem->KeepLearning || (!TheQTable->IsLoaded() && TheQTable->MightExist) ){
    /* If not loading, shouldn't do this every time */
    FILE *rlFile = fopen(TheQTable->dataFileName,"r");
    if ( rlFile == NULL )
      TheQTable->MightExist=FALSE;
    else {
      /* printf("%d Loading from %s\n",Mem->MyNumber,TheQTable->dataFileName); */
      while ( TheQTable->Load(rlFile) == FALSE ){
	fclose(rlFile);
	rlFile = fopen(TheQTable->dataFileName,"r");
	if ( rlFile == NULL )
	  break;  /* file vanished between attempts: give up for now */
      }
      if ( rlFile != NULL )
	fclose(rlFile);
    }
  }
  return TheQTable;
}

/* Remember the action just chosen so it can be rewarded later.
 *
 * to / val / conf / future are the action number, its feature value, its
 * confidence and its future value.  Also stored are the formation and
 * position acted from, plus the score, ball position and time at this
 * moment; these are the baseline against which CloseRewards() judges the
 * outcome.
 *
 * A still-pending action is closed first, unless it was recorded within the
 * last 20 time units, in which case it is assumed to be the same action and
 * is simply overwritten. */
void RewardInfo::SetActionState(int to, float val, float conf, float future){

  if ( QActionTaken ){
    if ( Mem->CurrentTime - Time > 20 )
      CloseRewards();  /* Finalize decision on quality of last action */
  }

  /* What was done */
  QLastActionTo   = to;
  QLastActionVal  = val;
  QLastActionConf = conf;
  QLastFutureVal  = future;

  /* Where it was done from */
  QLastFormation  = Mem->GetCurrentFormationType();
  QLastActionFrom = Mem->GetPositionOfMyLocation();

  /* Snapshot of the game at this moment */
  MyScore    = Mem->MyScore;
  TheirScore = Mem->TheirScore;
  Mem->GetBallGlobalXY(&BallX,&BallY);
  Time       = Mem->CurrentTime;

  /* The running ball-x average starts at the current ball position */
  AvgBallUpdateTime = Time;
  AvgBallX          = BallX;

  QActionTaken = TRUE;
}

/* Decide how well the last recorded action turned out, fold that reward into
 * the Q-table of the position it was taken from, and save the table to its
 * data file.  Does nothing unless learning is switched on.
 *
 * The reward depends on the play mode at the time of the call: +/-100 for
 * goals and kick-offs, smaller fixed values for goal kicks and corner kicks,
 * a ball-position-scaled value for kick-ins, and in PLAY_ON a value between
 * -10 and 10 derived from how the time-averaged ball x compares with the
 * ball x when the action was taken.  Outside PLAY_ON the reward is scaled
 * down the longer the outcome took to materialize.
 *
 * Fixes over the previous version:
 *  - `reward` starts at 0, so BEFORE_KICK_OFF without a score change and an
 *    unexpected play mode no longer feed an indeterminate value into the
 *    table;
 *  - the open-retry loop no longer calls fclose() on a NULL stream. */
void RewardInfo::CloseRewards(){

  if ( !KeepLearning )
    return;

  if ( !QActionTaken ) my_error("no action to reward");

  /* Finalize decision on quality of last action: have things improved? */
  QTable *RewardQTable = GetQTable(QLastFormation,QLastActionFrom);   

  float reward = 0;  /* neutral unless one of the cases below says otherwise */

  switch(Mem->PlayMode){
  case MY_KICK_OFF:
    reward = -100;
    break;
  case THEIR_KICK_OFF:
    reward = 100;
    break;
  case BEFORE_KICK_OFF:
    if ( Mem->MyScore > MyScore )
      reward = 100;
    else if ( Mem->TheirScore > TheirScore )
      reward = -100;
    break;
  case MY_GOAL_KICK:
    reward = -10; 
    break;
  case THEIR_GOAL_KICK:
    reward = 10; 
    break;
  case MY_CORNER_KICK:
    reward = 25; 
    break;
  case THEIR_CORNER_KICK:
    reward = -25; 
    break;
  case MY_KICK_IN:
    reward = 25*(Mem->GetBallGlobalX() + X0)/(2*X0); 
    break;
  case THEIR_KICK_IN:
    reward = -25*(X0 - Mem->GetBallGlobalX())/(2*X0); 
    break;
  case PLAY_ON:
    if ( AvgBallX > BallX )
      /* Fraction of available positive distance from starting point */
      reward = 10*(AvgBallX-BallX)/(X0-BallX);
    else /* AvgBallX <= BallX */
      /* Fraction of available negative distance from starting point */
      reward = -10*(BallX-AvgBallX)/(BallX+X0);
    break;
  default: {
    char msg[100];
    sprintf(msg,"What mode for getting reward???? (%d)",Mem->PlayMode);
    my_error(msg);
    break;
  }
  }

  if (Mem->PlayMode != PLAY_ON){
    /* Lower reward based on how long it took to get there */
    /* Full reward within the first FullRewardTime time units */
    int FullRewardTime = 50;
    if (Mem->CurrentTime - Time > FullRewardTime){
      /* not lowered beyond factor of 10 */
      /* NOTE(review): if the time values are integers this quotient is
	 truncated before being added -- confirm that is intended */
      float divisor = 1 + 
	9*(Mem->CurrentTime-Time-FullRewardTime)/(MAX_REWARD_TIME-FullRewardTime);
      reward /= divisor; /* divisor between 1 and 10 */
    }
  }

#if GOAL_ONLY_REWARDS
  if (Mem->PlayMode != BEFORE_KICK_OFF)
    return;
  if ( Mem->MyScore > MyScore )
    reward = 100;
  else if ( Mem->TheirScore > TheirScore )
    reward = -100;
  float divisor = Mem->CurrentTime - Time;
  if (divisor > 0)
    reward /= divisor;
#endif

#if POS_5_LEARN
  if (Mem->GetMyPosition()==5 && Mem->GetPositionOfMyLocation()==5
      && QLastActionFrom==5){
    ;/*printf("%d - %d reward for %d (%.3f) = %.1f\n",
	   Mem->MyNumber,Mem->CurrentTime,QLastActionTo,QLastActionVal,reward);*/
  }
  else return;
#endif

  RewardQTable->UpdateQTable(QLastActionTo,QLastActionVal,QLastActionConf,
			     QLastFutureVal,reward);

  /* Keep trying until the data file can be opened for writing */
  FILE *dataFile = fopen(RewardQTable->dataFileName,"w");
  while ( dataFile == NULL ){
    dataFile = fopen(RewardQTable->dataFileName,"w");
  }
    
  RewardQTable->Write(dataFile);
  fclose(dataFile);

  QActionTaken = FALSE;
}

/* Periodic bookkeeping while an action is waiting for its reward.
 *
 * Unless rewards are goal-only, an action that has been pending for
 * MAX_REWARD_TIME is closed.  After that the time-weighted average of the
 * ball's x position since the action was taken is brought up to date.
 * Does nothing unless learning is switched on. */
void RewardInfo::LookForRewards(){

  if ( !KeepLearning )
    return;

  if ( !QActionTaken ) my_error("no action for which to loof for rewards");

#if !GOAL_ONLY_REWARDS
  /* pending too long: settle the reward now */
  if ( Mem->CurrentTime - Time >= MAX_REWARD_TIME ) 
    CloseRewards();
#endif

  /* No time has passed since the action: nothing to average (and the
     division below would be by zero) */
  if ( Mem->CurrentTime == Time )
    return;

  /* Old average counts for the time it already covers, the current ball
     position for the time since the last update */
  float coveredPart = AvgBallX * (AvgBallUpdateTime - Time);
  float recentPart  = Mem->GetBallGlobalX() * (Mem->CurrentTime - AvgBallUpdateTime);

  AvgBallX = (coveredPart + recentPart)/(Mem->CurrentTime - Time);
  AvgBallUpdateTime = Mem->CurrentTime;
}

/****************************************************************************/

/* Map candidate `index` to its slot in the Q-table.
 * Passes use the position of the receiving player; every other action type
 * (knocks) is stored after the passes, offset by its target relative to
 * MY_GOAL. */
int GetActionNumber(int index, int *actions, int *action_types){
  if ( action_types[index] != RL_PASS_ACTION )
    return (actions[index]-MY_GOAL) + NUM_RL_PASS_ACTIONS; 

  return Mem->GetPlayerPosition(actions[index]);
}

/****************************************************************************/

/* Pick one of the candidate actions uniformly at random and return its index.
 *
 * Candidates are first filtered by the compile-time switches:
 *  - ONLY_KNOCKS drops pass actions;
 *  - ONLY_PASSES drops knock actions unless the player's location is of type
 *    FORWARD;
 *  - BREAK_TIES_MAXQ drops actions whose feature value is below the best one
 *    seen so far.  For non-FORWARD locations a strictly better value raises
 *    that bar (capped at DT_SUCCESS_CUTOFF) and, while the bar was still
 *    below the cap, discards what had been collected.
 * If nothing survives, the last action (the knock to goal) is returned. */
int ChooseReceiverRandom(int NumActions, int *actions, int *action_types, float *FeatureValues, float *Confidences){

  int   candidates[NumActions];
  int   numCandidates = 0;
  float bestVal = -20000;

  for (int i=0; i<NumActions; i++){
#if ONLY_KNOCKS
    if (action_types[i] == RL_PASS_ACTION) continue;
#endif
#if ONLY_PASSES
    if (action_types[i] == RL_KNOCK_ACTION && 
	Mem->GetMyLocationsPositionType() != FORWARD) continue;
#endif
#if BREAK_TIES_MAXQ
    if (FeatureValues[i] < bestVal) continue;
    if (FeatureValues[i] > bestVal && Mem->GetMyLocationsPositionType() != FORWARD){
      /* A better value: restart the collection unless the bar already sits
	 at the cap */
      if (bestVal < DT_SUCCESS_CUTOFF) numCandidates=0;
      bestVal = MIN(DT_SUCCESS_CUTOFF,FeatureValues[i]);
    }
#endif
    candidates[numCandidates] = i;
    numCandidates++;
  }

  if (numCandidates == 0)
    return NumActions-1; /* the knock to goal */

  return candidates[int_random(numCandidates)];
}

int ChooseReceiverMaxQ(int NumActions, int *actions, int *action_types, float *FeatureValues, float *Confidences){

  QTable *MyQTable = Mem->GetMyQTable(); 

#if 0
  int   action_index = NumActions; /* me */
  float maxQ = MyQTable->GetQValue(Mem->GetMyPosition(),
				   KNOCK_CONF,DEFAULT_FUTURE);
  /*printf("%d to self = %.1f\n",Mem->MyNumber,maxQ);*/
#endif

  int action_index = -1;
  float maxQ = -20000; /* Smaller than least possible */
  float maxVal = -20000;
  
  float Q;
  int action;
  for (int i=0; i<NumActions; i++){
#if BREAK_TIES_MAXQ
    if (FeatureValues[i] < maxVal) continue;
    //else if (FeatureValues[i] > maxVal){
    else if (FeatureValues[i] > maxVal && Mem->GetMyLocationsPositionType() != FORWARD){
      /* Reset to only consider actions with the highest value */
      if (maxVal < DT_SUCCESS_CUTOFF){
	maxQ = -20000;
	action_index = -1;
      }
      maxVal = MIN(DT_SUCCESS_CUTOFF,FeatureValues[i]);
    }
#endif
#if ONLY_KNOCKS
    if (action_types[i] != RL_KNOCK_ACTION)
      continue;
#endif
#if ONLY_PASSES
    if (Mem->GetMyLocationsPositionType() != FORWARD &&
	action_types[i] != RL_PASS_ACTION)
      continue;
#endif
    action = GetActionNumber(i,actions,action_types);
    Q = MyQTable->GetQValue(action,FeatureValues[i],DEFAULT_FUTURE);
#if TRUST_CONF
    //Q += QMAX;  /* So not negative -- done to all of them (0-200) */
    //Q *= (Confidences[i]+1)/2;
    Q *= ((Confidences[i]+1)/2 + 1);  /* so ranges from 1-2 */
    //Q *= ((Confidences[i]+1)/4 + 1);
    /* Result ranges 0-4, so not actual reward, but should correlate */
#endif
    if ( Q >= maxQ ){
      maxQ = Q;
      action_index = i;
    }
    /* printf("%d to %d (conf %.1f, val %.1f) = %.1f (was %.1f) action_index = %d\n",
	   MyQTable->Position,action,Confidences[i],FeatureValues[i],Q,
	   MyQTable->GetQValue(action,FeatureValues[i],DEFAULT_FUTURE),
	   action_index); */
  }
  if (action_index == -1){
    // my_error("Should have found SOME best action"); /* no passes */
    return ChooseReceiverRandom(NumActions,actions,action_types,
				FeatureValues,Confidences);
  }

#if BREAK_TIES_MAXQ
  /*printf("%d (%d) at %d options: ",Mem->MyNumber, MyQTable->Position, Mem->CurrentTime);
  for (int i =0; i<NumActions; i++){
    if (FeatureValues[i] == maxVal) 
      printf("%d ",GetActionNumber(i,actions,action_types));
  }
  printf("\n");*/
#endif
  
  return action_index;
}

/* Choose an action at random with probability p and by highest Q-value
 * otherwise.
 *
 * p shrinks with the number of games played (CurrentTime/GAME_LENGTH):
 * linearly from 1 to .5 over games 0-40, to .1 at game 80, to .01 at game
 * 120, and stays at .01 from then on.  With POS_5_LEARN, players not in
 * position 5 are treated as being 160 games further along. */
int ChooseReceiverRandomized(int NumActions, int *actions, int *action_types, float *FeatureValues, float *Confidences){

  int game = Mem->CurrentTime/GAME_LENGTH;
#if POS_5_LEARN
  if (Mem->GetMyPosition()!=5 || Mem->GetPositionOfMyLocation()!=5)
    game+=160;
#endif

  float p;
  if ( game>120 )
    p = .01;
  else if ( game>80 )
    p = .1 - .09*(game-80)/40;
  else if ( game>40 )
    p = .5 - .4*(game-40)/40;
  else
    p = 1 - .5*game/40;

  if ( range_random(0,1) <= p ){
    /* explore */
    return ChooseReceiverRandom(NumActions,actions,action_types,FeatureValues,Confidences);
  }

  /* exploit */
  return ChooseReceiverMaxQ(NumActions,actions,action_types,FeatureValues,Confidences);
}

/****************************************************************************/

/* Heuristic confidence for an action, based on opponents near the line from
 * the player to the action's destination.
 *
 * The destination is the teammate (for a pass) or the marker (otherwise,
 * with its distance capped at 40).  Starting from 2, the value is multiplied
 * by .8 * distFromLine / min(30, distAlongLine) for every valid opponent that
 * lies ahead of the player, no further than the destination, at most 30 off
 * the line, and no further off the line than along it.  Opponent 1 is
 * ignored when the destination coincides with THEIR_GOAL (the goalie, for
 * shots).  The result is that product minus 1, i.e. 1 when no opponent
 * interferes. */
float GetHeuristicConfidence(int action, int action_type){

  float destR, destTheta;
  if (action_type != RL_PASS_ACTION){ /* RL_KNOCK_ACTION */
    destR = MIN(40,Mem->GetMarkerDistance(action));
    destTheta = Mem->GetMarkerAngle(action);
  }
  else{
    destR = Mem->GetTeammateDistance(action);
    destTheta = Mem->GetTeammateAngle(action);
  }

  float result = 2;
  for (int opp=1; opp<=TEAM_SIZE; opp++){
    if (!Mem->OpponentValid(opp)) continue;

    /* Special case to ignore goalie for shots */
    if (opp==1 &&
	Mem->MarkerValid(THEIR_GOAL) &&
	destR == Mem->GetMarkerDistance(THEIR_GOAL) &&
	destTheta  == Mem->GetMarkerAngle(THEIR_GOAL) ) continue;

    /* Opponent position relative to the line towards the destination */
    float angleDiff = Mem->GetOpponentAngle(opp) - destTheta;
    CleanAngle(&angleDiff);
    angleDiff = fabs(deg_to_rad(angleDiff));
    float distAlongLine = Mem->GetOpponentDistance(opp) * cos(angleDiff);
    float distFromLine  = Mem->GetOpponentDistance(opp) * sin(angleDiff);

    /* Beyond the destination, behind the player, or too far off the line:
       this opponent does not matter */
    if (distAlongLine > destR ||
	distAlongLine < 0 ||
	distFromLine  > 30 ||
	distFromLine > distAlongLine) continue;

    result *= .8*(distFromLine/(MIN(30,distAlongLine)));
  }

  return result - 1;
}

/****************************************************************************/

/* Choose which of the NumActions candidate actions to take and, when the
 * player can actually kick and learning is on, record the choice so it can be
 * rewarded later.
 *
 *   actions      - per candidate, the target id handed to
 *                  GetHeuristicConfidence() and GetActionNumber()
 *   action_types - per candidate, RL_PASS_ACTION or RL_KNOCK_ACTION
 *   Confidences  - per candidate confidence; overwritten below with the
 *                  heuristic confidence of each action
 *
 * Returns the index (into actions[]) of the chosen candidate.
 *
 * Side effect: sets Mem->KeepLearning from the current game number.
 */
int RLtest(int NumActions, int *actions, int *action_types, float *Confidences){
  /* don't alter actions or confidences */
  /* Only alter Q's when you're training (players in fixed positions) */
  /* Start by loading 'em up for your position */
  /* If position switching, load for all */
  /* Take an action, return it, and then start looking for rewards */
  /* code for don't pass: me the receiver, passsuccessconf = 0 */

  /* NOTE(review): HConfidences is never written; the loop below stores the
     heuristic values into Confidences[] instead (despite the "don't alter"
     note above).  HConfidences is only read by the disabled debug output near
     the end of this function -- confirm which array was meant. */
  float HConfidences[NumActions];
  for (int i=0; i<NumActions; i++)
    Confidences[i] = GetHeuristicConfidence(actions[i],action_types[i]);

  /* Feature values start out as a copy of the confidences */
  float FeatureValues[NumActions];
  for (int i=0; i<NumActions; i++)
    FeatureValues[i] = Confidences[i];

#if 0
  /* Used for 10-27 values */
  for (int i=0; i<NumActions; i++)
    FeatureValues[i] = MIN_ACTION_VALUE;
#endif

#if 1
  /* Only care if it's a success or failure: reduce each feature value to
     0.45 (at or above the cutoff) or -0.45 (below it).  The static counters
     only feed the occasional progress printout. */
  static int greater=0, less=0;
  for (int i=0; i<NumActions; i++){
    if ( FeatureValues[i] >= DT_SUCCESS_CUTOFF ){
      FeatureValues[i] = 0.45;
      greater++;
    }
    else{
      FeatureValues[i] = -0.45;
      less++;
    }
    if (!((greater+less)%30000))
      printf("%d : %d greater: %d, less: %d\n",Mem->MyNumber, Mem->CurrentTime,
	     greater,less);
  }
#endif

#if 0
  /* 2 states-- one for right-client, one for left                    */
  /* based on #opps on each side                                      */
  /* Actually 3 --and didn't correlate very well with opp client type */
  int opp_r=0, opp_l=0;
  for (int i=1; i<=TEAM_SIZE; i++){
    if ( !Mem->OpponentValid(i) )
      continue;
    if (Mem->GetOpponentGlobalY(i) > 0)
      opp_l++;
    else if (Mem->GetOpponentGlobalY(i) < 0)
      opp_r++;
  }
  float val = 0;
  if (opp_l > opp_r) val = .45;
  else if (opp_r > opp_l) val = -.45;
  for (int i=0; i<NumActions; i++){
      FeatureValues[i] = val;
      /* printf("%.1f ",FeatureValues[i]); */
  }
  /* printf("(%d)\n",Mem->CurrentTime); */
#endif

#if 0
  /* 2 states-- one for right-client, one for left                    */
  /* based on avg opponent y position                                 */
  /* Actually 3 --and didn't correlate very well with opp client type */
  float opp_y=0,x,y;
  for (int i=1; i<=TEAM_SIZE; i++){
    if ( Mem->GetOpponentGlobalSetTime(i) == 0)
      continue;
    Mem->GetOpponentLastGlobalXY(i,&x,&y);
    opp_y+=y;
  }
  float val = 0;
  if (opp_y > 0) val = .45;
  else if (opp_y < 0) val = -.45;
  for (int i=0; i<NumActions; i++){
      FeatureValues[i] = val;
      /* printf("%.1f ",FeatureValues[i]); */
  }
  /* printf("(%d)\n",Mem->CurrentTime); */
#endif

#if 0
  /* One DT value for right and left clients--time cued */
  float val = (Mem->CurrentTime/3000)%2 ? .45 : -.45;
  for (int i=0; i<NumActions; i++){
      FeatureValues[i] = val;
  }
#endif

#if 0
  /* 2 DT values randomly distributed */
  for (int i=0; i<NumActions; i++)
    FeatureValues[i] = int_random(2) ? .45 : -.45;
#endif

  /* Pick the action: random with a probability that shrinks over the games
     played, highest Q-value otherwise */
  int   receiver_index    = ChooseReceiverRandomized(NumActions,actions,action_types,FeatureValues,Confidences);
  //int   receiver_index    = ChooseReceiverMaxQ(NumActions,actions,action_types,FeatureValues,Confidences);
  //int   receiver_index    = ChooseReceiverRandom(NumActions,actions,action_types,FeatureValues,Confidences);

#if 0
  static int last_max_action = -1;
  if (Mem->GetMyPosition()==5 && Mem->GetPositionOfMyLocation()==5){
    int max_index = ChooseReceiverMaxQ(NumActions,actions,action_types,FeatureValues,Confidences);
    int max_action = GetActionNumber(max_index,actions,action_types);
    printf("%d: %d taking action %d\n",Mem->MyNumber,Mem->CurrentTime,
	     GetActionNumber(receiver_index,actions,action_types));
    if (max_action != last_max_action){
      printf("%d: %d max = %d (was %d)\n",Mem->MyNumber,Mem->CurrentTime,max_action,
	     last_max_action);
	     
      Mem->GetMyQTable()->Write(stdout);
    }
    last_max_action = max_action;
  }
#endif

#if 0
  /* To always shoot */
  if ( int_random(100) ){
    receiver_index = NumActions-1;        
    actions[receiver_index] = THEIR_GOAL;
    action_types[receiver_index] = RL_KNOCK_ACTION;
    FeatureValues[receiver_index] = MIN_ACTION_VALUE;
  }
  else 
    receiver_index    = ChooseReceiverRandom(NumActions,actions,action_types,FeatureValues,Confidences);
#endif

  /* Details of the chosen action, as they will be stored for rewarding */
  int   receiver          = actions  [receiver_index];
  float feature_val       = FeatureValues[receiver_index];    
  float receiver_conf     = Confidences[receiver_index];    
  float receiver_future   = DEFAULT_FUTURE;
  int   action_number;  /* Was receiver_position */
  
 action_number = GetActionNumber(receiver_index,actions,action_types);

  if (action_number == UNKNOWN_POSITION)
    my_error("Should know position of player I'm passing to");

  int game = Mem->CurrentTime/GAME_LENGTH;
#if 0
  static int printed_game = 0;
  if (game>printed_game && Mem->MyNumber==6){
    printf("After Game %d (time %d ):\n",game,Mem->CurrentTime);
    Mem->GetQTable(Mem->GetCurrentFormationType(),5)->Write(stdout); 
    fflush(stdout);
    printed_game = game;
  }
#endif
#if POS_5_LEARN
  if (Mem->GetMyPosition()!=5 || Mem->GetPositionOfMyLocation()!=5)
    game=161;
#endif
  //game=161;
  /* Learning is on for games 0-160, off for games 161-199, and on again
     from game 200 */
  if (game<=160 || game>=200)
    Mem->KeepLearning=TRUE;
  else
    Mem->KeepLearning=FALSE;

  /* Only record the action when the ball is within kicking distance */
  if ( Mem->GetBallDistance() <= KICK_DISTANCE && Mem->KeepLearning){
    /* else can't actually act */
    /* printf("%d:%d (action %d)\n",Mem->MyNumber,Mem->CurrentTime,receiver_position);*/
    Mem->SetActionState(action_number,feature_val,receiver_conf,receiver_future);
  }

  /* Debug dump, disabled by the constant 0 in the condition */
  if (0 && Mem->MyNumber==11){
    printf("%d %d ( %d ) taking action %d (dt: %.2f heur: %.2f )\n",
	   Mem->MyNumber, Mem->CurrentTime, Mem->GetPositionOfMyLocation(),
	   action_number,receiver_conf,HConfidences[receiver_index]);
    printf("Action   DT   Heur\n");
    for (int i=0; i<NumActions; i++){
      printf("%d  %f   %f\n",GetActionNumber(i,actions,action_types),
	     Confidences[i],HConfidences[i]);
    }
    printf("[ Opps seen: ");
    for (int i=1; i<=TEAM_SIZE; i++)
      if (Mem->OpponentValid(i)) printf("%d (%.1f %.1f) : ",i,Mem->GetOpponentGlobalX(i),
					Mem->GetOpponentGlobalY(i));
    printf(" ]\n");
    printf("[ Teammates seen: ");
    for (int i=1; i<=TEAM_SIZE; i++)
      if (Mem->TeammateValid(i)) printf("%d - %d (%.1f %.1f) : ",i,
					Mem->GetPlayerPosition(i),
					Mem->GetTeammateGlobalX(i),
					Mem->GetTeammateGlobalY(i));
    printf(" ]\n");
    printf("\n\n");
  }
  
  return receiver_index;
}

/* Entry point for choosing a receiver: currently just forwards to RLtest()
 * and returns the index of the chosen action. */
int RLforReceiver(int NumActions, int *actions, int *action_types, float *Confidences){
  int chosen = RLtest(NumActions,actions,action_types,Confidences);
  return chosen;
}





