//
// Lexer.java: Main lexer class.
// 
// Author: Mosur Ravishankar
// Date: 18 Mar 1998
//


// This partial implementation of the lexer demonstrates the use of:
//    1. abstract classes and methods
//    2. polymorphism (invoking the nextState method of the individual
//       FSM state classes).
//    3. exceptions
//    4. interfaces
// 
// The Lexer is implemented as an FSM.  Each distinct lexer FSM state is a subclass
// of the base abstract class LexerState.
// The set of FSM states includes SpaceState (that gobbles up adjacent whitespaces),
// internal states that the lexer may be in while gathering a token, and a FinalState
// when a token has been assembled.  Each concrete state overrides the abstract
// nextState method of the base class to implement its state transition function.
// The method returns the next (destination) state given the current character input.
// 
// For each getToken() call, the lexer starts from a new FSM search (from an implicit
// initial state).  If this is SpaceState, it keeps calling nextState (i.e., making
// state transitions) until a FinalState has been reached indicating the end of white
// space region.  Once a non-space token has been started, it keeps calling nextState
// until the current state is a FinalState, indicating the end of the token.  It then
// bundles up the token type and the associated character string (lexeme) into a Token
// object.  Since the FSM does not distinguish keywords from identifiers, the lexer
// needs to additionally convert to keyword token type if necessary.  The token
// object is then returned.
// 
// Again, note that lexer states are implemented as subclasses of the base abstract class
// LexerState.  Each of them overrides the abstract method nextState.  The lexer class
// does not care about the individual states.  Its curState variable is simply the
// base class type.  However, when curState.nextState() is invoked, polymorphism goes
// to work and the nextState transition function of the appropriate subclass is
// actually invoked.  Thus the structure of the lexer and state transition function
// implementation is very simple.

// The getToken function can throw 4 types of exceptions:
//   IOException: Cannot do much about these anyway.
//   EOFException: "Normal" exception on EOF, to be caught by caller.
//   InputException: Illegal input to lexer, to be caught by caller.
//   LexerStateException: Internal error (bug!).  This should never happen.

// Finally, note that the lexer class implements the interface PushBackIntf.
// As we know, the state transition functions need one character lookahead to determine
// the end of token and to transition to the final state.  Sometimes they need to push
// back the lookahead character if it is not part of the current token being scanned.
// This main Lexer class implements the pushBack method of this interface so that the
// states can tell the lexer to push back the last character scanned.


import java.io.*;

public class Lexer implements PushBackIntf, TokenType
{
  private BufferedReader in;	// Input being read
  private int lineNo;		// Current line number
  private String line;		// Current line being processed
  private int pos;		// Position of last character scanned
  private boolean eof;		// Whether end of file reached
  private LexerState curState;	// Current state in the lexer FSM.
  
  // Return the next character from the input.  Note that the input is actually read
  // using readLine(), and individual characters from the line are returned.
  private char getNextChar() throws IOException, EOFException
    {
      if (eof)
	throw new EOFException("EOF");
      
      // Position of next character to be read
      pos++;
      
      // If not reached end of line, just return the appropriate char.
      if (pos < line.length())
	return line.charAt(pos);
      
      // If at end of line, insert a space character at the end of the line
      // (Otherwise there would be no whitespace between adjacent lines.)
      if (pos == line.length())
	return ' ';
      
      // If beyond end of line, read in a new line and return its first character.
      line = in.readLine();
      eof = (line == null) ? true : false;
      pos = -1;
      lineNo++;
      return getNextChar();	// Tail recursion.
    }
  
  // Push back one character by simply rolling back pos by one.
  public void pushBack()
    {
      pos--;
    }
  
  // Get the next token from the input
  public Token getToken ()
    throws IOException, EOFException, LexerStateException, InputException
    {
      char c;
      int spos, epos;
      String lexeme;
      Token token;
      
      if (eof)
	throw new EOFException("EOF");
      
      // Enter the appropriate new initial state for the current input character
      curState = LexerState.newState(getNextChar());

      // If entered whitespace, skip all of it.
      if (curState instanceof SpaceState) {
	do {
	  curState = curState.nextState(getNextChar(), this);
	} while (curState instanceof SpaceState);
	
	// This if should never succeed; only here for debugging.
	if ((! (curState instanceof FinalState)) ||
	    ( ((FinalState)curState).getTokenType() != T_SPACE)) {
	  throw new LexerStateException();
	}
	
	// State corresponding to first non-whitespace character found
	curState = LexerState.newState(getNextChar());
      }
      
      // Start of actual token; scan until entered final state
      spos = pos;
      while (! (curState instanceof FinalState)) {
	try {
	  c = getNextChar();
	}
	catch (EOFException e) {
	  // Insert an additional space in case EOF happened right after token.
	  // The inserted space should terminal the current token.
	  c = ' ';
	}
	curState = curState.nextState(c, this);
      }
      epos = pos;
      
      // Create a new token object for this lexeme; note that tokens cannot cross
      // a line boundary.
      token = new Token(((FinalState)curState).getTokenType(),
		       line.substring(spos, epos+1));

      // If FSM said it was an identifier, check if it is actually keyword.
      // If so, convert token type to appropriate value.
      if (token.getType() == T_IDENT)
	token.checkKeyword();
      
      return (token);
    }

  // For error reporting
  public void printLine() 
    {
      System.out.println ("Line " + lineNo + ":");

      // Print the current line
      System.out.println (eof ? "" : line);

      // Mark the current position with a ^
      if (line != null) {
	for (int i = 0; (i < pos) && (i < line.length()); i++) {
	  if (line.charAt(i) == '\t')
	    System.out.print ("\t");
	  else
	    System.out.print (" ");
	}
      }
      
      System.out.println ("^");
    }
  
  // Constructor
  public Lexer(BufferedReader in)
    {
      this.in = in;
      lineNo = 0;
      line = "";	// Empty line
      pos = 0;		// last character read
      eof = false;
    }
}
