(* MinML lexer *)
(* Written as a stream transducer *)

signature LEXER =
sig
    datatype token =
        INT
      | BOOL
      | ARROW
      | TRUE
      | FALSE
      | FN
      | REC
      | FUN
      | DARROW
      | IF
      | THEN
      | ELSE
      | FI
      | LET
      | LETTYPE
      | IN
      | END
      | COMMA
      | DOT
      | LBRACK
      | RBRACK
      | LPAREN
      | RPAREN
      | SEMICOLON
      | NEGATE
      | EQUALS
      | LESSTHAN
      | TIMES
      | MINUS
      | PLUS
      | COLON
      | PERCENT
      | FLOAT
      | AFN
      | ROLL
      | UNROLL
      | PACK
      | OPEN
      | INL
      | INR
      | CASE
      | OF
      | BAR
      | FST
      | SND
      | SQRT
      | ITOF
      | ABORT
      | ALL
      | EXISTS
      | MU
      | UNIT
      | VOID
      | NUMBER of int
      | FLT of real
      | VAR of string

    val eq : token * token -> bool

    val lex : char Stream.stream -> token Stream.stream

    val tokenToString : token -> string
end;  (* signature LEXER *)


structure Lexer :> LEXER =
struct

    structure S = Stream

    exception LexError of string

    datatype token =
        INT
      | BOOL
      | ARROW
      | TRUE
      | FALSE
      | FN
      | REC
      | FUN
      | DARROW
      | IF
      | THEN
      | ELSE
      | FI
      | LET
      | LETTYPE
      | IN
      | END
      | COMMA
      | DOT
      | LBRACK
      | RBRACK
      | LPAREN
      | RPAREN
      | SEMICOLON
      | NEGATE
      | EQUALS
      | LESSTHAN
      | TIMES
      | MINUS
      | PLUS
      | COLON
      | PERCENT
      | FLOAT
      | AFN
      | ROLL
      | UNROLL
      | PACK
      | OPEN
      | INL
      | INR
      | CASE
      | OF
      | BAR
      | FST
      | SND
      | SQRT
      | ITOF
      | ABORT
      | ALL
      | EXISTS
      | MU
      | UNIT
      | VOID
      | FLT of real
      | NUMBER of int
      | VAR of string

    fun eq (FLT r, FLT r2) = Real.==(r,r2)
      | eq(NUMBER n,NUMBER n') = n = n'
      | eq(VAR s , VAR s') = s = s'
      | eq (INT,INT) = true
      | eq (BOOL, BOOL) = true
      | eq(ARROW,ARROW) = true
      | eq(TRUE,TRUE) = true
      | eq(FALSE, FALSE) = true
      | eq(FN,FN) = true
      | eq(REC,REC) = true
      | eq(FUN,FUN) = true
      | eq(DARROW,DARROW) = true
      | eq(IF,IF) = true
      | eq(THEN,THEN) = true
      | eq(ELSE,ELSE) = true
      | eq(FI,FI) = true
      | eq(LET,LET) = true
      | eq(LETTYPE,LETTYPE) = true
      | eq(IN,IN) = true
      | eq(END,END) = true
      | eq(COMMA,COMMA) = true
      | eq(DOT,DOT) = true
      | eq(LBRACK,LBRACK) = true
      | eq(RBRACK,RBRACK) = true
      | eq(LPAREN,LPAREN) = true
      | eq(RPAREN,RPAREN) = true
      | eq(SEMICOLON,SEMICOLON) = true
      | eq(NEGATE,NEGATE) = true
      | eq(EQUALS,EQUALS) = true
      | eq(LESSTHAN,LESSTHAN) = true
      | eq(TIMES,TIMES) = true
      | eq(MINUS,MINUS) = true
      | eq(PLUS,PLUS) = true
      | eq(COLON,COLON) = true
      | eq(PERCENT,PERCENT) = true
      | eq(FLOAT,FLOAT) = true
      | eq(AFN,AFN) = true
      | eq(ROLL,ROLL) = true
      | eq(UNROLL,UNROLL) = true
      | eq(PACK,PACK) = true
      | eq(OPEN,OPEN) = true
      | eq(INL,INL) = true
      | eq(INR,INR) = true
      | eq(CASE,CASE) = true
      | eq(OF,OF) = true
      | eq(BAR,BAR) = true
      | eq(FST,FST) = true
      | eq(SND,SND) = true
      | eq(SQRT,SQRT) = true
      | eq(ITOF,ITOF) = true
      | eq(ABORT,ABORT) = true
      | eq(ALL,ALL) = true
      | eq(EXISTS,EXISTS) = true
      | eq(MU,MU) = true
      | eq(UNIT,UNIT) = true
      | eq(VOID,VOID) = true
      | eq _ = false

    fun next s =
        case S.force s of
            S.Nil => raise LexError "Unexpected end of stream."
          | S.Cons result => result
   
    (* foldfamily 
     : ('a -> bool) -> ('a * 'b -> 'b) -> 'b -> ('b -> 'c) ->
       ('a * 'a stream) -> ('c * 'a stream) 

       This handles a string of characters matching a certain
       function, folding them into a result.
    *)
    fun foldfamily test base combine wrap (c, ins) =
        let
            fun ff b s =
                case next s of
                    (c, ss) =>
                        if test c then
                            ff (combine (c, b)) ss
                        else (wrap b, s)
        in
            ff (combine (c, base)) ins
        end
        
    (* Assumes ASCII-like ordering *)
    fun isnum c = ord c >= ord #"0" andalso ord c <= ord #"9"
    fun isalpha #"_" = true
      | isalpha #"'" = true
      | isalpha c = (ord c >= ord #"a" andalso ord c <= ord #"z")
        orelse      (ord c >= ord #"A" andalso ord c <= ord #"Z")
        orelse      isnum c

    fun token (#",", s) = (COMMA, s)
      | token (#"(", s) = (LPAREN, s)
      | token (#")", s) = (RPAREN, s)
      | token (#";", s) = (SEMICOLON, s)
      | token (#"~", s) = (NEGATE, s)
      | token (#".", s) = (DOT, s)
      | token (#"[", s) = (LBRACK, s)
      | token (#"]", s) = (RBRACK, s)
      | token (#"|", s) = (BAR, s)
      | token (#"=", s) = 
        (* might be equals or the start of a double arrow *)
        (case next s of
           (#">", s) => (DARROW, s)
         | _ => (EQUALS, s))
      | token (#"<", s) = (LESSTHAN, s)
      | token (#"*", s) = (TIMES, s)
      | token (#"-", s) = 
        (* might be minus or the start of an arrow *)
        (case next s of
             (#">", s) => (ARROW, s)
           | _ => (MINUS, s))
      | token (#"+", s) = (PLUS, s)
      | token (#":", s) = (COLON, s)
      | token (#"%", s) = (PERCENT, s)
      | token (c, s) =
             if isnum c then
               let
                 val (numAsStr,s') = 
                   (foldfamily 
                    (fn x => isnum x orelse x = #".")
                    []
                    op::
                    (String.concat o rev o (map str))
                    (c, s))
               in
                 (if List.exists (fn #"." => true | _ => false) (String.explode numAsStr) then
                   FLT(valOf(Real.fromString numAsStr))
                 else 
                   NUMBER(valOf(Int.fromString numAsStr)), s')
               end
             else if isalpha c then
                 (foldfamily
                  isalpha
                  ""
                  (fn (a, b) => b ^ str a)
                  VAR
                  (c, s))
             else raise LexError "illegal character"

    (* some "variables" are actually keywords *)
    and keywords (VAR("int")) = INT
      | keywords (VAR("float")) = FLOAT
      | keywords (VAR("bool")) = BOOL
      | keywords (VAR("true")) = TRUE
      | keywords (VAR("false")) = FALSE
      | keywords (VAR("fn")) = FN
      | keywords (VAR("Fn")) = AFN
      | keywords (VAR("rec")) = REC
      | keywords (VAR("fun")) = FUN
      | keywords (VAR("if")) = IF
      | keywords (VAR("then")) = THEN
      | keywords (VAR("else")) = ELSE
      | keywords (VAR("fi")) = FI
      | keywords (VAR("let")) = LET
      | keywords (VAR("lettype")) = LETTYPE
      | keywords (VAR("in")) = IN
      | keywords (VAR("end")) = END
      | keywords (VAR("roll")) = ROLL
      | keywords (VAR("unroll")) = UNROLL
      | keywords (VAR("pack")) = PACK
      | keywords (VAR("open")) = OPEN
      | keywords (VAR("inl")) = INL
      | keywords (VAR("inr")) = INR
      | keywords (VAR("case")) = CASE
      | keywords (VAR("of")) = OF
      | keywords (VAR("fst")) = FST
      | keywords (VAR("snd")) = SND
      | keywords (VAR("sqrt")) = SQRT
      | keywords (VAR("itof")) = ITOF
      | keywords (VAR("abort")) = ABORT
      | keywords (VAR("All")) = ALL
      | keywords (VAR("Exists")) = EXISTS
      | keywords (VAR("Mu")) = MU
      | keywords (VAR("unit")) = UNIT
      | keywords (VAR("void")) = VOID
      | keywords t = t

    and lex (s : char S.stream) : token S.stream =
        S.delay (fn () => lex' (S.force s))

    (* process characters, skipping whitespace *)
    and lex' S.Nil = S.Nil
      | lex' (S.Cons (#" ", s)) = lex' (S.force s)
      | lex' (S.Cons (#"\r", s)) = lex' (S.force s)
      | lex' (S.Cons (#"\t", s)) = lex' (S.force s)
      | lex' (S.Cons (#"\v", s)) = lex' (S.force s)
      | lex' (S.Cons (#"\n", s)) = lex' (S.force s)
      | lex' (S.Cons r) =
        let
            val (t, s) = token r
        in 
            S.Cons (keywords t, lex s)
        end

    fun tokenToString t = 
        case t of
            INT => "INT"
          | BOOL => "BOOL"
          | ARROW => "ARROW"
          | TRUE => "TRUE"
          | FALSE => "FALSE"
          | FN => "FN"
          | REC => "REC"
          | FUN => "FUN"
          | DARROW => "=>"
          | IF => "IF"
          | THEN => "THEN"
          | ELSE => "ELSE"
          | FI => "FI"
          | LET => "LET"
          | LETTYPE => "LETTYPE"
          | IN => "IN"
          | END => "END"
          | COMMA => "COMMA"
          | LPAREN => "LPAREN"
          | RPAREN => "RPAREN"
          | SEMICOLON => "SEMICOLON"
          | NEGATE => "NEGATE"
          | EQUALS => "EQUALS"
          | LESSTHAN => "LESSTHAN"
          | TIMES => "TIMES"
          | MINUS => "MINUS"
          | PLUS => "PLUS"
          | COLON => "COLON"
	  | PERCENT => "PERCENT"
	  | FLOAT => "FLOAT"
	  | AFN => "AFN"
	  | ROLL => "ROLL"
	  | UNROLL => "UNROLL"
	  | PACK => "PACK"
	  | OPEN => "OPEN"
	  | INL => "INL"
	  | INR => "INR"
	  | CASE => "CASE"
	  | FST => "FST"
	  | SND => "SND"
	  | SQRT => "SQRT"
	  | ITOF => "ITOF"
	  | ABORT => "ABORT"
	  | ALL => "ALL"
	  | EXISTS => "EXISTS"
	  | MU => "MU"
	  | UNIT => "UNIT"
	  | VOID => "VOID"
	  | OF => "OF"
	  | DOT => "DOT"
	  | BAR => "BAR"
	  | LBRACK => "LBRACK"
	  | RBRACK => "RBRACK"
          | FLT f => "FLT(" ^ Real.toString f ^ ")"
          | NUMBER n => "NUMBER(" ^ Int.toString n ^ ")"
          | VAR v => "VAR(" ^ v ^ ")"

end;  (* structure Lexer *)
