
(* make a char stream into a Token.token stream - Tom 7 *)
structure Tokenize :> TOKENIZE =
struct

    exception ParseError of string

    fun checklegal c = 
	Token.islegal c
	   orelse raise ParseError ("bad character #" ^ Int.toString (ord c))

    local open Token
    in
	fun gettag "EM" = EM
	  | gettag "PL" = PL
	  | gettag "S" = S
	  | gettag "B" = Att B
	  | gettag "I" = Att I
	  | gettag "TT" = TT
	  | gettag "U" = U
	  | gettag s =
	    case explode s of
		[c] => if c >= #"0" andalso c <= #"9"
		    then Num (ord c - ord #"0")
		    else (case StringUtil.indexof colors c of
			      ~1 => raise ParseError ("bad tag: " ^ s)
			    | i => Clr i)
	      | _ => raise ParseError ("unknown tag: " ^ s)

	fun gtloose "S" = S
	  | gtloose "B" = Att B
	  | gtloose "I" = Att I
	  | gtloose "TT" = TT
	  | gtloose "U" = U
          | gtloose "PRE" = TT
	  | gtloose s = PL

        val gtloose = gtloose o StringUtil.ucase

    end

    fun tokenizer gt csi =
	let
	    fun tks (loc : int) (l : int) =
              if loc >= size csi then
                if loc = l then nil 
                  else [Token.Text (String.substring(csi, l, loc - l))]
              else case CharVector.sub (csi, loc) of
                #"<" => (case loc - l of
                           0 => parsetag (loc+1)
                         | n => Token.Text (String.substring(csi, l, n)) ::
                                   (parsetag (loc+1)))
              | c =>
                  let in
                    checklegal c;
                    tks (loc+1) l
                  end
	    and parsetag loc =
              if loc >= size csi then
                raise ParseError "unfinished tag"
              else case CharVector.sub (csi, loc) of
                #"/" => parsetagname true (loc+1) (loc+1)
              | #">" => raise ParseError "empty tag!"
              | c => 
                let in
                  checklegal c;
                  parsetagname false loc (loc+1)
                end
	    and parsetagname isend l loc =
              if loc >= size csi then
                raise ParseError "unterminated tag"
              else case CharVector.sub (csi, loc) of
                #">" => 
                ((if isend then Token.Closetag else Token.Tag)
                    (gt (String.substring(csi, l, loc-l))))
                           :: tks (loc+1) (loc+1)
              | c => 
                   let in
                     checklegal c;
                     parsetagname isend l (loc+1)
                   end
	in
	    tks 0 0
	end

    val tokenize = tokenizer gettag
    val tokenize_loose = tokenizer gtloose

end