
%   File   : RDTOK.PL
%   Author : R.A.O'Keefe
%   Updated: 2 July 1984
%   Purpose: Tokeniser in reasonably standard Prolog.

/*  Adapted from shared code written by the same author; all changes */
/*  Copyright (C) 1988, Swedish Institute of Computer Science. */

/*  This tokeniser is meant to complement the library READ routine.
    It recognises Dec-10 Prolog with the following exceptions:

	%( is not accepted as an alternative to {

	%) is not accepted as an alternative to )

	NOLC convention is not supported (read_name could be made to do it)

	,.. is not accepted as an alternative to | (hooray!)

	large integers are not read in as xwd(Top18Bits,Bottom18Bits)

	After a comma, "(" is read as ' (' rather than '('.  This does the
	parser no harm at all, and the Dec-10 tokeniser's behaviour here
	doesn't actually buy you anything.  This tokeniser guarantees never
	to return '(' except immediately after an atom, yielding ' (' every
	other where.

    In particular, radix notation is EXACTLY as in Dec-10 Prolog version 3.53.
    Some times might be of interest.  Applied to an earlier version of this file:
	this code took			1.66 seconds
	the Dec-10 tokeniser took	1.28 seconds
	A Pascal version took		0.96 seconds
    The Dec-10 tokeniser was called via the old RDTOK interface, with
    which this file is compatible.  One reason for the difference in
    speed is the way variables are looked up: this code uses a linear
    list, while the Dec-10 tokeniser uses some sort of tree.  The Pascal
    version is the program WLIST which lists "words" and their frequencies.
    It uses a hash table.  Another difference is the way characters are
    classified: the Dec-10 tokeniser and WLIST have a table which maps
    ASCII codes to character classes, and don't do all this comparison
    and and memberchking.  We could do that without leaving standard Prolog,
    but what do you want from one evening's work?

    Experiment by matsc:
    encode character classes as integers & rely on unification:
    00 = whitespace
    10 = lowercase
    20 = uppercase
    30 = digit
    40 = symbol
    50 = solo
*/


%   read_tokens(TokenList, Dictionary)
%   returns a list of tokens.  It is needed to "prime" read_tokens/2
%   with the initial blank, and to check for end of file.  The
%   Dictionary is a list of AtomName=Variable pairs in no particular order.

read_tokens(TokenList, Dictionary) :-
	'$getch'(NextCh, NextTyp),
	'$prompt'(Old, '     '),
	read_tokens(NextTyp, NextCh, Dictionary, TokenList),
	'$prompt'(_, Old).


%   The only difference between read_after_atom(Typ, Ch, Dict, Tokens) and
%   read_tokens/4 is what they do when Ch is "(".  read_after_atom
%   finds the token to be '(', while read_tokens finds the token to be
%   ' ('.  This is how the parser can tell whether <atom> <paren> must
%   be an operator application or an ordinary function symbol application.
%   See the library file READ.PL for details.

read_after_atom(50, 0'(, Dict, ['('|Tokens]) :- !,
	'$getch'(NextCh, NextTyp),
	read_tokens(NextTyp, NextCh, Dict, Tokens).
read_after_atom(Typ, Ch, Dict, Tokens) :-
	read_tokens(Typ, Ch, Dict, Tokens).




%   read_string(Typ, Ch, Chars, Quote, NextCh, NextTyp)
%   reads the body of a string delimited by Quote characters.
%   The result is a list of ASCII codes.  There are two complications.
%   If we hit the end of the file inside the string this predicate succeeds,
%   returning the end of file character as NextCh.
%   The other complication is that when we find a Quote
%   we have to look ahead one character in case it is doubled.
%   If we were going to accept C-like escape characters, as I think we
%   should, this would need changing (as would the code for 0'x).  But
%   the purpose of this module is not to present my ideal syntax but to
%   present something which will read present-day Prolog programs.


read_string(00, -1, [], _, 00, -1) :- !.	% end of file
read_string(50, Quote, Chars, Quote, NextTyp, NextCh) :- !,
	'$getch0'(Ch, Typ),			% closing or doubled quote
	more_string(Typ, Ch, Quote, Chars, NextTyp, NextCh).
read_string(_, Char, [Char|Chars], Quote, NextTyp, NextCh) :-
	'$getch0'(Ch, Typ),			% ordinary character
	read_string(Typ, Ch, Chars, Quote, NextTyp, NextCh).


more_string(50, Quote, Quote, [Quote|Chars], NextTyp, NextCh) :- !,
	'$getch0'(Ch, Typ),			% doubled quote
	read_string(Typ, Ch, Chars, Quote, NextTyp, NextCh).
more_string(NextTyp, NextCh, _, [], NextTyp, NextCh). % end


read_tokens(X, _, _, _) :- var(X), !, fail.	% space saver
read_tokens(00, Ch, Dict, Tokens) :-		% ignore layout.
	(   Ch > -1 ->
              '$getch'(NextCh, NextTyp),
	      read_tokens(NextTyp, NextCh, Dict, Tokens)
	;   eq(Tokens,[end_of_file])		% EOF
        ).
read_tokens(10, Ch0, Dict, [Token|Tokens]) :-
	'$getch0'(Ch, Typ),
	read_name(Typ, Ch, S0, NextCh, NextTyp),
	atom_token([Ch0|S0], Token),
	read_after_atom(NextTyp, NextCh, Dict, Tokens).
read_tokens(20, Ch0, Dict, [var(Var,S)|Tokens]) :-
	eq(S,[Ch0|S0]),
	'$getch0'(Ch, Typ),
	read_name(Typ, Ch, S0, NextCh, NextTyp),
	dic_lookup(Dict, S, Node),		%  lookup/enter in dictionary
        rdtok_var(Node, Var),
	read_tokens(NextTyp, NextCh, Dict, Tokens).
read_tokens(30, Ch, Dict, [number(I)|Tokens]) :-
	read_number(Ch, I, Dict, Tokens).	%  reads continuation too.
read_tokens(40, 0'/, Dict, Tokens) :- !,	%  /*comment?
	'$getch0'(NextCh, NextTyp),
	read_solidus(NextTyp, NextCh, Dict, Tokens).
read_tokens(40, 0'., Dict, Tokens) :- !,	%  full stop
	'$getch0'(NextCh, NextTyp),		%  or possibly .=. &c
	read_fullstop(NextTyp, NextCh, Dict, Tokens).
read_tokens(40, Ch, Dict, [Token|Tokens]) :-
	'$getch0'(AnotherCh, Typ),
	read_symbol(Typ, AnotherCh, Chars, NextCh, NextTyp), % might read 0 chars
	atom_token([Ch|Chars], Token),		% so might be [Ch]
	read_after_atom(NextTyp, NextCh, Dict, Tokens).
read_tokens(50, Ch, Dict, Tokens) :-
	read_tokens(Ch, Dict, Tokens).

read_tokens(X, _, _) :- var(X), !, fail.	% space saver
read_tokens(0'%, Dict, Tokens) :-		%  %comment
        '$skip'(0'
             ),
	'$getch'(NextCh, NextTyp),
	read_tokens(NextTyp, NextCh, Dict, Tokens).
read_tokens(0'!, Dict, [atom(!)|Tokens]) :-	%  This is a special case so
	'$getch'(NextCh, NextTyp),		%  that !. reads as two tokens.
	read_after_atom(NextTyp, NextCh, Dict, Tokens).	%  It could be cleverer.
read_tokens(0'(, Dict, [' ('|Tokens]) :-
	'$getch'(NextCh, NextTyp),
	read_tokens(NextTyp, NextCh, Dict, Tokens).
read_tokens(0'), Dict, [')'|Tokens]) :-
	'$getch'(NextCh, NextTyp),
	read_tokens(NextTyp, NextCh, Dict, Tokens).
read_tokens(0',, Dict, [','|Tokens]) :-
	'$getch'(NextCh, NextTyp),
	read_tokens(NextTyp, NextCh, Dict, Tokens).
read_tokens(0';, Dict, [atom(;)|Tokens]) :-	%   ; is nearly a punctuation
	'$getch'(NextCh, NextTyp),		%   mark but not quite (e.g.
						%   you can :-op declare it).
	read_after_atom(NextTyp, NextCh, Dict, Tokens).	%   was read_tokens --MC
read_tokens(0'[, Dict, ['['|Tokens]) :-
	'$getch'(NextCh, NextTyp),
	read_tokens(NextTyp, NextCh, Dict, Tokens).
read_tokens(0'], Dict, [']'|Tokens]) :-
	'$getch'(NextCh, NextTyp),
	read_after_atom(NextTyp, NextCh, Dict, Tokens).	%   was read_tokens --MC
read_tokens(0'{, Dict, ['{'|Tokens]) :-
	'$getch'(NextCh, NextTyp),
	read_tokens(NextTyp, NextCh, Dict, Tokens).
read_tokens(0'|, Dict, ['|'|Tokens]) :-
	'$getch'(NextCh, NextTyp),
	read_tokens(NextTyp, NextCh, Dict, Tokens).
read_tokens(0'}, Dict, ['}'|Tokens]) :-
	'$getch'(NextCh, NextTyp),
	read_after_atom(NextTyp, NextCh, Dict, Tokens).	%   was read_tokens --MC
read_tokens(0'", Dict, [string(S)|Tokens]) :-	%  "string"
	'$getch0'(Ch, Typ),
	read_string(Typ, Ch, S, 0'", NextTyp, NextCh),
	read_tokens(NextTyp, NextCh, Dict, Tokens).
read_tokens(0'', Dict, [Token|Tokens]) :-	%  'atom'
	'$getch0'(Ch, Typ),
	read_string(Typ, Ch, S, 0'', NextTyp, NextCh),
	atom_token(S, Token),			%  NOT name/2 !!
	read_after_atom(NextTyp, NextCh, Dict, Tokens).
read_tokens(0'_, Dict, [var(Var,S)|Tokens]) :-	%  have to watch out for "_"
	eq(S,[0'_|S0]),
	'$getch0'(Ch, Typ),
	read_name(Typ, Ch, S0, NextCh, NextTyp),
	(  eq(S0,  "") ->				%  anonymous variable
	     true
	;  dic_lookup(Dict, S, Node),		%  lookup/enter in dictionary
	   rdtok_var(Node, Var)
	),
	read_tokens(NextTyp, NextCh, Dict, Tokens).



%   read_solidus(Typ, Ch, Dict, Tokens)
%   checks to see whether /Ch is a /* comment or a symbol.  If the
%   former, it skips the comment.  If the latter it just calls read_symbol.
%   [We have to take great care with /* comments to handle end of file
%    inside a comment, which is why read_solidus/2 passes back an end of
%    file character or a (forged) blank that we can give to read_tokens.
%    Flushed for now. --matsc]

read_solidus(40, 0'*, Dict, Tokens) :- !,
	'$skip'(0'*),
	'$getch0'(Ch, Typ),
	read_solidus_2(Typ, Ch, Dict, Tokens).
read_solidus(Typ, Ch, Dict, [Token|Tokens]) :-
	read_symbol(Typ, Ch, Chars, NextCh, NextTyp), % might read 0 chars
	atom_token([0'/|Chars], Token),
	read_after_atom(NextTyp, NextCh, Dict, Tokens).	% was read_tokens --MC

read_solidus_2(40, 0'/, Dict, Tokens) :- !,
	'$getch'(NextCh, NextTyp),
	read_tokens(NextTyp, NextCh, Dict, Tokens).
read_solidus_2(40, 0'*, Dict, Tokens) :- !,
	'$getch0'(Ch, Typ),
	read_solidus_2(Typ, Ch, Dict, Tokens).
read_solidus_2(_, _, Dict, Tokens) :-
	'$skip'(0'*),
	'$getch0'(Ch, Typ),
	read_solidus_2(Typ, Ch, Dict, Tokens).

%   read_name(Typ, Char, String, LastCh, LastTyp)
%   reads a sequence of letters, digits, and underscores, and returns
%   them as String.  The first character which cannot join this sequence
%   is returned as LastCh.

read_name(10, Char, String, LastCh, LastTyp) :- !,
	eq(String,  [Char|Chars]),
	'$getch0'(NextCh, NextTyp),
	read_name(NextTyp, NextCh, Chars, LastCh, LastTyp).
read_name(20, Char, String, LastCh, LastTyp) :- !,
	eq(String,  [Char|Chars]),
	'$getch0'(NextCh, NextTyp),
	read_name(NextTyp, NextCh, Chars, LastCh, LastTyp).
read_name(30, Char, String, LastCh, LastTyp) :- !,
	eq(String,  [Char|Chars]),
	'$getch0'(NextCh, NextTyp),
	read_name(NextTyp, NextCh, Chars, LastCh, LastTyp).
read_name(50, 0'_, String, LastCh, LastTyp) :- !,
	eq(String,  [0'_|Chars]),
	'$getch0'(NextCh, NextTyp),
	read_name(NextTyp, NextCh, Chars, LastCh, LastTyp).
read_name(LastTyp, LastCh, [], LastCh, LastTyp).



%   read_symbol(Typ, Ch, String, NextCh, NextTyp)
%   reads the other kind of atom which needs no quoting: one which is
%   a string of "symbol" characters.  Note that it may accept 0
%   characters, this happens when called from read_fullstop.

read_symbol(40, Char, String, LastCh, LastTyp) :- !,
	eq(String,  [Char|Chars]),
	'$getch0'(NextCh, NextTyp),
	read_symbol(NextTyp, NextCh, Chars, LastCh, LastTyp).
read_symbol(LastTyp, LastCh, [], LastCh, LastTyp).


%   read_fullstop(Typ, Char, Dict, Tokens)
%   looks at the next character after a full stop.  There are
%   three cases:
%	(a) the next character is an end of file.  We treat this
%	    as an unexpected end of file.  The reason for this is
%	    that we HAVE to handle end of file characters in this
%	    module or they are gone forever; if we failed to check
%	    for end of file here and just accepted .<EOF> like .<NL>
%	    the caller would have no way of detecting an end of file
%	    and the next call would abort.
%	(b) the next character is a layout character.  This is a
%	    clause terminator.
%	(c) the next character is anything else.  This is just an
%	    ordinary symbol and we call read_symbol to process it.

read_fullstop(00, Ch, _, Tokens) :- !,		% END OF CLAUSE
	(   Ch > -1 -> eq(Tokens,  [.])
	;   eq(Tokens,  [atom(.)])
        ).
read_fullstop(Typ, Ch, Dict, [Token|Tokens]) :-
	read_symbol(Typ, Ch, S, NextCh, NextTyp),
	atom_token([0'.|S], Token),
	read_after_atom(NextTyp, NextCh, Dict, Tokens).	% was read_tokens --MC



%   read_number is complicated by having to understand radix notation.
%   There are three forms of integer:
%	0 ' <any character>	- the ASCII code for that character
%	<digit> ' <digits>	- the digits, read in that base
%	<digits>		- the digits, read in base 10.

/* New number tokenizer, by AC and MC. */
read_number(Ch, Value, Dict, Tokens) :-
	read_int(Ch, 30, S-S, Chars, NextCh, NextTyp),
	read_after_int(NextCh, NextTyp, Chars, Value, Dict, Tokens).

read_after_int(0'', _, Chars-[], Value, Dict, Tokens) :- !, % 999'<rest>
	name(Base, Chars),
	read_based_int(Base, Value, NextCh, NextTyp),
	read_tokens(NextTyp, NextCh, Dict, Tokens).
read_after_int(0'., _, Chars, Value, Dict, Tokens) :- !, % 999.<rest>
	'$getch0'(Ch1, Typ),
	read_after_int_period(Ch1, Typ, Chars, Value, Dict, Tokens).
read_after_int(Ch, Typ, Chars-[], Value, Dict, Tokens) :- % 999
	name(Value, Chars),
	read_tokens(Typ, Ch, Dict, Tokens).

read_after_int_period(Ch, 30, Chars-[0'.|S], Value, Dict, Tokens) :- !,
	read_int(Ch, 30, Chars-S, Chars1, NextCh1, NextTyp1),
	read_after_fraction(NextCh1, NextTyp1, Chars1, Value, NextCh, NextTyp),
	read_tokens(NextTyp, NextCh, Dict, Tokens).
read_after_int_period(Ch, Typ, Chars-[], Value, Dict, Tokens) :- % integer followed by period.
	name(Value, Chars),
	read_fullstop(Typ, Ch, Dict, Tokens).

read_after_fraction(0'E, _, Chars-[0'E|S], Value, NextCh, NextTyp) :- !,
	'$getch0'(Ch1, Typ),
	read_after_fraction1(Ch1, Typ, Chars-S, Value, NextCh, NextTyp).
read_after_fraction(0'e, _, Chars-[0'e|S], Value, NextCh, NextTyp) :- !,
	'$getch0'(Ch1, Typ),
	read_after_fraction1(Ch1, Typ, Chars-S, Value, NextCh, NextTyp).
read_after_fraction(Ch, Typ, Chars-[], Value, Ch, Typ) :-
	name(Value, Chars).

read_after_fraction1(0'-, _, Chars-[0'-|S], Value, NextCh, NextTyp) :- !,
	'$getch0'(Ch1, Typ), 
	read_after_fraction2(Ch1, Typ, Chars-S, Value, NextCh, NextTyp).
read_after_fraction1(0'+, _, Chars-[0'+|S], Value, NextCh, NextTyp) :- !,
	'$getch0'(Ch1, Typ),
	read_after_fraction2(Ch1, Typ, Chars-S, Value, NextCh, NextTyp).
read_after_fraction1(Ch, Typ, Chars, Value, NextCh, NextTyp) :-
	read_after_fraction2(Ch, Typ, Chars, Value, NextCh, NextTyp).

read_after_fraction2(Ch, Typ, Chars, Value, NextCh, NextTyp) :-
	read_int(Ch, Typ, Chars, Chars1-[], NextCh, NextTyp),
	name(Value, Chars1).

read_based_int(0, Value, NextCh, NextTyp) :- !,
	'$getch0'(Value, _),
	'$getch'(NextCh, NextTyp).
read_based_int(Base, Value, NextCh, NextTyp) :-
	Base >= 2,
	Base =< 36, !,
	'$getch0'(Ch, Typ),
	read_name(Typ, Ch, Chars, NextCh, NextTyp),
%	'$prolog_radix'(R, Base),
	(   number_chars(Value, Chars, Base) -> true
%	(   number_chars(Value, Chars) -> true
%	(   name(Value, Chars) -> true
	;   eq(Value,  Chars)
	).
%	'$prolog_radix'(Base, R).
read_based_int(Value, Value, 0'', 50).

read_int(Digit, 30, S0-[Digit|S], Chars, NextCh, NextTyp) :- !,
	'$getch0'(Ch, Typ),
	read_int(Ch, Typ, S0-S, Chars, NextCh, NextTyp).
read_int(LastCh, LastTyp, Value, Value, LastCh, LastTyp).

atom_token(L, Token) :-
%	atom_chars(A, L), !,
	name(A, L), !,
	eq(Token,  atom(A)).
atom_token(L, badatom(L)). 

rdtok_var(Node, Var) :-
        var(Node), !, eq(Node,  [Var|_]).
rdtok_var([Var], Var).

dic_lookup([[B|C]|D],B,C) :- !.
dic_lookup([F|G],H,I) :- dic_lookup(G,H,I).
