/*  TOKENISE.PL  */


:- module tokenise.


:- public chars_to_tokens/2,
          read_tokens/1,
          token_contents/2,
          define_token_error/1,
          default_token_error/2.


/*
SPECIFICATION
-------------

This module defines the predicate chars_to_tokens, for converting lists
of characters into Prolog tokens, and the predicate read_tokens, for
reading tokens from a file.


PUBLIC chars_to_tokens( L+, Tokens? ):
--------------------------------------

Unifies Tokens with a list of tokens formed by tokenising L, a list of
character codes. See below for a specification of the token
representation.

Tokens are read one-by-one from L until reaching its end. Unlike
read_tokens below, this predicate does not stop at the end of a clause,
and it delivers a token for all full-stops, even if they are clause
terminators.

If it detects an error, this predicate fails, and reports the error
as described below.


PUBLIC read_tokens( Tokens? ):
------------------------------

Reads from the current input stream up to the end of the next full-stop
that can end a clause, returning the result as a list of tokens. The
list includes all the tokens up to but not including the clause
terminator, so the clause "a:-b." would yield three tokens, for a, :-,
and b.

Note that unlike chars_to_tokens, this predicate stops at the end of a
clause.

It handles errors in the same way as chars_to_tokens.


PUBLIC token_contents( Token+, Term? ):
---------------------------------------

Converts from a token to the corresponding "raw" term, i.e.
    atom(A) -> A
    var(V)  -> V
as in the table below.


Token representation.
---------------------

Each token is represented in the following way:
    Atom             -- if Atom (the token) is ) [ ] { } | ,
    '('              -- if the token is ( immediately preceded by
                        an atom.
    ' ('             -- if the token is ( otherwise.
    atom(Atom)       -- if Atom is an unquoted other atom.
    var(Atom)        -- if Atom starts with a capital letter or an
                        underline.
    quoted_atom(Atom)-- for an atom that was enclosed in single quotes.
    string(List)     -- for something that was enclosed in double
                        quotes.
    integer(N)       -- integers.
    real(N)          -- reals.

Thus,
    a('B',C,"D")
would be tokenised as
    Tokens = [ atom(a), '(', quoted_atom(B),
               ',',  var(C), ',', string([68]), ')'
             ]

This representation is similar to that expected by Richard O'Keefe's
READ parser and by my modified version READ_TERM of it. However, they
are not identical, and READ_TERM contains a conversion predicate for
mapping between the two. The representation is also similar to that
produced by the original RDTOK tokeniser from which I derived this
module; but again, they are not identical.

The distinction between '(' and ' (' is so the parser can tell whether
<atom> ( is an operator application or an ordinary functor application.


Token syntax.
-------------

Layout characters between tokens are ignored, except that the tokeniser
distinguishes operator applications from functor applications by their
spacing: if there are no characters separating f from (, f is assumed to
be a normal functor, and not an operator.

All characters are assumed to be printable (including space). The effect
of tabs, deletes, backspaces, nulls etc is undefined. Files may include
one end-of-file character, which will be assumed to end the file (what
did you expect?). If this occurs inside a character list, it's taken to
end the list, even if there are other characters following.

Two kinds of comment are recognised. /* */ comments are skipped entirely,
as in standard Prolog. % comments are skipped up to the next newline
character (i.e. one for which is_newline_char(C) holds) or end-of-file.


1) Tokens.

<token> ::= <atom>
          | <number>
          | <punctuation>
          | <string>
          | <variable>


2) Atoms.

<atom> ::= <normal_atom>
         | <quoted_atom>
         | <symbol_atom>
         | !
         | ;

<normal_atom> ::= <lowercase_or_underline> <letter_underline_or_digit>*

<quoted_atom> ::= ' <atom_string_body> '

<atom_string_body> ::= <atom_string_char>*

<atom_string_char> ::= ''
<atom_string_char> ::= <printable_character>

<symbol_atom> ::= <symbol_start> <symbol_char>*

<symbol_start> ::= ... any printable character except a letter, digit,
                       underline, punctuation, percent, slash
                       (starting a comment), shriek, semicolon
                       or double or single quote ...

<symbol_char> ::= ... any one of #$&*+-./:<=>?@\^`~ ...


3) Numbers.

<number> ::= <integer>
           | <real>

<integer> ::= <base_10_integer>
            | <character_constant>
            | <based_integer>

<base_10_integer> ::= <simple_integer>

<character_constant> ::= 0 ' <printable_character>

<based_integer> ::= <base> ' <rest_of_based>

<base> ::= <simple_integer>

<rest_of_based> ::= ... any sequence of digits, underlines, or letters.
                        Digits represent themselves, letters represent
                        10 (for a and A) to 35 (for z and Z). Underlines
                        are filler characters, with no significance.
                        The number terminates as soon as a digit or letter
                        larger than or equal to the base has been read ...

<real> ::= <digit>* . <simple_integer>* <exponent>?

<exponent> ::= e <sign>? <simple_integer>

<simple_integer> ::= <digit> <digit>*


4) Punctuation tokens.

<punctuation> ::= ( | ) | [ | ] | { | } | | | ,


5) Strings.

<string> ::= " <string_string_body> "


6) Variables.

<variable> ::= <anonymous_var>
             | <named_var>

<anonymous_var> ::= _

<named_var> ::= <uppercase_or_underline> <letter_underline_or_digit>*


7) Miscellaneous definitions.

<letter_underline_or_digit> ::= a letter (either case), a digit, or a _ ...

<lowercase_or_underline> ::= ... a lower-case letter or a _ ...

<printable_character> ::= ... any printable character, including space ...

<uppercase_or_underline> ::= ... a upper-case letter or a _ ...


Token errors.
-------------

When a tokenising predicate fails, it calls a private predicate
    token_error( Message, Input, Tokens )
whose arguments are: a message about the error; the input being
tokenised; and the tokens constructed before the one where the error
was detected.

By default, this predicate calls
    default_token_error( Message, Input, _ )
which reports the error in the form
    ** Illegal digit/letter in based number
    X is 2'103
(for input X is 2'103).

You can make it call another predicate P with these arguments by using
define_token_error(P). You will have to define P to be of arity three;
usually, you will either write out the error immediately, or save it
for future output, as the Tutor does.

Having called P, token_error forces the main tokenising predicate to
fail. It does not matter therefore whether P fails or succeeds.


PUBLIC default_token_error( Message+, Input+, Tokens+ ):

Message is a list of atoms or numbers, being the token error message to
be reported. It will be one of these lists:
    ['End-of-file in string']
    ['End-of-file following escape character in string']
    ['End-of-file inside comment']
    ['Illegal number base before single quote']
    ['Illegal digit/letter in based number']
    ['Non-digit in exponent']
(If writing out the lists yourself, leave one space between each element.
At the moment, all the lists have only one element, but future versions
of the tokeniser might expand them to include extra information.)

Input is a list of character codes, being the input being tokenised,
with any end-of-file character stripped from the end. The last character
read may be missing.

Tokens is a list of tokens, same format as successfully returned from
read_tokens/1, being all the tokens created before the error was
detected.

default_token_error writes the error in the format shown above, message
first and then input. It ignores Tokens.    


PUBLIC define_token_error( P+ ):

Causes P( Message, Input, Tokens ) to be called when a token error
occurs. This is called at the point in input where the error was
detected.
*/


/*
IMPLEMENTATION
--------------

The code below is adapted from the DEC-10 Prolog library public-domain
tokeniser RDTOK.PL, together with some parts from the updated version in
Richard O'Keefe's book "The Craft of Prolog". It would be a good idea to
look at that if you need to modify or understand this code.

I have made a number of changes:

1. Input.
---------

Input can come either from the current input stream, or from a list. The
predicates distinguish between these possibilities by the Source
parameter. This is 'file(_,_)' for the CIS, and 'string(L,_)' for the
list of character codes L.

The initial values of these structures, i.e. what you pass to
read_tokens/3, should be either file([],[]) or string(ListOfChars,[]).

There are two input operations: read_nextch and putback.

To read a new character, call read_nextch(Source0,Source1,C1). If Source0
is
    file([],Previous)
, this calls get0(C1), and sets Source1 to
    file([], [C1|Previous])

If Source0 is
    string(L,Previous)
this takes the first character from L, setting Source1 to
    string(RestOfL,[C1|Previous])

In both cases, the Previous argument holds the characters so far read,
in reverse order. This is used when reporting errors, and at no other
time. If you are tokenising very long inputs, you might want to shorten
Previous every so often by (say) never letting it grow to more than 80
in size.

Thus the state of a list is completely represented by the Source
parameter. For a file of course, the state is held by the operating
system, so the Source parameter is a bit of a cheat. However, it does
let us use the same predicates for both files and character lists, which
is essential.

When tokenising, you don't need to know more anything about the internal
structure of Sources - just call read_nextch or putback. For further
details on implementation, see the end of this file.


2. When to stop.
----------------

The original tokeniser read tokens until coming to the end of a clause,
or falling off the end of file. However, the current module has
different stopping conditions depending on whether you call
'read_tokens' or 'chars_to_tokens'. The latter ignores clause boundaries
and continues until the end of the character list. This is necessary for
the Tutor because not all input is Prolog clauses: we use the tokeniser
for other kinds of sentence.

This has meant several changes, the most pervading of which is......                


3. Program inversion.
---------------------

In the original tokeniser, the main predicate was read_tokens. It called
subsidiary predicates, which would call read_tokens recursively to
continue. I have inverted this so that the main predicate is read_token,
which reads just _one_ token. It has to be called repeatedly by
read_tokens.

Doing this makes it easier to deal with the different stopping
conditions mentioned above.


4. Character codes.
-------------------

Unlike O'Keefe, we can't guarantee our Prolog will have character
constants (he assumes that, e.g. 0'A means 65. So I have replaced all
explicit character constants by calls to CHARS.PL. Occasionally these
predicates are used to return characters, not test them.


5. Error-handling.
------------------

I have added the mechanism described above.


Portability:
------------

(A) Character constants were mentioned above.

(B) Check the member call in read_symbol. The string there contains
a backslash, which on my system must be written as \\. Your system
may have different conventions.

(C) You may need to accomodate different syntaxes for some tokens if
your Prolog's syntax differs from what's implemented here. Look out
for escapes in strings and atoms. I have only implemented \' \" and \\
for the two quotes and backslash itself. This is a subset of what
my system gives me, and it's adequate for students. Change read_char
to change the escape action.

(D) Numbers. This is the worst part. As I took this over from O'Keefe's
code, the predicate read_number reads numbers, and converts _integers_
to their value by doing the standard multiply-by-10-and-add trick. It
does not try to convert reals at all, but instead relies on the
predicate
    number_chars( N, Digits )
which converts between an integer or real in N and a digit-sequence
(possibly with decimal point, exponent, etc) in Digits. Think of it
as an analogue to name/2.

I have not had time to write read_number in standard Prolog, and have
fallen back on Poplog. With luck, you will be able to persuade your
Prolog to call some alien routine that can do the conversion. If not, it
is possible to code in Prolog alone, but you need to take great care:
simple-minded routines often introduce large errors. The following book
gives some hints:
    Algol 68 transput
    (part II: an implementation model)
    by J.C. van Vliet
    Mathematisch Centrum
    Amsterdam 1979
    ISBN 90 6196 179 3

The following two news items from sci.math also sound useful. I haven't
checked the references.

    ``In 1990, Guy L. Steele of Thinking Machines Corp. and Jon L. White
    of Lucid published a paper in the SIGPLAN '90 conference
    proceedings, entitled "How to Print Floating-Point Numbers
    Accurately". The paper contains various algorithms which will print
    a floating point number accurately, according to certain criteria
    (for example, minimum number of digits needed to reconstruct the
    exact bit pattern, low error bound, etc.)

    The paper is very readable and is an instant classic, IMHO, together
    with its companion paper "How to Read Floating-Point Numbers
    Accurately" in the same proceedings.

    At the end of their paper, they mention that a "Portable C
    implementation is in progress."''

    (Stephan Neuhaus)

    `` ``Instant classic''? As you can see by opening up your copy of
    Knuth, someone by the name of Taranto discovered ``how to print
    floating point numbers accurately'' before 1960. I have a perfectly
    portable C implementation of Taranto's method, for any radix-2
    floating point machine: it prints a C ``double'' to any precision
    you want---perfect accuracy down to the last digit if you provide
    space---without using anything but standard ``double'' operations.
    It's faster than Sun's printf (which also seems to have been written
    for accuracy, but probably takes advantage of extended precision)
    for values near 1; I haven't optimized it for values which need a
    lot of scaling to get close to 1, though it'll still produce correct
    results in a reasonable time. I've donated the code to CSRG for BSD
    4.4.''

    (Dan Bernstein)

Efficiency.
-----------

This is not great, I wrote the code for portability. You could make
read_token table-driven, which would speed it up somewhat.
*/


/*  Load Pop-11 code in TOKENISE.P. Omit this if not using Poplog.  */
:- pop_compile( 'tokenise.p' ).


:- needs
    append/3,
    reverse/2,
    bug/2,
    member/2,
    file(chars).


:- dynamic
    '$token_error'/3,
    token_error/3.


/*
Top-level routines.
-------------------
*/


read_tokens( TokenList ) :-
    read_tokens( file([],[]), stop_on_end_of_clause, TokenList ).


chars_to_tokens( Chars, TokenList ) :-
    read_tokens( string(Chars,[]), stop_on_end_of_list, TokenList ).


/*  read_tokens( Source+, WhenToStop+, TokenList? ):
        This handles input from both files and lists. Source is
        'file([],[])' for input from the current input stream,
        'string(List,[])' otherwise.

        WhenToStop is either 'stop_on_end_of_clause' (this implements
        read_tokens) or 'stop_on_end_of_file' (which implements
        chars_to_tokens).
*/
read_tokens( Source, WhenToStop, TokenList ) :-
    is_space_char( Space ),
    /*  Need to prime read_tokens/5 with an innocuous character...  */
    read_tokens( Space, Source, WhenToStop, not_after_atom, End, End ),
    !,
    TokenList = End. 
    /*  ... this makes it steadfast.  */


/*
The main tokeniser.
-------------------

I have followed the following naming conventions:
C0, Source0:    the initial character and source on entry to a
                predicate.
CN, SourceN:    intermediate values calculated inside a predicate,
                often by calling read_nextch.
C, Source  :    those returned by it.
WhenToStop :    see above.
State0     :    indicates whether the token about to be read immediately
                follows an atom. This will affect whether ( is tokenised
                as '(' or ' (' (see specification). The values are
                { after_atom, not_after_atom, end } where 'end' is returned
                only on an end-of-file.
State      :    its new value, returned by the predicate.


Error-handling.
---------------

When a predicate detects an error, it calls
    note_token_error( Message, Source )
which saves its parameters by asserting them into the database. The
caller should then ensure that read_token fails; it will usually do this
by failing itself.

The predicate read_tokens which calls read_token repeatedly tests for
this failure. If it occurs, it calls
    token_error( Message, Source, TokensSoFar )

By default, token_error calls default_token_error, which displays the
error message and position of the error. However, you can change this by
calling define_token_error(P) as described in the specification part.
*/


/*  read_tokens( C0+, Source0+, WhenToStop+, State0+, Tokens-, End- ):
        The main driving predicate for read_token. Calls it inside
        a loop, deciding whether or not to continue at each
        cycle.

        Tokens and End form a difference list. The list of tokens is
        built up in Tokens. We do things this way so that this partial
        list is available when an error is detected.
*/
read_tokens( C0, Source0, WhenToStop, State0, Tokens, End0 ) :-
    read_token( C0, Source0, State0, Token, C1, Source1, State1 ),
    !,
    (
        State1 = end,
        WhenToStop = stop_on_end_of_clause
    ->
        End0 = [end_of_file]
    ;
        State1 = end
    ->
        End0 = []
    ;
        WhenToStop = stop_on_end_of_clause,
        Token = end_of_clause
    ->
        End0 = []
    ;
        Token = end_of_clause
    ->
        End0 = [atom('.')|End1],
        read_tokens( C1, Source1, WhenToStop, State1, Tokens, End1 )
    ;
        End0 = [Token|End1],
        read_tokens( C1, Source1, WhenToStop, State1, Tokens, End1 )
    ).

read_tokens( _, _, _, _, TokensSoFar, [] ) :-
    token_error( TokensSoFar ),
    fail.


/*  read_token( C0+, Source0+, State0, Token-, C-, Source-, State- ):
        C0 starts a token whose characters follow in Source0.
        Return the token in Token, the character following it in
        C, and the new source in Source.

        The after atom states are State0 and State.

        Fail if error detected.
*/
read_token( C0, Source0, _, _, C0, Source0, end ) :-
    is_end_of_file_char(C0),
    !.

read_token( C0, Source0, _, Token, C, Source, State ) :-
    is_layout_char(C0),
    read_nextch( Source0, Source1, C1 ),
    !,
    read_token( C1, Source1, not_after_atom, Token, C, Source, State ).

read_token( C0, Source0, _, Token, C, Source, State ) :-
    is_percent_char(C0),
    !,
    read_nextch( Source0, Source1, C1 ),
    skip_to_nl_or_eof( C1, Source1, C2, Source2 ),
    read_token( C2, Source2, Token, C, Source, State ).
    /*  As this is, it does not fail if an end-of-file occurs inside
        the comment. This seems OK to me (the comment might be the
        last line of the file). However, if you prefer them to fail,
        it's easy to make them do so by inserting the line
            not( is_end_of_file_char(C2) )
        before the call to read_token.
    */

read_token( C0, Source0, _, Token, C, Source, State ) :-
    is_slash_char(C0),
    !,
    read_nextch( Source0, Source1, C1 ),
    read_solidus( C1, Source1, Token, C, Source, State ).

read_token( C0, Source0, _, atom(!), C, Source, after_atom ) :-
    is_shriek_char(C0),
    !,
    /*  Special case so that !. reads as two tokens.  */
    read_nextch( Source0, Source, C ).

read_token( C0, Source0, not_after_atom, ' (', C, Source, not_after_atom ) :-
    is_left_round_bracket_char(C0),
    !,
    read_nextch( Source0, Source, C ).

read_token( C0, Source0, after_atom, '(', C, Source, not_after_atom ) :-
    is_left_round_bracket_char(C0),
    !,
    read_nextch( Source0, Source, C ).

read_token( C0, Source0, _, ')', C, Source, not_after_atom ) :-
    is_right_round_bracket_char(C0),
    !,
    read_nextch( Source0, Source, C ).

read_token( C0, Source0, _, ',', C, Source, not_after_atom ) :-
    is_comma_char(C0),
    !,
    read_nextch( Source0, Source, C ).

read_token( C0, Source0, _, atom(;), C, Source, not_after_atom ) :-
    is_semicolon_char(C0),
    !,
    /*  ; is nearly a punctuation mark but not quite (e.g., you can
        "op" declare it.
    */
    read_nextch( Source0, Source, C ).

read_token( C0, Source0, _, '[', C, Source, not_after_atom ) :-
    is_left_square_bracket_char(C0),
    !,
    read_nextch( Source0, Source, C ).

read_token( C0, Source0, _, ']', C, Source, not_after_atom ) :-
    is_right_square_bracket_char(C0),
    !,
    read_nextch( Source0, Source, C ).

read_token( C0, Source0, _, '{', C, Source, not_after_atom ) :-
    is_left_curly_bracket_char(C0),
    !,
    read_nextch( Source0, Source, C ).

read_token( C0, Source0, _, '|', C, Source, not_after_atom ) :-
    is_stick_char(C0),
    !,
    read_nextch( Source0, Source, C ).

read_token( C0, Source0, _, '}', C, Source, not_after_atom ) :-
    is_right_curly_bracket_char(C0),
    !,
    read_nextch( Source0, Source, C ).

read_token( C0, Source0, _, Token, C, Source, State ) :-
    is_dot_char( C0 ),
    !,
    read_nextch( Source0, Source1, C1 ),
    read_fullstop( C1, Source1, Token, C, Source, State ).

read_token( C0, Source0, _, string(S), C, Source, not_after_atom ) :-
    is_double_quote_char( C0 ),
    !,
    read_nextch( Source0, Source1, C1 ),
    read_string( C1, Source1, C0, S, C, Source ).

read_token( C0, Source0, _, quoted_atom(A), C, Source, after_atom ) :-
    is_single_quote_char( C0 ),
    !,
    read_nextch( Source0, Source1, C1 ),
    read_string( C1, Source1, C0, S, C, Source ),
    name( A, S ).

read_token( C0, Source0, _, var(Name), C, Source, not_after_atom ) :-
    ( is_uppercase_char(C0) ; is_underline_char(C0) ),
    !,
    /*  Have to watch out for "_".  */
    read_name( C0, Source0, S, C, Source ),
    name( Name, S ).

read_token( C0, Source0, _, N, C, Source, State ) :-
    is_digit_char(C0),
    !,
    read_number( C0, Source0, N, C, Source, State ).

read_token( C0, Source0, _, atom(A), C, Source, after_atom ) :-
    is_lowercase_char(C0),
    !,
    /*  No corresponding problem with _ here.  */
    read_name( C0, Source0, S, C, Source ),
    name( A, S ).

read_token( C0, Source0, _, atom(A), C, Source, after_atom ) :-
    /*  This must be the last clause.  */
    read_nextch( Source0, Source1, C1 ),
    read_symbol( C1, Source1, Chars, C, Source ),
    /*  Might read 0 chars.  */
    name( A, [C0|Chars] ).


/*
Strings.
--------
*/


/*  read_string( C0+, Source0+, Quote+, Chars-, C-, Source- ):
        Reads the body of a string delimited by Quote characters from
        Source0. The result is returned in Chars, with C being the
        character following the string, and Source being the
        corresponding source.
*/
read_string( Quote, Source0, Quote, [], C, Source ) :-
    read_nextch( Source0, Source, C ).

read_string( C0, Source0, _, _, _, _ ) :-
    is_end_of_file_char( C0 ),
    !,
    note_token_error(['End-of-file in string'],Source0),
    fail.

read_string( C0, Source0, Quote, [Char|Chars], C, Source ) :-
    !,
    read_char( C0, Source0, Quote, Char, C1, Source1 ),
    read_string( C1, Source1, Quote, Chars, C, Source ).


/*  read_char( C0+, Source0+, Quote+, Char-, C-, Source- ):
        C0 is the first character of something representing one character
        in a string. If it's an escape, read more characters and decode them,
        otherwise return it.
*/
read_char( C0, Source0, Quote, Char, C, Source ) :-
    is_backslash_char(C0),
    !,
    read_nextch( Source0, Source1, C1 ),
    escape_char( Source1, C1, Char ),
    read_nextch( Source1, Source, C ).

read_char( C0, Source0, Quote, C0, C, Source ) :-
    !,
    read_nextch( Source0, Source, C ).


escape_char( _, C, C ) :-
    is_single_quote_char(C) ;
    is_backslash_char(C) ;
    is_double_quote_char(C).

escape_char( Source, C, _ ) :-
    is_end_of_file_char(C),
    !,
    note_token_error(['End-of-file following escape character in string'],Source),
    fail.


/*
Comments.
---------
*/


/*  read_solidus( CPre, C0, Source+, Tokens-, State- ):
        Checks to see whether /C is a /star comment or a symbol. If the
        former, it skips the comment. If the latter it just calls
        read_symbol. RDTOK took great care with /star comments to
        handle end of file inside a comment, which is why read_solidus/2
        passes back an end of file character or a (forged) blank for
        passing to read_token. I call 'bug', but the rest of the code
        is unchanged.

        Fail on eof.
*/
read_solidus( C0, Source0, Token, C, Source, State ) :-
    is_star_char( C0 ),
    !,
    read_nextch( Source0, Source1, C1 ),
    read_solidus( C1, Source1, C2, Source2 ),
    read_token( C2, Source2, not_after_atom, Token, C, Source, State ).

read_solidus( C0, Source0, atom(A), C, Source, after_atom ) :-
    read_symbol( C0, Source0, Chars, C, Source ),
    /*  Might read 0 chars.  */
    is_slash_char( First ),
    name( A, [First|Chars] ).


read_solidus( C0, Source0, _, _ ) :-
    is_end_of_file_char( C0 ),
    !,
    note_token_error( ['End-of-file inside comment'], Source0 ),
    fail.

read_solidus( C0, Source0, C, Source ) :-
    is_star_char( C0 ),
    read_nextch( Source0, Source1, C1 ),
    (
        is_slash_char(C1)
    ->
        is_space_char( C ),
        Source = Source1
    ;
        /*  Might be eof or * though.  */
        read_solidus( C1, Source1, C, Source )
    ).

read_solidus( _, Source0, C, Source ) :-
    read_nextch( Source0, Source1, C1 ),
    read_solidus( C1, Source1, C, Source ).


/*
Names.
------
*/


/*  read_name( C0+, Source0+, String-, C-, Source- ):
        Reads a sequence of letters, digits, and underscores, and
        returns them as String. C0 is included as the first character of
        String. The first character which cannot join this sequence is
        returned as C.
*/
read_name( C0, Source0, [C0|Chars], C, Source ) :-
    ( is_letter_or_digit_char( C0 ) ; is_underline_char( C0 ) ),
    !,
    read_nextch( Source0, Source1, C1 ),
    read_name( C1, Source1, Chars, C, Source ).

read_name( C0, Source0, [], C0, Source0 ).


/*
Symbol atoms.
-------------
*/


/*  read_symbol( C+, Source0+, String-, C-, Source- ):
        Reads the other kind of atom which needs no quoting: one which
        is a string of "symbol" characters. Note that it may accept 0
        characters, as is possible when called after reading a full stop.
*/
read_symbol( C0, Source0, [C0|Chars], C, Source ) :-
    member( C0, "#$&*+-./:<=>?@\\^`~" ),
    !,
    read_nextch( Source0, Source1, C1 ),
    read_symbol( C1, Source1, Chars, C, Source ).

read_symbol( C0, Source0, [], C0, Source0 ).


/*
Misc. skipping.
---------------
*/


skip_to_nl_or_eof( C0, Source0, C0, Source0 ) :-
    ( is_newline_char( C0 ) ; is_end_of_file_char( C0 ) ),
    !.

skip_to_nl_or_eof( C0, Source0, C, Source ) :-
    read_nextch( Source0, Source1, C1 ),
    skip_to_nl_or_eof( C1, Source1, C, Source ).


/*
Reading numbers.
----------------
*/


/*  read_number( C0+, Source0+, Token-, C-, Source- ) :-
        Reads a number which starts with C0 from Source0, returning
        it in Token. The character after it is returned in C, and the
        corresponding source in Source.

        Token will be integer(N) or real(N), depending on the type.

        Integers are easy to convert: we do it inside read_simple_number,
        by reading each digit and adding to the number being formed.
        Non-decimal numbers are treated similarly inside read_base.
*/


/*  read_number( C0+, Source0+, Token-, C-, Source-, State ):
*/
read_number( C0, Source0, Token, C, Source, State ) :-
    read_simple_number( C0, Source0, 0, N1, C1, Source1 ),
    read_after_number( C1, Source1, N1, N, C, Source, State ),
    (
        integer(N)
    ->
        Token = integer(N)
    ;
        Token = real(N)
    ).


/*  read_simple_number( C0+, Source0+, N0+, N-, C-, Source- ):
        Read a decimal integer.
*/
read_simple_number( C0, Source0, N0, N, C, Source ) :-
    is_digit_char( C0, Ch0Val ),
    !,
    N1 is N0*10 + Ch0Val,
    read_nextch( Source0, Source1, C1 ),
    read_simple_number( C1, Source1, N1, N, C, Source ).

read_simple_number( C0, Source0, N0, N0, C0, Source0 ).


read_after_number( C0, Source0, N0, N, C, Source, not_after_atom ) :-
    is_single_quote_char( C0 ),
    !,
    (
        N0 >= 2, N0 =< 36
    ->
        read_based( N0, Source0, 0, N, C, Source )
    ;
        N0 = 0
    ->
        read_nextch( Source0, Source1, C1 ),
        read_char( C1, Source1, -1, N, C, Source )
    ;
        note_token_error(['Illegal number base before single quote'],Source0),
        fail
    ).

read_after_number( C0, Source0, N0, N, C, Source, State ) :-
    is_dot_char( C0 ),
    !,
    read_nextch( Source0, Source1, C1 ),
    (
        is_digit_char( C1 )
    ->
        number_chars( N0, Digits ),
        read_float( C1, Source1, Digits, N, C, Source, State )
    ;
        N = N0,
        putback( C1, Source1, Source ),
        C = C0,
        State = not_after_atom
    ).

read_after_number( C0, Source0, N0, N0, C0, Source0, not_after_atom ).


/*
Reading numbers to other bases.
-------------------------------
*/


/*  read_based( Base+, Source0+, N0+, N-, C-, Source- ):
        Read a number in Base into N.
*/
read_based( Base, Source0, N0, N, C, Source ) :-
    read_nextch( Source0, Source1, C1 ),
    (
        char_gives_digit( C1, Digit )
    ->
        (
            Digit < Base
        ->
            N1 is N0*Base + Digit,
            read_based( Base, Source1, N1, N, C, Source )
        ;
            note_token_error(['Illegal digit/letter in based number'],Source1),
            fail
        )
    ;
        is_underline_char( C1 )
    ->
        read_based( Base, Source1, N0, N, C, Source )
    ;
        N = N0,
        C = C1,
        Source = Source1
    ).


char_gives_digit( C, Digit ) :-
    is_digit_char( C, Digit ),
    !.

char_gives_digit( C, Digit ) :-
    ( is_uppercase_char( C, Ord ) ; is_lowercase_char( C, Ord ) ),
    !,
    Digit is Ord+9.


/*
Reading real numbers.
---------------------
*/


read_float( C0, Source0, BeforePoint, N, C, Source, read_tokens ) :-
    read_float( C0, Source0, AfterPoint, C, Source ),
    append( BeforePoint, ".", BeforePointAndPoint ),
    append( BeforePointAndPoint, AfterPoint, NumberChars ),
    number_chars( N, NumberChars ).


read_float( C0, Source0, [C0|Chars], C, Source ) :-
    read_nextch( Source0, Source1, C1 ),
    (
        is_digit_char( C1 )
    ->
        read_float( C1, Source1, Chars, C, Source )
    ;
        is_little_e_char( C1 )
    ->
        read_nextch( Source1, Source2, C2 ),
        read_signed_exponent( C2, Source2, Exponent, C, Source ),
        Chars = [C1|Exponent]
    ;
        Chars = [],
        C = C1,
        Source = Source1
    ).


read_signed_exponent( C0, Source0, Exponent, C, Source ) :-
    is_plus_char( C0 ),
    !,
    read_nextch( Source0, Source1, C1 ),
    read_exponent_number( C1, Source1, Exponent, C, Source ).

read_signed_exponent( C0, Source0, [C0|Exponent], C, Source ) :-
    is_minus_char( C0 ),
    !,
    read_nextch( Source0, Source1, C1 ),
    read_exponent_number( C1, Source1, Exponent, C, Source ).

read_signed_exponent( C0, Source0, Exponent, C, Source ) :-
    read_exponent_number( C0, Source0, Exponent, C, Source ).


read_exponent_number( C0, Source0, [C0|Rest], C, Source ) :-
    is_digit_char( C0 ),
    !,
    read_nextch( Source0, Source1, C1 ),
    read_rest_exponent( C1, Source1, Rest, C, Source ).

read_exponent_number( C0, Source0, Exponent, C, Source ) :-
    note_token_error(['Non-digit in exponent'],Source0),
    fail.


read_rest_exponent( C0, Source0, [C0|Rest], C, Source ) :-
    is_digit_char( C0 ),
    !,
    read_nextch( Source0, Source1, C1 ),
    read_rest_exponent( C1, Source1, Rest, C, Source ).

read_rest_exponent( C0, Source0, [], C0, Source0 ).


/*
Converting between digit-lists and numbers.
-------------------------------------------

This is the non-portable bit I warned about earlier. Given the existing
definition of read_number, you need number_chars to be able to convert
character lists to reals, and integers to character lists; for
completeness, I've made it do the others as well.
*/


number_chars( N, Digits ) :-
    nonvar(N),
    !,
    prolog_eval( number_to_chars(N), Digits ).

number_chars( N, Digits ) :-
    nonvar(Digits),
    !,
    prolog_eval( chars_to_number(Digits), N ).


/*
Full stops.
-----------
*/


/*  read_fullstop( C0+, Source0+, Token-, C-, Source-, State- ):
        C0 is the character following the fullstop. Token is set to the
        token whose first character is a fullstop, which is followed by
        C. Thus, this predicate handles floating-point numbers which
        start with a decimal point, and symbol atoms which start with a
        fullstop.

*/
read_fullstop( C0, Source0, number(N), C, Source, State ) :-
    is_digit_char( C0 ),
    !,
    read_float( C0, Source0, "0", N, C, Source, State ).

read_fullstop( C0, Source0, end_of_clause, C0, Source0, not_after_atom ) :-
    is_layout_char( C0 ),
    !.

read_fullstop( C0, Source0, end_of_clause, C0, Source0, not_after_atom ) :-
    is_end_of_file_char( C0 ),
    !.
    /*  As with % comments, I see nothing wrong with allowing an
        end-of-file immediately after a clause. How can you know whether
        your filing system will put a newline there or not?
        It also makes sense when reading tokens from lists.
        It's easy to change this to give an error if that is what you
        want.
    */

read_fullstop( C0, Source0, atom(A), C, Source, after_atom ) :-
    read_symbol( C0, Source0, Chars, C, Source ),
    is_dot_char( Dot ),
    name( A, [Dot|Chars] ).


/*
Reading characters from files and lists.
----------------------------------------

It's occasionally useful to be able to read several characters ahead and
then return them to the source for re-reading. This is simulated by
"putback" (an idea taken from BCPL). Putting characters back onto a list
is trivial. To put them back to a file, we just augment the source
(normally the atom "file") with a list of putback characters.
*/


/*  read_nextch( Source0+, Source-, C- ):
        C is the next character from Source0; Source represents the
        new source after getting C.
*/
read_nextch( file([],P), file([],[C|P]), C ) :-
    !,
    get0( C ).

read_nextch( file([C|T],P), file(T,[C|P]), C ) :-
    !.

read_nextch( string([C|Rest],P), string(Rest,[C|P]), C ) :-
    !.

read_nextch( string([],P), string(ended,P), C ) :-
    !,
    is_end_of_file_char( C ).


/*  putback( C0+, Source0+, Source- ):
        Source represents Source after having character C put back onto
        it.
*/
putback( C, string(L,[_|Pn]), string([C|L],Pn) ) :- !.

putback( C, file(L,[_|Pn]), file([C|L],Pn) ) :- !.


/*
token_contents.
---------------
*/


token_contents( ' (', '(' ) :- !.
token_contents( atom(A), A ) :- !.
token_contents( quoted_atom(A), A ) :- !.
token_contents( string(S), S ) :- !.
token_contents( integer(I), I ) :- !.
token_contents( real(R), R ) :- !.
token_contents( var(V), V ) :- !.
token_contents( X, X ).


/*
Errors.
-------
*/


/*  This is the one that redefines token_error.  */
define_token_error( P ) :-
    Goal =.. [ P, A, B, C ],
    retractall( token_error(_,_,_) ),
    asserta( token_error(A,B,C) :- Goal ).


/*  This is called as soon as an error is detected. After is the
    remaining, unparsed, input, and we save its length to indicate
    where the marker should go.

    The caller should ensure that, following this call, read_token
    fails.
*/
note_token_error( Message, Source ) :-
    (
        Source = file(_,Chars)
    ;
        Source = string(_,Chars)
    ),
    asserta( '$token_error'(Message,Chars) ).


/*  This is called before returning from read_tokens, if an
    error had been detected. It picks up the message and input
    and then calls token_error with them as arguments.
*/
token_error( TokensSoFar ) :-
    '$token_error'( Message, Chars ),
    retractall( '$token_error'(_,_) ),
    strip_end_of_file( Chars, Chars_ ),
    /*  The faulty character may have been end-of-file. Don't want to
        display this, so strip it.
    */
    reverse( Chars_, ActualInput ),
    token_error( Message, ActualInput, TokensSoFar ).


/*  The default token_error.  */
default_token_error( Message, Input, _ ) :-
    nl, write('**'),
    tokenise_display_list( Message ), nl,
    tokenise_display_chars( Input ), nl.


/*  Make sure it is the default.  */
:- define_token_error( default_token_error ).


/*  Simple list- and string- displayers, used by default_token_error.  */
tokenise_display_list( [] ) :- !.

tokenise_display_list( [H|T] ) :-
    write(' '),
    write(H),
    tokenise_display_list(T).


tokenise_display_chars( [] ) :- !.

tokenise_display_chars( [C|T] ) :-
    put(C),
    tokenise_display_chars(T).


strip_end_of_file( [C|T], T ) :-
    is_end_of_file_char(C),
    !.

strip_end_of_file( L, L ).


:- endmodule.
