/*  READ_SENTENCE.PL  */


:- module read_sentence.


:- public
        read_sentence/1,
        read_sentence/2.


/*
SPECIFICATION
-------------

This module defines a predicate for reading a sentence as a list of
atoms. It will be useful when adapting the Richard Head exercise to
make an Eliza.


PUBLIC read_sentence( S- ):
---------------------------

Reads up to the next dot, exclamation mark or question mark (or
end-of-file), and discards the rest of the line. Returns the characters
in between as a list of words (atoms or integers), the final element
being the terminator.

Words are deemed to be separated by spaces or other layout. A word is
either
    A sequence of letters, hyphens or apostrophes;
    A sequence of digits, which will be converted into the corresponding
    integer;
    A string delimited by single quotes, which will be converted to
    a single atom, which includes the quotes;
    Any other single character.


PUBLIC read_sentence( Prompt+, S- ):
------------------------------------

As read_sentence/1, but changes the prompt to Prompt (an atom) during
reading.
*/


/*
IMPLEMENTATION
--------------

This file is derived from the Dec-10 tools file READ_SENT, headed as
follows
    File   : /usr/lib/prolog/read_sent
    Author : R.A.O'Keefe
    Updated: 11 November 1983
    Purpose: to provide a flexible input facility
    Modified for NIP and generalised: Ken Johnson, 24 April 1987

I have modified it to be compatible with my predicate libraries, and to
allow the prompt to be set.
*/


:- needs
    file(chars),
    get_prompt / 1,
    member / 2,
    set_prompt / 1.


/*  read_sentence( Prompt+, Words- ):
        Reads characters up to and including the next '!', '?' or '.'
        (or end-of-file), which may be several lines distant from the
        start, discards the remainder of that line, and turns the
        characters read into a list of tokens.

        If Prompt is bound it is installed with set_prompt/1 while
        reading, and the prompt previously reported by get_prompt/1 is
        put back afterwards. If Prompt is unbound the prompt is left
        untouched (read_sentence/1 relies on this).

        It can happen that the sentence is not well formed. In that case
        all the characters will still have been read and the prompt
        restored, but chars_to_words will fail and so read_sentence will
        fail. read_sentence will NOT try to read another sentence.
*/
read_sentence( Prompt, Words ) :-
    (
        nonvar( Prompt )
    ->
        get_prompt( Old ),
        set_prompt( Prompt )
    ;
        true
    ),
    read_until( "!?.", Chars ),
    /*  Skip to end of line. This must go through read_until/2, which
        actually reads: calling read_until/3 with an unbound "current
        character" succeeds immediately without consuming any input.
        End-of-file can only be the last element of Chars; if it is
        there, nothing is left to skip, so do not read past it.
    */
    is_end_of_file_char( EOF ),
    (
        member( EOF, Chars )
    ->
        true
    ;
        is_newline_char( NL ),
        read_until( [NL], _ )
    ),
    /*  Restore the prompt before tokenising, so that a failed parse
        cannot leave the temporary prompt installed.
    */
    (
        nonvar( Prompt )
    ->
        set_prompt( Old )
    ;
        true
    ),
    chars_to_words( Chars, Words ).


/*  read_sentence( Words- ):
        As read_sentence/2, but with the prompt argument left unbound,
        so the current prompt is not changed.
*/
read_sentence( Sentence ) :-
    read_sentence( _NoPrompt, Sentence ).


/*  read_until( Delimiters+, Answer- ):
        Consumes characters from the current input with get0/1 until
        one that occurs in the Delimiters list turns up. Answer is the
        list of characters consumed, the closing delimiter included.
        The end-of-file character always counts as a delimiter.

    read_until( Char+, Delimiters+, Rest- ):
        Char is the character that has just been read. If it is the
        end-of-file character or a member of Delimiters, Rest is [];
        otherwise reading carries on and Rest is whatever read_until/2
        goes on to collect.
*/
read_until( Stops, [Ch|Chs] ) :-
    get0( Ch ),
    read_until( Ch, Stops, Chs ).


read_until( Ch, Stops, Chs ) :-
    (
        Chs = [],
        is_end_of_file_char( Eof ),
        member( Ch, [Eof|Stops] )
    ->
        true
    ;
        read_until( Stops, Chs )
    ).


/*  chars_to_words( Chars+, Words- ):
        Turns a list of characters (as collected by read_until) into a
        list of tokens. Fails unless the whole of Chars can be consumed.

    chars_to_words( Words-, Chars+, Rest- ):
        Difference-list worker. Takes tokens off the front of Chars
        with chars_to_word/3 for as long as that succeeds, committing
        to each token as it is found; Rest is what is left over once
        no further token can be taken.
*/
chars_to_words( Chars, Tokens ) :-
    chars_to_words( Tokens, Chars, [] ).

chars_to_words( Tokens, Input, Rest ) :-
    (
        Tokens = [Token|More],
        chars_to_word( Token, Input, Mid )
    ->
        chars_to_words( More, Mid, Rest )
    ;
        Tokens = [],
        Rest = Input
    ).


/*  chars_to_word( Word-, Chars+, Rest- ):
        Takes one token off the front of Chars, leaving Rest. Leading
        layout characters are skipped. The token is, in order of
        preference:
            a string opened and closed by single quotes, returned as
            one atom built from the characters chars_to_string/4
            yields;
            a run of extended letter characters, passed through
            chars_to_lowercase/2 and made into an atom;
            a run of digits, returned as an integer;
            any other single character, returned via name/2.
        Fails if Chars holds nothing but layout.

        The quoted-string clause has to come before the word clause:
        the single quote is also an extended letter character, so with
        the word clause first the string clause could never be reached.
        It commits only once a closing quote has been found, so that a
        word which merely starts with an apostrophe, with no later
        quote to pair with, is still read as a word.

        NOTE(review): the specification at the top of this file says
        the atom for a quoted string "includes the quotes", whereas
        chars_to_string/4 returns the text between them. Confirm which
        is wanted.
*/
chars_to_word( Word, [Char|C], B ) :-
    is_layout_char( Char ),
    !,
    chars_to_word( Word, C, B ).

chars_to_word( Word, [Quote|C], B ) :-
    is_single_quote_char( Quote ),
    chars_to_string( Quote, String, C, B ),
    !,
    name( Word, String ).

chars_to_word( Word, [Char|C], B ) :-
    is_extended_letter_char( Char ),
    !,
    chars_to_atom( Chars, C, B ),
    chars_to_lowercase( [Char|Chars], Name ),
    name( Word, Name ).

chars_to_word( Word, [Char|C], B ) :-
    is_digit_char( Char, Value ),
    !,
    chars_to_integer( Value, Word, C, B ).

chars_to_word( Punct, [Char|B], B ) :-
    name( Punct, [Char] ).


/*  is_extended_letter_char( C+ ):
        Succeeds, once only, when C is a character that may appear
        inside a word: a letter, a single quote (doing duty as an
        apostrophe) or a minus sign (doing duty as a hyphen).
*/
is_extended_letter_char( Ch ) :-
    is_letter_char( Ch ),
    !.
is_extended_letter_char( Ch ) :-
    is_single_quote_char( Ch ),
    !.
is_extended_letter_char( Ch ) :-
    is_minus_char( Ch ),
    !.


/*  chars_to_atom( Tail-, Chars+, Rest- ):
        Collects the remaining characters of a word. Tail is the
        longest run of extended letter characters (letters, apostrophes
        and hyphens, in either case) at the front of Chars, and Rest is
        what follows it. Case conversion is left to the caller.

        If you want to parse French you will have to decide what to do
        about accents. One suggestion is to write them after the vowel
        and add a case that accepts a Vowel followed by an Accent when
        accentable_vowel(Vowel) and accent_for(Vowel, Accent) hold,
        with the obvious definitions of those two predicates. Note that
        the Ascii characters ' ` ^ are officially designated the
        "accent acute", "accent grave" and "circumflex". This file was
        originally written for an English parser, where the problem
        does not arise.
*/
chars_to_atom( Tail, Input, Rest ) :-
    (
        Tail = [Ch|More],
        Input = [Ch|Remaining],
        is_extended_letter_char( Ch )
    ->
        chars_to_atom( More, Remaining, Rest )
    ;
        Tail = [],
        Rest = Input
    ).


/*  chars_to_integer( Init+, Final-, Chars+, Rest- ):
        Reads the remaining digits of an integer whose value so far is
        Init. Each further digit at the front of Chars multiplies the
        running value by ten and adds the digit's value; Final is the
        value reached when the digits run out and Rest is the input
        that follows them. NB: negative numbers and radices other than
        10 are not catered for, as this was written for PDP-11 Prolog.
*/
chars_to_integer( SoFar, Final, Input, Rest ) :-
    (
        Input = [Digit|Remaining],
        is_digit_char( Digit, Value )
    ->
        Next is SoFar*10+Value,
        chars_to_integer( Next, Final, Remaining, Rest )
    ;
        Final = SoFar,
        Rest = Input
    ).


/*  chars_to_string( Quote+, String-, Chars+, Rest- ):
         Reads the body of a string whose opening Quote character has
         already been consumed. String is the text up to, but not
         including, the closing Quote, and Rest is the input after that
         closing Quote. Two adjacent Quotes inside the string stand for
         a single Quote, as they do in Prolog itself.

         If Chars runs out before a closing Quote is found the
         predicate FAILS, and with it the parse of the string. Giving
         an error message and trying to recover might be nicer, but
         that is application dependent.
*/
chars_to_string( Quote, String, Input, Rest ) :-
    (
        /*  doubled quote: one literal Quote inside the string  */
        String = [Quote|More],
        Input = [Quote,Quote|Remaining]
    ->
        chars_to_string( Quote, More, Remaining, Rest )
    ;
        /*  lone quote: the string ends here  */
        String = [],
        Input = [Quote|Rest]
    ->
        true
    ;
        /*  any other character is copied across  */
        String = [Ch|More],
        Input = [Ch|Remaining],
        chars_to_string( Quote, More, Remaining, Rest )
    ).


/*
Trimming blanks.
----------------

Not used by read_sentence; I've left it in from the original source
in case you find it useful.                
*/


/*  trim_blanks( RawInput+, Cleaned- ):
        Cleaned is RawInput with leading and trailing layout characters
        removed and every internal group of layout characters replaced
        by a single space. Thus
            trim_blanks(<|TAB TAB a SP ^M ^E b ^Z|>, "a b")
        would be true. This clause deals with the leading blanks
        (layout or end-of-file characters); the first other character
        is copied across and starts a word, which is handed on to
        trim_blanks_rest_word/2.
*/
trim_blanks( Raw, Cleaned ) :-
    (
        Raw = [Ch|Chs],
        ( is_layout_char( Ch ) ; is_end_of_file_char( Ch ) )
    ->
        trim_blanks( Chs, Cleaned )
    ;
        Raw = [Ch|Chs],
        Cleaned = [Ch|Out]
    ->
        trim_blanks_rest_word( Chs, Out )
    ;
        Raw = [],
        Cleaned = []
    ).


/*  trim_blanks_rest_word( Chars+, Cleaned- ):
        Helper for trim_blanks/2, used while inside a word. Characters
        are copied to Cleaned until a blank is met, at which point
        trim_blanks_next_word/2 takes over to look for the start of the
        next word.

        A blank is a layout character or the end-of-file character, the
        same test that trim_blanks/2 and trim_blanks_next_word/2 apply.
        Without the end-of-file test here, an end-of-file character
        directly after a word would be copied into Cleaned rather than
        dropped.
*/
trim_blanks_rest_word( [Char|Chars], Cleaned ) :-
    ( is_layout_char( Char ) ; is_end_of_file_char( Char ) ),
    !,
    trim_blanks_next_word( Chars, Cleaned ).

trim_blanks_rest_word( [Char|Chars], [Char|Cleaned] ) :-
    !,
    trim_blanks_rest_word( Chars, Cleaned ).

trim_blanks_rest_word( [], [] ).


/*  trim_blanks_next_word( Chars+, Cleaned- ):
        Helper for trim_blanks/2, used after a word has ended. Blanks
        (layout or end-of-file characters) are skipped. If another
        character then turns up, a single space followed by that
        character is put into Cleaned and trim_blanks_rest_word/2
        copies the rest of the word. If the input runs out first,
        nothing is added, so trailing blanks vanish.
*/
trim_blanks_next_word( Raw, Cleaned ) :-
    (
        Raw = [Ch|Chs],
        ( is_layout_char( Ch ) ; is_end_of_file_char( Ch ) )
    ->
        trim_blanks_next_word( Chs, Cleaned )
    ;
        Raw = [Ch|Chs],
        Cleaned = [Sp,Ch|Out]
    ->
        is_space_char( Sp ),
        trim_blanks_rest_word( Chs, Out )
    ;
        Raw = [],
        Cleaned = []
    ).


:- endmodule.
