%         The Lexical Compiler
%
%   Here we define predicates to work with different lexicons, to define
% the lexical types and to process system commands.  

/*

Changes for abbreviations -- JC, Mon Mar 22 09:39:02 1993

*/

% To load in a lexicons and its associated type declarations.

gram(X) :- typ(X), lex(X), abs(X).

lex(X) :-
  abolish('entry',1),
  abolish('entry',2),
  abolish('entry',3),
  abolish('ideal_sign',2),
  abolish('unknown_word',2),
  assert((unknown_word(_,[]) :- fail)), % fail for unknown words by default
  name(X,Prefix),
  concat(Prefix,".lex",L),
  name(Lex,L),
  loadin(Lex).

typ(X) :-
  abolish('domain',2),
  abolish('rule',3),
  abolish('sign',3),
  name(X,Prefix),
  concat(Prefix,".typ",T),
  name(Types,T),
  loadin(Types).

/*

Notes on doing abbreviations -- JC Tue Mar 23 15:56:10 1993

Because lexical entries are stored as atoms, and because we want to
trap for cases where an abbreviation is also a lexical item, we have
to check not only that each abbreviation is not identical to any
lexical item, but also that it is not a white-space/end of string
delimited substring of some lexical item.  The brute force approach of
checking this is just too expensive if there are a large number of
lexical items.  So, we do a little bit of preprocessing on the
lexicon.  

For each entry, we tokenize its string form. If what we get back is a
single element list then we know that the entry doesn't contain any
word delimiters and we can forget about it.  Otherwise, we store the
tokenized list as the argument to entry_tokenized/1.  We have to do
this before we look at the abbreviations.  

This means that we can check abbreviations and expansions against the
lexicon by entry/3 (successful in case the entry is atomic), or
against the table entry_tokenized/1 after tokenizing the abbreviation
or expansion.  

We store the input form of abbreviation definitions (after checking
for appropriate string forms) under abbrev_def/2,
abbrev_def(Expansion, Def).  

Once we've loaded all abbreviations, we run through the list checking
for 
- duplicate abbreviations with same expansion
- duplicate abbreviations with different expansions

We store abbreviations in a difference list format for speed during
analysis.  This is

abbrev_table(Abb/AbbT, Exp/ExpT)

where Abb/AbbT represent the abbreviation and Exp/ExpT its expansion.

The big hole in all of this is that there is no guarantee that the
notion of tokenization used here is the same as that used by the
Prolog side parser or the LISP side parser.  A common definition would
be useful.  

*/

abs(X) :-
    name(X, L),
    concat(L, ".abs", FNameL),
    name(FName, FNameL),
    prolog_flag(fileerrors, Old, off),
    (see(FName) -> 
	seen, FileThere = true
      ; FileThere =false),
    prolog_flag(fileerrors, _, Old),
    FileThere = true, !,
    retractall(abbrev_def(_, _)),		% JC Fri Mar 19 14:02:24 1993
    retractall(abbrev_table(_, _)),
    tokenize_entries,
    process_abs(FName),
    make_abbreviation_table.
abs(_).

process_abs(FName) :-
    see(FName),
    repeat,
    read(Abb),
    check_abbrev(Abb, Processed),
    (Abb == end_of_file -> 
	true
      ; assertz(Processed), fail),
    !,
    seen.

:- dynamic abbrev_def/2, abbrev_table/2, entry_tokenized/1.

check_abbrev(EOF, EOF) :-
    end_of_file == EOF, 
    !.
check_abbrev(Var, _) :-
    var(Var),
    !,
    write('Variable occurs in abbreviations file'), nl, fail.
check_abbrev(abbrev(Atom, ListofAs), abbrev_def(Atom, SafeAs)) :-
    (atomic(Atom) -> 
	(illegal_char_in_atom(Atom) -> 
	    write('illegal characters in abbreviation expansion'),
	    write(Atom), nl, fail
	  ; true)
      ; write('Nonatomic abbreviation target '), write(Atom), nl, fail),

    check_abbrevs(ListofAs, SafeAs),
    (check_expansion(Atom) -> 
	true
      ; write('Abbreviation expansion "'), 
        write(Atom), write('" has no corresponding lexical entry (ignored)'), nl, fail),
    SafeAs = [_|_].			% At least one abb survived.


check_abbrevs([], []) :- !.
check_abbrevs([Abb|As], Out) :-
    illegal_char_in_atom(Abb), !,
    write('illegal characters in abbreviation'), write(Abb), nl,
    write('(ignored)'),
    check_abbrevs(As, Out).
check_abbrevs([Abb|As], Out) :-
    (check_expansion(Abb), 
    write('Abbreviation "'), write(Abb), write('" also occurs as (a substring of) a lexical entry'),
    write(' (ignored)'), nl
  ; check_for_abbrev_prefix(Abb)), !,
    check_abbrevs(As, Out).
check_abbrevs([Abb|As], [Abb|Out]) :-
    atomic(Abb),
    check_abbrevs(As, Out).

check_for_abbrev_prefix(Abb) :- 
    abbrev_def(_, Abbs),
    member(A1, Abbs),
    string_prefix_blank(Abb, A1),
    !,
    write('Abbreviations "'), write(Abb), write('" and "'),
    write(A1), write('" clash.  '),
    nl,
    write('"'), write(Abb), write('" will be ignored.'),
    nl.

/*

string_prefix_blank(A1, A2)

A1 has A2 as a prefix followed by whitespace or vice versa

*/
string_prefix_blank(A1, A2) :-
    atomic(A1),
    atomic(A2),
    name(A1, A1L),
    name(A2, A2L),
    string_prefix_blank1(A1L, A2L).

string_prefix_blank1([], [Next|_]) :-
    " " = [Next].
string_prefix_blank1([Next|_], []) :-
    " " = [Next].
string_prefix_blank1([E|R1], [E|R2]) :-
    string_prefix_blank1(R1, R2).

/*

An abbreviation must expand either to a known word, or to a whitespace
delimited substring of a known word.

*/

check_expansion(Abb) :-
    current_predicate(entry, entry(_, _, _)), % To avoid errors from
    	                                        % calling undefined pred.
    entry(Abb, _, _), !.
check_expansion(Abb) :-
    tokenize2dl(Abb, AbbL/[]),
    match_expansion(AbbL).

match_expansion(List) :-
    entry_tokenized(L1),
    sublist(List, L1), !.

sublist(Sub, List) :-
    concat(_, Post, List),
    concat(Sub, _, Post).


illegal_char_in_atom(Atom) :-
    name(Atom, List),
    lisp_bad_chars(BCs),
    member(C, BCs),
    member(C, List), !.

lisp_bad_chars(BCs) :-
    name(';"\', BCs).


make_abbreviation_table :-
    retractall(abbrev_table(_, _)),
    fail.
make_abbreviation_table :-
    make_abbreviation_table1(Abb, Target),
    assertz(abbrev_table(Abb, Target)),    
    fail.
make_abbreviation_table.


/*

add a record to the database for each ``compound word''. 

*/

tokenize_entries :-
    retractall(entry_tokenized(_)),
    fail.
tokenize_entries :-
    entry(X, _, _),
    atomic(X),
    tokenize2dl(X, List/[]),
    \+ List = [X],	% got back what we started with
    \+ entry_tokenized(List),
    assertz(entry_tokenized(List)),
    fail.
tokenize_entries.




make_abbreviation_table1(AbbL/ATail, TargetL/Tail) :-
    abbrev_def(Target, Abbs),
    tokenize2dl(Target, TargetL/Tail),
    member(Abb, Abbs),
    tokenize2dl(Abb, AbbL/ATail),
    \+ \+ (check_duplicates(Target, Abb, AbbL/ATail, TargetL/Tail)).

check_duplicates(Target, Abb, AbbL/[], TargetL/[]) :-
    abbrev_table(AbbL/[], Target1L/[]),
    Target1L = TargetL,
    !,
    write('Duplicate abbreviation "'), 
    write(Abb), write('" for "'), 
    write(Target), write('" (ignored)'), nl, fail.
check_duplicates(Target, Abb, AbbL/[], _) :-
    abbrev_table(AbbL/[], _),
    !,
    write('Error: abbreviation "'),
    write(Abb),
    write('" has multiple expansions.'), nl,
    write('The expansion "'), write(Target),
    write('" (ignored).'), nl,
    fail.
check_duplicates(_, _, _, _).

    


tokenize2dl(Atom, H/T) :-
    name(Atom, [Char|L]),
    tokenize2dl([Char], L, H, T).

tokenize2dl([], [], T, T) :- !.
tokenize2dl(Cs, [], [H|T], T) :-
    !,
    reverse(Cs, CsR),
    name(H, CsR).
tokenize2dl(Cs, [Blank|L], [H|Rest], T) :-
    " " = [Blank], !,
    reverse(Cs, CsR),
    name(H, CsR), 
    tokenize2dl([], L, Rest, T).
tokenize2dl([X], L, [H|Rest], T) :-
    \+ lowercase(X, _),
    \+ member(X, "0123456789/-'"),
    name(H, X),
    tokenize2dl(L, Rest, T).
tokenize2dl(Cs, [C|L], Rest, T) :-
    tokenize2dl([C|Cs], L, Rest, T).


is_list_of_atoms([]).
is_list_of_atoms([H|T]) :-
    atomic(H),
    is_list_of_atoms(T).

/*

Tue Mar 23 11:45:31 1993 JC

filter_abbreviations(Sentence, UnAbbd)

UnAbbd is just like Sentence except that defined abbreviations in
Sentence are replaced by their expansions.

*/

filter_abbreviations([], []) :- 
    !.
filter_abbreviations(In, Exp) :-
    abbrev_table(In/Tail, Exp/ETail),
    !,
    (flag(noisy_abb_expansion) -> 
        concat(Head, Tail, In),
	write('Replacing abbreviation '), 
	write(Head), write(' with '),
	\+ (\+ (ETail = [], write(Exp))),
	nl
      ; true),
    filter_abbreviations(Tail, ETail).
filter_abbreviations([In|Ins], [In|Outs]) :-
    filter_abbreviations(Ins, Outs).



loadin(File) :-
  see(File) ->
    repeat, read(X), process_command(X), seen;
    error('Cannot open grammar file: ', File).
 
process_command(end_of_file) :- !.
process_command(C) :- command(C), !, fail.

%  Now, how do we interpret the different commands that may appear in 
%  type declarations and lexical entries.  For most commands, we
%  recursively "convert" them into a term representation and assert
%  the representation into the database for future access (usually
%  access through the "@" operator)

command( (Name has Structure) ) :-
  convert(Structure, Matrix, Constraints) ->
    feature_name(Name, FName),   
    assertz( sign(Name, [FName, Matrix], Constraints) );
    error('Has', Name).

% convert:  (name type definition, conditions) to
%           sign(name,definition, conditions)

command( (Name type Structure) ) :-
  convert(Structure, Matrix, Constraints) ->
    assertz( sign(Name, Matrix, Constraints) );
    error('Type', Name).

command( (Name rule Structure) ) :-
  convert(Structure, Matrix, Constraints) ->
    assertz( rule(Name, Matrix, Constraints) );
    error('Rule', Name).

command( (Name domain Structure) ) :-
  convert(Structure, List, []) ->
    assertz( domain(Name, List) );
    error('Domain', Name).

% entry - for improved efficiency, include Word as an argument

command( (+Name entry Structure) ) :-	% entries with empty phonology
  (convert(Structure, Matrix, Constraints),
  path(Matrix, phon, [])) ->
    assertz( entry(+Name, Matrix, Constraints) )
  ; error('Entry', Structure).

/*

There is a discrepancy between the distributed version
~ugg/HPSG-prolog/Distrib/lex.pl and /sx/HPSG/PL/lex.pl.  The following
occurs in the former file.

command( (Word entry Structure) ) :-
  convert(Structure, Matrix, Constraints) ->
    ( flag(lisp) -> concat_atom(['"',Word,'"'], SWord); SWord=[Word] ),  % Rogers Specific
    path(Matrix, phon, SWord),
    assertz( entry(Word, Matrix, Constraints) );
    error('Entry', Structure).


*/

command( (Word entry Structure) ) :-
  convert(Structure, Matrix, Constraints) ->
    ( flag(lisp) -> concat_atom(['"',Word,'"'], SWord); compound(Word,SWord) ),
    path(Matrix, phon, SWord),
    assertz( entry(Word, Matrix, Constraints) );
    error('Entry', Structure).

command( (entry Structure) ) :-
  convert(Structure, Matrix, Constraints) ->
    path(Matrix, phon, [Word]),
    assertz( entry(Word, Matrix, Constraints) );
    error('Entry', Structure).

command( (unknown_word Structure) ) :-
  convert(Structure, Matrix, Constraints) ->
    assertz( unknown_word(Matrix, Constraints) );
    error('Unknown Word', Structure).

command( (ideal Structure) ) :-
  convert(Structure, Matrix, Constraints) ->
    assertz(ideal_sign(Matrix, Constraints));
    error('Ideal', Structure).

command( (Test -> Command) ) :- Test -> command(Command).

command( (Command1 ; Command2) ) :- ( command(Command1); command(Command2)), !.

command( (Command1 , Command2) ) :- ( command(Command1), command(Command2)), !.

% If the user has defined an IDEAL sign, use it!
% Something can't be a start symbol more than once though.

% Thu Mar 11 13:58:59 1993 JC
% Before  _Constraints was [].

complete(Sign) :- clause(ideal_sign(_,_),_), !, ideal_sign(Sign,_Constraints).
complete(_).
 
% Convert an atom into a list of words.

compound(Atom,List) :-
  name(Atom,CharList),
  compound_aux(CharList,List).

compound_aux([], []).

compound_aux(CharList, [Word|OtherWords]) :-
  split(32, CharList, WordL, RestList), !,
  name(Word, WordL),
  compound_aux(RestList, OtherWords).
  
/*

Mon Mar 29 15:09:48 1993 JC Squashed singleton var in first clause

*/
split(_X, [], [], []).

split(X, [X|Rest], [], Rest).

split(X, [D|List], [D|Result], Rest) :-
   split(X, List, Result, Rest).

% In the case where a suffix is used to distinguish similar structures,
% the actual feature name does not contain the suffix.

feature_name(Name-_, Name) :- !.
feature_name(Name, Name).

convert(Var, Var, []) :- var(Var), !.

convert(A, A, []) :- atomic(A), !.

/*

Mon Mar 29 14:17:21 1993 JC I really doubt that the following clause
should be in here, as it overrides or will get overridden by any user
definition of var/1.  The clause doesn't appear in the distributed
version.

*/

convert(var(N), Result, []) :-  flag(lisp) -> Result = var(N) ; Result = N.

convert(!Feature, [Feature, _], []) :- !. 

/*

Thu Mar 11 14:00:52 1993 JC

The following clause used to read

convert(@Feature, Matrix, Cs) :- !,
  ( convert(Feature, F, []), sign(F,Matrix,Cs) -> true;
      nl, write('Expansion failure for @'), write(Feature), nl ).

i.e. evaluating what convert/2 returns wrt the database.  This
behaviour is *wrong* in the case of variables.

Rewrote adding new routine convert_parameterized_template/3.  

*/

convert(@Template, Matrix, Cs) :- !,
    ( convert_parameterized_template(Template, Matrix, Cs) -> true;
      nl, write('Expansion failure for @'), write(Template), nl ).


convert(Set set Elements, List, []) :- !,
  ( set2list(Set, Elements, List) -> true;
      nl, write('Expansion failure for set of '), write(Set), nl ).

%convert(L\T, NewL\T, Cs) :- !, convertlist(L, NewL, Cs).

/*

Mon Mar 29 14:20:49 1993 JC

There is a minor discrepancy between the distributed version and this
in the following clause.  The line 

    concat(Cs2, Cs1, Cs)

reads

    concat(Cs1, Cs2, Cs)

in the distributed version.  I.e. the constraints appear reordered.
This was, I guess, a minor efficiency optimization by someone.

*/

convert((Expr with Path = Value),AVM,Cs) :- !,
  ( convert(Expr,AVM,Cs1), convert(Value,Matrix,Cs2), path(AVM,Path,Matrix) ->
    concat(Cs2, Cs1, Cs);
      nl, write('Expansion failure for '), write(Expr with Path = Value), nl ).

convert((Ftr1 & Ftr2),Ftr,Cs) :- !,
  ( convert(Ftr1,AVM1,Cs1), convert(Ftr2,AVM2,Cs2), unify(AVM1,AVM2,Ftr) ->
    concat(Cs1,Cs2,Cs);
      nl, write('Expansion failure for '), write(Ftr1 & Ftr2), nl ).

% function calls are implemented as constraints on free variables
% These are now all entered manually.  We could automatically do this
% if we wanted to.

convert(order(X), Y, [order(X,Y)]) :- !.
convert(select(X), Y, [select(X1,Y)]) :- !, convert(X,X1,[]).
convert(select1(X), Y, [select1(X1,Y)]) :- !, convert(X,X1,[]).
convert(append(X,Y), Z, [concat(X,Y,Z)]) :- !.
convert(combine(X,Y,Z), R, [combine(X,Y,Z,R)]) :- !.
convert(collect(X,Y,Z), R, [collect(X,Y,Z,R)]) :- !. % Rogers compatibility
convert(collect(X,Y,Z,W), R, [collect(X,Y,Z,W,R)]) :- !.
convert(nfp(X,Y,W,Z,V), U, [nfp(X,Y,W,Z,V,U)]) :- !.

convert(Struct, Matrix, Cs) :- 
  Struct =.. [F|Args], !, convertlist(Args,NewArgs,Cs), Matrix =.. [F|NewArgs].

convertlist(V,V,[]) :- var(V), !.
convertlist([],[],[]).
convertlist([T|Rest], [T1|Rest1], Cs) :- 
  convert(T, T1, Cs1), 
  convertlist(Rest, Rest1, Cs2),
  concat(Cs1, Cs2, Cs).

/*

Thu Mar 18 14:44:42 1993 JC

new routine convert_parameterized_template/3.  Note that convertlist/3
is not usable here, because that will produce the wrong results in
case of a parameterized template that unifies some of its head
variables.

*/

convert_parameterized_template(Name, _Matrix, _Cs) :-
    var(Name), !,
    fail.

convert_parameterized_template(Name, Matrix, Cs) :-
    atomic(Name), !,
    sign(Name, Matrix, Cs).
convert_parameterized_template(Name, Matrix, Cs) :-
    functor(Name, F, N),
    functor(DBName, F, N),
    Name =.. [_|Args],
    DBName =.. [_|DBArgs],
    sign(DBName, Matrix, DBCs),
    convert_parameterized_template_list(Args, NewList, Cs1), !,
    (DBArgs = NewList -> true;
      error('parameterized template arguments', F/N)),
    concat(Cs1, DBCs, Cs).

convert_parameterized_template_list([], [], []) :-
    !.
convert_parameterized_template_list([A|As], [N|Ns], CsOut) :-
    convert(A, N, ACs),
    convert_parameterized_template_list(As, Ns, AsCs),
    concat(ACs, AsCs, CsOut).

unify(Matrix,Matrix,Matrix).   % Could be replaced by a more detailed predicate
                               % when you want to see what's gone wrong...

error(Type, Name) :-
  nl, write(Type), write(' definition failure for '), write(Name), nl.
