;;; -*- LISP -*-

;.@chapter Parsing text

(in-package :huh)

;;.The central element of parsing in HUH is the transformation of a
;;.sentence into an object and a shorter sentence.  The object and the
;;.remaining text is called an @code{parse-result} and a single
;;.sentence can yield many parse results.  Transformation methods are
;;.either `primitive', `sequenced', or `composite'.  A primitive
;;.method is defined by a `parser macro' and directly yields a set of
;;.interpretations; a `sequenced' method applies a series of methods
;;.in sequence passing each method the interpretations returned by the
;;.method preceding it; and a composite method combines several
;;.methods to yield several results.  For instance, @code{noun-phrase}
;;.is a composite method which calls a variety of different methods
;;.for parsing noun phrases.  One of these methods, for instance
;;.HUMAN-FULL-NAME is a sequential method which calls a series of
;;.primitive methods to yield an interpretation of the starting
;;.fragment of a sentence as anoun phrase.  For instance,
;;.HUMAN-FULL-NAME first applies the primitive method @code{(satisfies
;;.first-names)} which returns an interpretation only if the first
;;.word of a sentence is in the collection @code{first-names}; if
;;.there is such an interpretation, it proceeds to apply the method
;;.@code{(satisfies symbolp)} to the rest of the sentence; it then
;;.combines the results of these two evaluations into an expression
;;.identifying (for ARLOtje) a human with certain first and last
;;.names.
;;.This parser might be implemented thus:
;;.@example
;;.(define-parser (noun-phrase human-full-name)
;;.    ((first-name (satisfies first-names))
;;.     (last-name  (satisfies symbolp)))
;;.  `(create-object
;;.    (member-of 'humans)
;;.    (first-name ',first-name)
;;.    (last-name ',last-name)))
;;.@end example


;;;;.Tracing the parser

;;.The function @code{trace-parser} turns tracing on and off for the
;;.parser; its optional argument specifies a stream to which trace
;;.output should go.  Calling @code{trace-parser} with no arguments
;;.complements tracing; when tracing is turned on with no specified
;;.stream, it simply goes to the standard output.

(defvar *PARSER-TRACE-STREAM* NIL
  "The stream to which the parser outputs trace information.")

(defmacro parser-trace (format-string &rest format-args)
  "Outputs a trace statement to the parser trace stream."
  `(when *PARSER-TRACE-STREAM*
    (format *PARSER-TRACE-STREAM* "~&\;\;\; ")
    (let ((*package* huh::*words*))
      (format *PARSER-TRACE-STREAM* ,format-string ,@format-args))
    (force-output  *PARSER-TRACE-STREAM*)))

(defun trace-parser (&optional (stream (and (not *PARSER-TRACE-STREAM*) *standard-output*)))
  "Turns on parser tracing."
  (if *parser-trace-stream*
      (cond ((eq stream *parser-trace-stream*))
	    (stream (format *parser-trace-stream*
			    "~&\; Switching parser trace from ~S to ~S"
			    stream *parser-trace-stream*)
		    (format stream
			    "~&\; Switching parser trace from ~S to ~S"
			    stream *parser-trace-stream*))
	    (T (format *parser-trace-stream*
		       "~&\; Switching off parser trace (from ~S)"
		       *parser-trace-stream*)))
    (format stream "~&\; Tracing parser on stream ~S" stream))
  (setq *PARSER-TRACE-STREAM* stream))


;;;;.The central loop

;;.The data structures returned by a parser or method is a
;;.@dfn{parse-result} and stores both an object (the result of the
;;.parser or method) and the remainder of the input left to parse.
;;.These are extracted by the functions @code{pr-object} and
;;.@code{pr-yet-to-parse} respectively.  A parse result can be
;;.constructed by the procedure @code{parse-result} which takes a
;;.result, a list of remaining words, and returns a corresponding
;;.description.

(defstruct (parse-result (:conc-name pr-))
  object
  yet-to-parse)

(defun parse-result (object sentence)
  (make-parse-result :object object :yet-to-parse sentence))

(defvar *default-parse-state* 'sentence
  "This is the state which the parser begins from (unless otherwise specified).")

(defvar *default-parse-depth* 2
  "This is the state which the parser begins from (unless otherwise specified).")

(defvar *parser-stack* '()
  "This is the stack of parse methods in progress.")

(defvar *parser-cache* (list '())
  "This is a dynamically maintained cache of parser results.")

(defvar *parser-macros* '()
  "Macros for use within parser definitions.")

;;.The central parser loop is the procedure @code{parser} which takes
;;.a list of words as input and a `starting state'; the state
;;.indicates the `kind' of thing to be extracted from the list of
;;.words.  This initial state defaults from the variable
;;.*default-parse-state* which is initially @code{HUH::SENTENCE}.

;;.Multiple parses are passed up from the parser as lists of parse
;;.results; at each level these are appended together to make one
;;.large list.

;;.The central loop applies each of the methods stored on the
;;.@code{parse-methods} property of its current state; these are
;;.functions which are applied to the input to yield a set of parse
;;.results; the call to @code{parser} appends all the results yielded
;;.by all the methods and returns these as multiple values.

(defun parser (input &optional (wrt *default-parse-state*))
  "Parses a sentence represented as a list of words and punctuation."
  (parser-trace "Trying to get a ~S from:~{ ~S~}" wrt input)
  (let ((results '()))
    (dolist (method (get wrt 'parse-methods))
      (unless (> (count method *parser-stack*) *default-parse-depth*)
	(let ((parses (let ((*parser-cache*
			     ;; This is neccessary so that only top
			     ;; level parse results are cached.
			     ;; Otherwise, a bad interaction with
			     ;; stack limits occurs and causes the
			     ;; system only to cache simpler parses.
			     (if (member method *parser-stack*)
				 (list '()) *parser-cache*))
			    (*parser-stack* (cons method *parser-stack*)))
			(funcall method input))))
	  (unless parses
	    (parser-trace "Method ~S failed to yield anything." method))
	  (when parses
	    (parser-trace "Method ~S yielded:" method)
	    (dolist (parse parses)
	      (parser-trace "     ~S leaving:~{ ~S~}"
			    (pr-object parse) (pr-yet-to-parse parse))))
	  (setq results (append parses results)))))
    (parser-trace "Succeded in getting ~D ~S objects from:~{ ~S~}"
		  (length results) wrt input)
    results))

;;.The variable @code{*parser-stack*} contains a list of all the
;;.methods currently being executed, and no method goes deeper than
;;.three invocations of a given method (this is to avoid certain
;;.infinite recursions).

;;.While @code{parser} executes the central method application loop,
;;.the procedure @code{parse} is usually used for parsing text; it
;;.maintains a `parser cache' which maps segments of sentences into
;;.series of results; this substantially improves the parser's
;;.performance.  The @code{parse} procedure also performs the initial
;;.coercion of strings into word lists; if called on a string, it
;;.transforms the string into a word list and calls itself on the
;;.corresponding word list.

;;..The parser cache is stored on the list @code{*parser-cache*} as a
;;..list of buckets listed by sublists of the input text (as a list of
;;..words) and then each bucket is indexed by methods and returned
;;..(possibly null) lists of results.

(defun parse (input &optional (wrt 'sentence))
  (if (stringp input)
      (parse (read-words-from-string input) wrt)
    (let* ((*parser-cache*
	    (if (sublistp input (car *parser-cache*))
		*parser-cache*
	      (list input)))
	   (fragment (assoc input (cdr *parser-cache*)))
	   (match (and fragment (assoc wrt fragment))))
      (if match (cdr match)
	(let ((parses  (parser input wrt)))
	  (unless (null parses)
	    ;; Create a new bucket for this input.
	    (unless fragment
	      (setq fragment (list input))
	      (push fragment (cdr *parser-cache*)))
	    ;; Add results.
	    (push (cons wrt parses) (cdr fragment)))
	  parses)))))

(defun sublistp (list of-list)
  (loop (cond ((eq list of-list) (return T))
	      ((null of-list) (return NIL))
	      (T (setq of-list (cdr of-list))))))


;;;;.Defining parsers

;;.Parser methods are defined by the @code{define-parser} form which
;;.specifies one mechanism for parsing results of a particular type
;;.(for instance, noun phrases, verb phrases).  The method consists of
;;.a set of `part clauses' all of which indicate other methods for
;;.extracting subobjects from input sentence and a `body' describing
;;.how to combine the parts thus extracted.  The parts clauses are a
;;.sort of binding context in which the body is evaluated.  For
;;.instance, a simple parser for NP/VP sentences might look like:
;;.@example
;;.(define-parser (sentence NP/VP)
;;.    ((noun noun-phrase)
;;.     (verb verb-phrase))
;;.  `(combine-np+vp noun verb))
;;.@end example
;;.The expressions like @code{(noun noun-phrase)} are part specs,
;;.where the left-hand side is a variable to be bound and the
;;.right-hand side indicates how its value is to be computed.
;;.Logically, the part specs are evaluated in order and each
;;.specification yields an object (to which the variable is bound) and
;;.a fragment of the initial sentence which the subsequent part specs
;;.operate upon.  The right hand sides of part specs are either
;;.symbols, which mean `parse an object of this type' or expressions
;;.from a small set of `parser macros' that perform fixed operations.

;;.While the description above assumed that each part spec yielded a
;;.single binding, the right hand sides actually specify any number of
;;.bindings and the body of the parser is executed separately for each
;;.different interpretation.

(defmacro define-parser ((structure method) parts &body body)
  "Defines a parse method for STRUCTURE called METHOD.
PARTS is a list of specs (var spec) where VAR names a variable and
SPEC is either a form or a `structure' for which parsers are defined.
The parse method sequentially binds the VARs to the resulting values
or the parsed structures, stopping (and returning NIL) if any (var spec)
pair fails to produce a result (returns NIL).  Each form is evaluated
in a context where SENTENCE is bound to the current point in the sentence."
  (let ((fcn-name (fsymbol "~S-~S" structure method)))
    `(progn (deffcn ,fcn-name (sentence)
	      (let ((parser-result '()))
		(macrolet (,@*parser-macros*)
		  ,(expand-parse-method-parts parts body)
		  parser-result)))
      (pushnew ',fcn-name (get ',structure 'parse-methods))
      ',fcn-name)))

(defun expand-parse-method-parts (bindings body)
  "Expands a DEFINE-PARSER body."
  (if (null bindings)
      `(push (parse-result (progn ,@body) sentence) parser-result)
    (let* ((top (first bindings))
	   (expression (if (symbolp (cadr top)) `(parse sentence ',(cadr top))
			 (cadr top))))
      `(dolist (choice ,expression)
	(let ((,(car top) (pr-object choice))
	      (sentence (pr-yet-to-parse choice)))
	  ,(expand-parse-method-parts (rest bindings) body))))))

;;..Parser macros are macros whose scope is only the execution of
;;..parser definitions and they return lists of parser results based
;;..on a bound variable @code{sentence} indicating the current point
;;..in the parse.

(defmacro define-parser-macro (name args &body body)
  "Defines a macro for use within parser definitions.
This is just like DEFMACRO but only applies within the body of
DEFINE-PARSER definitions."
  `(let ((pair (assoc ',name *parser-macros*)))
     (if pair (setf (cdr pair) '(,args ,@body))
       (pushnew '(,name ,args ,@body) *parser-macros*))))

;;.Parser definitions take the form
;;.@example
;;.(define-parser (sentence svp)
;;.   ((subject noun-phrase)
;;.    (verb    verb-phrase))
;;.  (list '(SENTENCE ,subject ,verb)))
;;.@end example
;;.which defines a method @code{svp} for parsing inputs as
;;.@code{sentence}s; this proceeds by first trying to extract a
;;.@code{noun-phrase} from its input and then extracting a verb
;;.phrase from whatever is left.


;;;;.Parser macros

;;.Most of writing new parsers involves the specification of right
;;.hand sides based on parser macros of one sort or another.

;;.The parser macro @code{satisfies} always returns either zero or one
;;.interpretations; it returns one if the immediate word in the
;;.sentence satisfies a particular test (including collections) and
;;.zero otherwise.  In the former case, it also removes the
;;.satisfactory word from the sentence being parsed. For instance, we
;;.could define a parser for full human names thus:
;;.@example
;;.(define-parser (noun-phrase full-human-name)
;;.    ((first-name (satisfies first-names))
;;.     (last-name  (satisfies symbolp)))
;;.  `(create-object
;;.    (member-of 'humans)
;;.    (first-name ',first-name)
;;.    (last-name ',last-name)))
;;.@end example
(define-parser-macro satisfies (prop)
  "A parser macro which returns the first word in the sentence if it has PROP and NIL otherwise."
  `(if (satisfies? (first sentence) ',prop)
    (list (parse-result (first sentence) (rest sentence)))
    '()))

;;.The parser macro @code{test} takes a form and evaluates it; if it
;;.returns true, the macro returns a single interpretation; otherwise,
;;.it returns no interpretations.
(define-parser-macro test (form)
  "A parser macro which returns the first word in the sentence if it has PROP and NIL otherwise."
  `(if (not ,form) '() (list (parse-result T sentence))))

;;.The parser macro @code{is-exactly-one-of} returns the immediate
;;.word as an intepretation if it is in the list of arguments to the
;;.macro.  For instance, the parser for parsing noun phrases with
;;.determiners looks like:
;;.@example
;;.(define-parser (noun-phrase construct-reference)
;;.  ((determiner (is-exactly-one-of the a an))
;;.   (adjectives adjectives)
;;.   (noun (interpretation class-interpretation)))
;;.  `(create-object (member-of ',noun) ,@@adjectives))
;;.@end example
;;.recognizing phrases that begin with `the,' `a,' or `an'.
(define-parser-macro is-exactly-one-of (&rest members)
  "A parser macro which returns true if the first word is one of MEMBERS."
  `(if (member (first sentence) '(,@members) :test 'same-word-p)
    (list (parse-result (first sentence) (rest sentence)))
    '()))

;;.The parser macro @code{exactly-matches} returns an interpretation
;;.@code{T} if a sequence of words in the input text exactly matches
;;.its arguments; so @code{(exactly-matches once upon a time)} would
;;.only have an interpretation if the input sentence's first four
;;.words were `once upon a time'.
(define-parser-macro exactly-matches (&rest words)
  "A parser macro which returns true if words occur in sequence at this point in the sentence."
  `(do ((input sentence (cdr input))
	(pattern '(,@words) (cdr pattern)))
       ((or (null pattern) (not (same-word-p (car pattern) (car input))))
	(if (null pattern) (list (parse-result 'T input)) '()))))

;;.The parser macro @code{interpetation} takes a single @code{prop}
;;.argument and gets this slot from the immediate word in the
;;.sentence; each of these values constitutes one intepretation of the slot.
;;.For instance, an `adjectival phrase' is defined as a series of
;;.words with adjective interpratations and would be coded thus:
;;.@example
;;.(define-parser (adjectival-phrase adjective)
;;.  ((adjective (interpretation adjective-interpretation))
;;.   (additional-adjectives adjectival-phrase))
;;.  (cons `(,(car adjective) ',(cadr adjective)) additional-adjectives))
;;.@end example
(define-parser-macro interpretation (prop)
  "A parser macro which PROP  of the first word in the sentence and NIL otherwise."
  (let* ((domain (get-value prop 'makes-sense-for)))
    (if (aj::many-valued-slotp prop)
	`(if (satisfies? (first sentence) ',domain)
	  (mapcar #'(lambda (x) (parse-result x (rest sentence))) (get-value (first sentence) ',prop))
	  '())
      `(if (and (satisfies? (first sentence) ',domain)
	    (not (failurep (get-value (first sentence) ',prop))))
	(list (parse-result (get-value (first sentence) ',prop) (rest sentence)))
	'()))))

;;.The parser macro @code{values-of} takes two arguments ---
;;.@code{thing} and @code{prop} --- and take as interpretations all
;;.the values of @code{prop} of @code{thing}.  They do not transform
;;.the current sentence at all but are `side effect' operators.
(define-parser-macro values-of (thing prop)
  "A parser macro which PROP  of the first word in the sentence and NIL otherwise."
  (let ((domain (get-value prop 'makes-sense-for)))
    `(if (and (satisfies? ,thing ',domain) (get-value ,thing ',prop))
      (if (aj::many-valued-slotp ',prop)
	  (mapcar #'(lambda (x) (parse-result x sentence)) (get-value ,thing ',prop))
	(list (parse-result (get-value ,thing ',prop) sentence)))
      '())))

;;.The parser macro @code{up-till} returns as a single result the
;;.sequence of words up to one of its arguments.  For instance, we
;;.could strip periods off of sentences by the following parser macro:
;;.@example
;;.(define-parser (text->sentences strip-periods)
;;.    ((sentence (up-till |.|))
;;.     (other-sentences text->sentences))
;;.  (cons sentence other-sentences))
;;.@end example
(define-parser-macro up-till (&rest things)
  "A parser macro which returns true if the first word is one of MEMBERS."
  `(let ((loc (position-if #'(lambda (x) (member x '(,@things)
						 :test 'same-word-p))
			   huh::sentence)))
     (if loc (list (parse-result (subseq sentence 0 loc) (subseq sentence (1+ loc))))
       '())))

;;.The parser macro @code{up-till-or-end} returns either the whole
;;.input text or the input text up to one of its arguments.
(define-parser-macro up-till-or-end (&rest things)
  "A parser macro which returns true if the first word is one of MEMBERS."
  `(let ((loc (position-if
	       #'(lambda (x) (member x '(,@things) :test 'same-word-p))
	       huh::sentence)))
     (if loc
	 (list (parse-result (subseq sentence 0 loc) (subseq sentence (1+ loc))))
       (list (parse-result sentence nil)))))

;;.The parser macro @code{the-end-of-the-sentence} only has an
;;.interpretation at the end of the sentence.
(define-parser-macro the-end-of-the-sentence (&rest punctuations)
  `(if (null sentence) (list (parse-result t nil))
    (if (member (car sentence) ',punctuations :test 'same-word-p)
	(list (parse-result t (cdr sentence)))
      '())))

;;.The parser macro @code{the-rest-of-the-sentence} has an
;;.interpretation of the rest of the sentence.
(define-parser-macro the-rest-of-the-sentence ()
  '(list (parse-result sentence nil)))

;;.The parser macro @code{parse-eval} evals a form and returns the
;;.result.
(define-parser-macro parse-eval (form)
  `(list (parse-result ,form sentence)))

;;.The parser macro noise word swallows the immediate word if it is
;;.one of the macro's arguments and otherwise just returns 'none and
;;.continues.
(define-parser-macro noise-word (&rest words)
  "Returns the first word if it is in words and 'NONE otherwise."
  `(if (member (first sentence) ',words :test 'same-word-p)
    (list (parse-result (first sentence) (rest sentence)))
    (list (parse-result 'none sentence))))


;;;;.Table of Parser macros

;;.This is a table of parser macros; in specifying parser macros, the
;;.right hand side of specifications should be either the name of a
;;.parser (like @code{SENTENCE}, @code{NOUN-PHRASE}, etc) or one of
;;.these expressions.  When the description of a parser macro refers
;;.to a number of interpretations, note that this means given that the
;;.specifications above it in a parser may already specify a number of
;;.interpretations and the number of interpretations of the spec is a
;;.multiplier on the interpretations above.

;;.@table @code
;;.@item (satisfies prop)
;;.Has an interpretation of the current word if it satisfies PROP, none otherwise.
;;.@item (test form)
;;.Has an interpretation @code{T} if @code{form} is true, none otherwise.
;;.@item (is-exactly-one-of &rest members)
;;.Has an interpretation of the current word if it is any of MEMBERS, none otherwise.
;;.@item (exactly-matches &rest words)
;;.Has an interpretation of WORDS if the current word and its followers exactly match words.
;;.(e.g. (exactly-matches once upon a time))
;;.@item (interpretation prop)
;;.Has interpretations corresponding to each of the values of PROP of the current word.
;;.(e.g. (interpretation case-frames) has interpretations for each case frame assigned to the current word
;;.on its @code{case-frame} slot.)
;;.@item (value-of thing prop) 
;;.Has interpretations correponding to each of the PROP values of THING (thing is evaluated, PROP is not).
;;.(e.g. (value-of protagonist brothers) has interpretations corresponding to each of protagonist's brothers).
;;.@item (up-till &rest things)
;;.Has interpretations for each ocurrence of one of THINGS in the rest of
;;.the sentence.  Each interpretation is the subsequence between the current word and the THING.
;;.@item (up-till-or-end &rest things)
;;.Like the above, but includes one interpretation for the whole rest of the sentence.
;;.@item (the-end-of-the-sentence &rest punctuations)
;;.Has an interpretation at the end of the sentence or when the current
;;.word is one of PUNCTUATIONS and there are no words following it.
;;.@item (the-rest-of-the-sentence)
;;.Has one interpretation: the rest of the sentence.
;;.@item (parse-eval form)
;;.Has one interpretation, the result of evalling FORM.
;;.@item (noise-word &rest words)
;;.Has one interpretation but will absorb any of WORDS for subsequent 
;;.specifications.
;;.@end table
