;;;==================================================================;
;;; -*- Syntax: Common-Lisp; Package: USER; Base: 10; Mode: LISP -*-
;;;==================================================================;
;;;
;;;               Center for Machine Translation
;;;                 Carnegie-Mellon University
;;;                                                                       
;;;------------------------------------------------------------------;
;;;                                                                       
;;; Copyright (c) 1995
;;; Carnegie Mellon University. All Rights Reserved.                      
;;;                                                                       
;;;------------------------------------------------------------------;
;;;
;;;          File: phrase-recognizer.lisp
;;;  File created: Friday July 17, 1992
;;;        Author: Nicholas Brownlow <ndb@cs.cmu.edu>
;;; Last Modified:  1-May-95 at 16:13
;;;
;;;------------------------------------------------------------------;
;;; Description
;;; 
;;; This code implements a word-grain phrase recognizer as a
;;; deterministic finite-state automaton (trie-structured).

(in-package :user)


;;;==================================================================;

;;; String Processing Functions

(defvar *word-array* (make-array 10 :element-type 'string
				   :adjustable t
				   :fill-pointer 0)
  "Array to store separate words from strings")

(defun string-to-words (source &optional (dest-array *word-array*))
  "Break SOURCE string into separate words, storing each word string
in DEST-ARRAY.  Return number of words, DEST-ARRAY."  
  (let (char
	(wstart nil)
	(stop (length source))
	(state :skip-space))
    (setf (fill-pointer dest-array) 0)
    (do ((scnt 0 (1+ scnt)))
	((eq scnt stop)
	 (progn
	   (if wstart
	       (vector-push-extend (subseq source wstart scnt) dest-array))
	   (values (fill-pointer dest-array) dest-array)))
      (setq char (schar source scnt))
      (loop 
       (case state
	 ;; If space, skip; otherwise continue processing char
	 (:skip-space
	  (cond ((eql char #\space)
		 (return))
		(t
		 (setq wstart scnt)
		 (setq state :check-space))))
	 ;; If space, prepare to skip rest of whitespace;
	 ;;  otherwise continue processing char
	 (:check-space
	  (cond ((eql char #\space)
		 (vector-push-extend (subseq source wstart scnt) dest-array)
		 (setq wstart nil)
		 (setq state :skip-space)))
	  (return)))))))


;;;==================================================================;

;;; Shrink-wrap for arc field access

(defun phrec-haltp (phrec arc)
  "Return T iff ARC leads to the halt state for PHREC."
  (stm-haltp phrec arc))

(defun phrec-next-state (arc)
  "Return next state for ARC."
  (stm-get-arc-next-state arc))


;;;==================================================================;

;;; Phrase word value functions
;;;
;;; The value at each arc in the trie (i.e. for each word in a phrase)
;;; is a CONS cell whose CAR is the key for the phrase ending at that
;;; word, if any, and whose CDR is a boolean value, T if this word is
;;; the head of a phrase.


(defstruct arcval
  (key nil :type (or null string))
  (headp nil :type symbol)
  (dmk-entries nil))


(defun phrec-key (arc)
  "Returns the key for the phrase ending with the word labelling ARC
or NIL if none."
  (and (stm-get-arc-value arc)
       (arcval-key (stm-get-arc-value arc))))

(defun phrec-set-key (arc key)
  "Encode ARC as at end of phrase."
  (if (stm-get-arc-value arc)
      (setf (arcval-key (stm-get-arc-value arc)) key)
      (stm-set-arc-value arc (make-arcval :key key))))

(defun phrec-headp (arc)
  "Returns T iff ARC is at a phrase head."
  (and (stm-get-arc-value arc)
       (arcval-headp (stm-get-arc-value arc))))

(defun phrec-set-head (arc &optional (value t))
  "Encode ARC as at a phrase head."
  (if (stm-get-arc-value arc)
      (setf (arcval-headp (stm-get-arc-value arc)) value)
      (stm-set-arc-value arc (make-arcval :headp value))))

(defun phrec-dmk (arc)
  "Returns the dmk entries for the phrase ending with the word labelling ARC
or NIL if none."
  (and (stm-get-arc-value arc)
       (arcval-dmk-entries (stm-get-arc-value arc))))

(defun phrec-set-dmk (arc dmks)
  "Encodes ARC as having the given dmk entries"
  (if (stm-get-arc-value arc)
      (setf (arcval-dmk-entries (stm-get-arc-value arc)) dmks)
      (stm-set-arc-value arc (make-arcval :dmk-entries dmks))))

(defun phrec-push-dmk (arc dmk)
  "Adds DMK to ARC"
  (if (stm-get-arc-value arc)
      (push dmk (arcval-dmk-entries (stm-get-arc-value arc)))
      (stm-set-arc-value arc (make-arcval :dmk-entries (list dmk)))))


;;;==================================================================;

;;; Phrase recognizer functions

(defconstant *phrec-words* 10000 "Expected number of distinct words in phrases")

(defun make-phrec (&key (size *phrec-words*))
  (make-static-trie-machine :size size :test #'equal))

(defun shrink-phrec (phrec)
  (shrink-stm phrec))

(defun phrec-static (phrec)
  (stm-static phrec))

(defun phrec-insert (phrec phrase key dmk &optional (head nil))
  "Insert PHRASE with KEY into PHREC.  PHRASE can be a sequence or a string.
Mark head word indexed by HEAD, which can be either an integer or one
of the keywords :NONE or :LAST."
  (let ((length nil)
	words
	(count 1)
	(state (stm-start phrec))
	arc)
    (cond ((stringp phrase)
	   (multiple-value-setq (length words) (string-to-words phrase)))
	  ;((sequencep phrase)
           ((listp phrase)
	   (setq words phrase
		 length (length words))))
    (unless length
      (return-from phrec-insert))
    (map nil #'(lambda (word)
		 (setq arc (stm-get-arc-new phrec state word (= count length)))
		 (when (= count length)
		   (phrec-set-key arc key)
		   (phrec-push-dmk arc dmk)
		   (when (eq head :LAST)
		     (phrec-set-head arc)))
		 (when (and (integerp head) (= count head))
		   (phrec-set-head arc))
		 (incf count)
		 (setq state (stm-get-arc-next-state arc)))
	 words))
  t)

(defun phrec-query (phrec phrase &optional (head nil))
  "Query PHREC for PHRASE with KEY.  If HEAD given, check head marking.
PHRASE can be a sequence or a string.  HEAD can be either an integer
or one of the keywords :NONE or :LAST."
  (let ((length nil)
	words
	(count 1)
	(state (stm-start phrec))
	arc
	(end-ok t)			; Are we recognizing the phrase ok?
	(head-ok t)			; Are we finding the head ok?
	(result nil))
    (cond ((stringp phrase)
	   (multiple-value-setq (length words) (string-to-words phrase)))
	  ;((sequencep phrase)
           ((listp phrase)
	   (setq words phrase
		 length (length words))))
    (unless length
      (return-from phrec-query))
    (map nil #'(lambda (word)
		 (when (= state (stm-halt phrec)) ; We shouldn't be in the halting state here
		   (setq end-ok nil))
		 (when end-ok
		   (cond ((setq arc (stm-get-arc phrec state word))
			  (when (= count length)
			    (setq result (stm-arc-value arc))
			    (setq end-ok (phrec-key arc))
			    (when (eq head :LAST)
			      (setq head-ok (phrec-headp arc))))
			  (when (and (integerp head) (= count head))
			    (setq head-ok (phrec-headp arc)))
			  (incf count)
			  (setq state (stm-get-arc-next-state arc)))
			 (t
			  (setq end-ok nil)))))
	 words)
    result))

(defun phrec-step (phrec state word)
  "Returns the arc for STATE and WORD in PHREC."
  (stm-get-arc phrec state word))


