;;;==================================================================;
;;; -*- Syntax: Common-Lisp; Package: USER; Base: 10; Mode: LISP -*-
;;;==================================================================;
;;;
;;;               Center for Machine Translation
;;;                 Carnegie-Mellon University
;;;                                                                       
;;;------------------------------------------------------------------;
;;;                                                                       
;;; Copyright (c) 1995
;;; Carnegie Mellon University. All Rights Reserved.                      
;;;                                                                       
;;;------------------------------------------------------------------;
;;;
;;;          File: reader.lisp
;;;  File created: 6-May-92 by amf
;;;    Maintainer: Eric Nyberg [ehn@cs.cmu.edu]
;;;
;;; Last Modified:                   30-May-96, 12:08-Jun-95 at 18:52
;;;   
;;;------------------------------------------------------------------;
;;; File Description:
;;;
;;; Reader and tokenizer for LR parser.
;;;                    
;;; Based on old files parser-fns.lisp, misc.lisp, misc-patches.lisp,
;;; word-based.lisp


;;;==================================================================;
;;; Change Log
;;;
;;;  6-May-92 by amf: created
;;; 29-May-92 by amf: changed so that only last period is tokenized to
;;;                   *period*
;;; 20-Nov-95 by EHN: Updated version of tokenizer to better preserve
;;;                   information about original character string, for
;;;                   easier disambiguator implementation and maintenance
;;;                   (See bottom of file, changes localized there wherever
;;;                   possible).
;;;
;;;==================================================================;
;;; Package Statements

(in-package :user)


;;;==================================================================;

;;; Global Variables

;;; externs
(proclaim '(special *parse-value* *eng-irec*))

;;; Everything between these pairs is case protected.
(defvar *case-fences*)
(setq *case-fences*
      '((*DQ* . *DQ*)
	(&LSQUO@ . &RSQUO@)
	(&LDQUO@ . &RDQUO@)
	;;({QUOTE} . {/QUOTE})
	({PAGENO} . {/PAGENO})
	({CHEMFORM} . {/CHEMFORM})
	({LABEL} . {/LABEL})		
	({CALLOUT} . {/CALLOUT})
	({REVNO} . {/REVNO})
	({FORMNO} . {/FORMNO})
	({BEGIN} . {/BEGIN})
	({PFX} . {/PFX})
	({PARTNO} . {/PARTNO})
	({SALESMDL} . {/SALESMDL})
	({PROPNAME} . {/PROPNAME})
	({NONTRANS} . {/NONTRANS})
	({WIRECORP} . {/WIRECORP})
	({WIREOTHER} . {/WIREOTHER})))

(defvar *analyzer-tag-phrases*)
(setq *analyzer-tag-phrases* T)


;;;==================================================================;

;;; Parser Reader Character definition functions and storage

(defstruct (parser-reader-char
	    (:conc-name prchar-))
  "A parser reader rule for a character.
OUT:      list of characters to translate the character to
CONTEXTS: list of PARSER-READER-CONTEXTs to test before applying this rule"
  (out nil :type list)
  (contexts nil :type list))

(defstruct (parser-reader-context
	    (:conc-name prcontext-))
  "The left and right context lists for a character."
  (left nil :type list)
  (right nil :type list))


(defvar *parser-reader-char*
  (make-array 256 :element-type 'list)
  "Array of lists of PARSER-READER-CHAR rules")

(defun clear-parser-reader-char ()
  "Clears the array of PARSER-READER-CHAR rules."
  (setf *parser-reader-char*
	(fill *parser-reader-char* nil)))

(defmacro parser-reader-char (char)
  "Returns the PARSER-READER-CHAR rules for CHAR."
  `(aref *parser-reader-char* (char-int ,char)))


(defun define-parser-reader-context (context)
  "Defines the left and right contexts for a character.  CONTEXT is a list of
elements with '_ marking the character position."
  ;; Validate the CONTEXT definition
  (if (every #'(lambda (elt)
		 (or (symbolp elt) (characterp elt)))
	     context)
      ;; It's kosher: parse the CONTEXT into left and right sides
      (let ((i (position '_ context)))
	(make-parser-reader-context
	 :left (if i (subseq context 0 i) context)
	 :right (and i (subseq context (1+ i)))))
    (warn "Bad Parser Reader Context ~S" context)))

(defun define-parser-reader-char (char out contexts)
  "Defines a PARSER-READER-CHAR rule for CHAR, translating to OUT when all of
the given CONTEXTS match."
  ;; Validate the definition
  (if (and (characterp char)
	   (listp out) (every #'characterp out)
	   (listp contexts))
      ;; It's kosher: build it and add to the structures for CHAR
      (setf (parser-reader-char char)
	    (nconc (parser-reader-char char)
		   (list (make-parser-reader-char
			  :out out
			  :contexts (mapcar #'define-parser-reader-context
					    contexts)))))
    (warn "Bad Parser Reader Char definition ~S ~S ~S" char out contexts)))

(defmacro defprc (char out &rest contexts)
  `(define-parser-reader-char ',char ',out ',contexts))


;;;==================================================================;

;;; Parser Reader Character matching

(defun match-context-element (string i element)
  "Matches context ELEMENT against position I in STRING."
  (let* ((n (length string))
	 (char (and (>= i 0)
		    (< i n)
		    (schar string i))))
    (cond ((eq element '^)		; Matches beginning of string
	   (< i 0))
	  ((eq element '$)		; Matches end of string
	   (>= i n))
	  ((not char)			; Fail if no char at this point
	   nil)
	  ((eq element 'N)		; Matches numeric char
	   (digit-char-p char))
	  ((characterp element)		; Exact character match
	   (char= element char))
	  (t nil))))

(defun match-parser-reader-context (string i prcontext)
  "Matches PRCONTEXT against the context of position I in STRING."
  (and (do ((j (- i (length (prcontext-left prcontext))) (1+ j))
	    (elements (prcontext-left prcontext) (rest elements)))
	   ((endp elements) t)
	 (unless (match-context-element string j (first elements))
	   (return nil)))
       (do ((j (1+ i) (1+ j))
	    (elements (prcontext-right prcontext) (rest elements)))
	   ((endp elements) t)
	 (unless (match-context-element string j (first elements))
	   (return nil)))))

(defun parser-reader (string i)
  "Returns the translation list of characters for the character at position I
in STRING."
  (let* ((char (schar string i))
	 (prchars (parser-reader-char char)))
    (if prchars
	;; Reader definition exists: find one for which all contexts match
	(let ((prchar
	       (find-if #'(lambda (prchar)
			    (every #'(lambda (prcontext)
				       (match-parser-reader-context string i prcontext))
				   (prchar-contexts prchar)))
			prchars)))
	  (and prchar (copy-list (prchar-out prchar))))
      ;; No reader definition: just return a list of CHAR
      (list char))))


;;;==================================================================;

;;; Parser Reader Character definitions

(clear-parser-reader-char)

(defprc #\!  (#\SPACE #\* #\B #\A #\N #\G #\* #\SPACE))
(defprc #\"  (#\SPACE #\* #\D #\Q #\* #\SPACE))
(defprc #\#  (#\SPACE #\* #\H #\A #\S #\H #\* #\SPACE))
(defprc #\$  (#\SPACE #\* #\D #\O #\L #\L #\A #\R #\* #\SPACE))
(defprc #\%  (#\SPACE #\* #\P #\E #\R #\C #\E #\N #\T #\* #\SPACE))
(defprc #\&  (#\SPACE #\&))
(defprc #\'  (#\SPACE #\* #\A #\P #\O #\S #\T #\R #\O #\P #\H #\E #\* #\SPACE))
(defprc #\(  (#\SPACE #\[ #\SPACE))    
(defprc #\)  (#\SPACE #\] #\SPACE))
(defprc #\+  (#\SPACE #\+ #\SPACE))
(defprc #\,  (#\,)
  (N _ N))
(defprc #\,  (#\SPACE #\* #\C #\O #\M #\M #\A #\* #\SPACE))
(defprc #\:  (#\SPACE #\* #\C #\O #\L #\O #\N #\* #\SPACE))
(defprc #\;  (#\@ #\SPACE))    
(defprc #\<  (#\SPACE #\{))
(defprc #\>  (#\} #\SPACE))    
(defprc #\[  (#\SPACE #\[ #\SPACE))
(defprc #\]  (#\SPACE #\] #\SPACE))
(defprc #\newline  (#\SPACE))
(defprc #\tab  (#\SPACE))

;;; (defprc #\-  (#\SPACE #\- #\SPACE))
;;; (defprc #\.  (#\SPACE #\* #\P #\E #\R #\I #\O #\D #\* #\SPACE))
;;; (defprc #\/  (#\SPACE #\* #\S #\L #\A #\S #\H #\*))
;;; (defprc #\=  (#\SPACE #\* #\H #\Y #\P #\H #\E #\N #\* #\SPACE))


;;;==================================================================;

;;; Functions

(defun preprocess-word-string (string &key (tokenize-final-period T))
  (when (string= "" string)
    (setq *parse-value* '(((CAT N)(ROOT BOGUS))))
    (return-from preprocess-word-string NIL))
  (let (first-bag second-bag result
		  (left-index 0))
    ;; Read in all characters, translating where necessary.
    (dotimes (i (length string))
      (setf first-bag (nconc first-bag (parser-reader string i))))
    ;; Turn final period into *period* token
    (if tokenize-final-period
	(setf first-bag (tokenize-final-period first-bag)))
    (setf first-bag (nconc first-bag (list #\Space)))
    ;; Filter out SGML tags 
    (setf first-bag (filter-sgml-tags first-bag))
    ;; Check if the bag is empty now; if so, then return with T
    (when (list-only-contains-spaces first-bag)
      (setq *parse-value* '(((CAT N)(ROOT BOGUS))))
      (return-from preprocess-word-string NIL))
    ;; Read through the list and create the strings.
    (dotimes (i (length first-bag))
      (cond ((char-equal (nth i first-bag) #\Space)
	     (cond ((< left-index i)
		    (setf second-bag
			  (nconc second-bag
				 (bag-string first-bag left-index i)))))
	     (setf left-index (1+ i)))))
    (setq result (append second-bag '($)))
    ;; Do capitalization
    (setq result (capitalize result))
    ;; Insert phrase tags where necessary
    (if *analyzer-tag-phrases*
	(setq result (insert-phrase-tags result)))
    (values result)))

;;;------------------------------------------------------------------;
;;; capitalize gets a list of symbols, the tokenized input with all
;;; original case distinctions preserved. It turns everything into
;;; upper case except symbols between "case fences" on *case-fences*.

(defun  capitalize (symbol-list &optional (fences *case-fences*)) 
  (let ((len (length symbol-list))
	closer
	closer-position
	result)
    (do* ((i 0 (1+ i)))

	 ((>= i len))

      (cond ((and (setq closer
			(cdr (first (member
				     (nth i symbol-list)
				     fences
				     :test #'eq
				     :key #'first))))
		  (setq closer-position
			(position closer symbol-list :test #'eq :start i)))
	     ;; copy protected area
	     (dotimes (j (1+ (- closer-position i)))
	       (push (nth (+ i j) symbol-list) result))
	     (incf i (- closer-position i)))
	    ((symbolp (nth i symbol-list))
	     (push (intern (string-upcase (symbol-name (nth i symbol-list)))) result))
	    (t
	     (push (nth i symbol-list) result))))
    (values (reverse result))))
          				      

(defun idiom-opener-p (thing)
  (let (len name)
    (and (symbolp thing)
	 (setq name (symbol-name thing))
	 (setq len (length name))
	 (> len 6)
	 (string= "{IDIOM}" (substring name (- len 7))))))

(defun nontrans-opener-p (thing)
  (let (len name)
    (and (symbolp thing)
	 (setq name (symbol-name thing))
	 (setq len (length name))
	 (> len 9)
	 (string= "{NONTRANS}" (substring name (- len 10))))))

(defun insert-phrase-tags (input-list)
  (let ((len (length input-list))
	result)
    (do* ((i 0 (1+ i))
	  (phrase-length
	   (find-longest-phrase (nthcdr i input-list))
	   (find-longest-phrase (nthcdr i input-list)))
	  idiom-closer-position
	  nontrans-closer-position)

	 ((>= i len))
      
      (cond
	;; pre-existing {idiom} tags from disambiguator
	((idiom-opener-p (nth i input-list))
	     
	 (setq idiom-closer-position
	       (+ i (position '{/idiom} (nthcdr i input-list))))
	 (dotimes (j (1+ (- idiom-closer-position i)))
	   (push (nth (+ i j) input-list) result))
	 (incf i (- idiom-closer-position i)))

	;; <nontrans> ... </nontrans> sequence
	((nontrans-opener-p (nth i input-list))
	     
	 (setq nontrans-closer-position
	       (+ i (position '{/nontrans} (nthcdr i input-list))))
	 (dotimes (j (1+ (- nontrans-closer-position i)))
	   (push (nth (+ i j) input-list) result))
	 (incf i (- nontrans-closer-position i)))

	;; start of idiom
	(phrase-length
	 (push '{IDIOM} result)
	 (dotimes (j phrase-length)
	   (push (nth (+ i j) input-list) result))
	 (push '{/IDIOM} result)
	 (incf i (1- phrase-length)))
	;; no idiom
	(t
	 (push (nth i input-list) result))))
    (values (reverse result))))

;; find-longest-phrase looks for the longest preferred phrase in *eng-irec*,
;; and returns its length

;; how phrec works:
;; initial state is 0.
;; (call-phrec-step *eng-irec* <state> <word>)
;; returns arc.
;; (phrec-next-state arc) is new state.
;; (phrec-key arc) returns key if this is an accepting state

(defun find-longest-phrase (symbol-list)
  (let ((result-list (find-longest-phrase-rec 1 0 symbol-list)))
    (values (if result-list
		(apply #'max result-list) ; then
 		NIL)))) ; else


;; given a state, word, and remaining symbols,
;; return length of longest phrase starting
;; from state, with next word word, and remaining
;; words chosen by de-inflecting list		       
		       
(defun find-longest-phrase-rec (level state symbol-list)
  (let* ((first-symbol (car symbol-list))
	 (first-word (if (symbolp first-symbol)
			 (symbol-name first-symbol)))
	 current-word-list arc-list result-list)
    (unless first-word
      (return-from find-longest-phrase-rec NIL))
    (setq current-word-list
	  (word-roots
	   (string-downcase
	    (symbol-name
	     (car symbol-list)))))

    ;; get arc for every word on word list
    (setq arc-list
	  (remove-if #'null
		     (mapcar #'(lambda (word)
				 (phrec-step *eng-irec* state word))
			     current-word-list)))
    ;; if any of these is a final state, put it on the result list
    (dolist (arc arc-list)
      (when (phrec-key arc)
	(push level result-list)
	(return)))
    ;; for every arc, continue
    (dolist (arc arc-list)
      (setq result-list
	    (append result-list
		    (find-longest-phrase-rec (1+ level)
					    (phrec-next-state arc)
					    (cdr symbol-list)))))
    ;; return biggest length, or NIL
    (values result-list)))

		       
;;; preprocess-root-string is called by the dmk-synlex-loader
;;; to adjust DMK root strings to mirror the parser input.
;;; keep this in synch with preprocess-word-string above.

(defun preprocess-root-string (string)
  (let (result len first-bag)
    ;; Read in all characters, translating where necessary.
    (dotimes (i (length string))
      (setf first-bag (nconc first-bag (parser-reader string i))))
    (setf first-bag (nconc first-bag (list #\Space)))
    ;; Filter out SGML tags
    (setf first-bag (filter-sgml-tags first-bag))
    ;; Delete multiple contiguous spaces
    (setq first-bag (delete-multiple-spaces first-bag))
    ;; Check if the bag is empty now; if so, then return with T
    (when (list-only-contains-spaces first-bag)
      (warn "[dmk-synlex-loader] preprocess-root-string: root string ~S now empty" string)
      (return-from preprocess-root-string ""))
    (setq len (length first-bag))
    (setq result (make-string len))
    (dotimes (i len)
      (setf (char result i) (nth i first-bag)))
    (setq result (string-trim '(#\space) result))
    (values result)))

(defun delete-multiple-spaces (list)
  (let (result space-seen)
    (dolist (char list)
      (cond ((char= char #\space)
	     (cond (space-seen) ; do nothing
		   (t (push char result)
		      (setq space-seen t))))
	    (t ; char is not seen
	     (push char result)
	     (setq space-seen nil))))
    (values (nreverse result))))

(defun list-only-contains-spaces (charlist)
  (dolist (char charlist)
    (unless (char= char #\space) ; we've found a non-space
      (return-from list-only-contains-spaces NIL)))
  (values t)) ; we've been through entire list, there are only spaces

(defun tokenize-final-period (charlist)
  (let* ((len (length charlist))
	 final-period-position)

    (do* ((i (1- len) (1- i)))
	 
	 ((= i 0)
	  (setq final-period-position i)) ; stop if at first char; return i

      (cond ((char= (nth i charlist) #\})
	     ;; skip over tag
	     (setq i (or (position #\{ charlist :from-end t :end i)
			 0)))
	    ;; otherwise, it's an input text char
	    ((char= (nth i charlist) #\])
	     ;; skip over closing paren
	     (setq i (1- i)))
	    ((char= (nth i charlist) #\")
	     ;; skip over closing quote
	     (setq i (1- i)))
	    ((char= (nth i charlist) #\@)
	     ;; skip over char ent
	     (setq i (or (position #\& charlist :from-end t :end i)
			 0)))

	    ;; otherwise, it's an input text char
	    ((not (char-equal (nth i charlist) #\space))
	     (setq final-period-position i)
	     (return))))

    (cond ((char-equal (nth final-period-position charlist) #\.) ; found final period
	   ;; chop off final period
	   (nconc (subseq charlist 0 final-period-position)
		  (list #\SPACE #\* #\P #\E #\R #\I #\O #\D #\*)
		  (subseq charlist  (1+ final-period-position))))
		  
	  (t
	   charlist))))


(defun bag-string (charbag left right)
  "Slightly slower, but handles numbers better."
  (let ((result (make-string (- right left)))
	(i 0))
    (loop
      (when (= left right)
	(return (list (make-token-symbol result))))
      (setf (char result i) (nth left charbag))
      (incf left)
      (incf i))))

;;;------------------------------------------------------------------;
;;; make-token-symbol needs to treat numbers right. 


(defun make-token-symbol (string)
  (intern string))

;;; Alex, thou didst not comment out the old defun!
;;; (defun make-token-symbol (string)
;;;   (cond ((string-is-number string)
;;;          (read-from-string string))
;;;         (t
;;;          (intern string))))


;;; ----------------------------------------------------------------------
;;; 20-Nov-95 by EHN -- Stuff added for rewrite ==> Analyzer 1.8.1
;;; ----------------------------------------------------------------------

;(defvar *analyzer-tag-phrases*) ; set in how-many-parses-quiet
;                                ; when non-NIL, don't insert IDIOM tags


(defvar *chrinp*)  ; Input string

(defvar *chrtrx*)  ; Array of pointers, same length as input string,
                   ; holds character translations from PARSER-READ

(defvar *partok*)  ; list of parser-ready tokens

(defvar *tokval*)  ; all data about tokens in *partok*


(defun load-tokenizer-input (s)
  (let ((input-length (length s)))
    (setq *chrinp* s)
    ;; Is this too expensive to do per string?
    (setq *chrtrx* (make-array input-length))
    (dotimes (i input-length input-length)
      (setf (aref *chrtrx* i)
	    ;; Used to strip out extra #\Space from the PARSER-READER
	    ;; return val, but screwed up SGML tag handling.
	    (or (parser-reader s i)
		(list (char s i)))))))

;; ----------------------------------------------------------------------
;; Debugging Stuff.

(defun show-chrtrx ()
  (dotimes (i (length *chrtrx*) nil)
    (format t "~%~3s: ~s ==> ~s"
	    i (char *chrinp* i) (aref *chrtrx* i))))

#| moved to test-sentence.lisp.
(defun tokenize (s)
  (let (value tokens temp)
    (load-tokenizer-input s)
    ;;(show-chrtrx)
    (setq value (find-tokens s))
    (setq temp (capitalize
		(remove-if #'null
			   (mapcar #'(lambda (x)
				       (second (assoc 'partok x)))
				   value))))
    (multiple-value-setq (tokens *tokval*)
      (if *analyzer-tag-phrases*
	  (adjust-for-idiom-tags temp value)
	(values temp value)))
    (if tokens
	(setq *partok*
	      (append tokens (list '$)))
      (setq *partok* nil))))|#

(defun compare-tokenizers (sents)
  (let ((ok? t)(count 0))
    (dolist (s sents ok?)
      (let ((old (preprocess-word-string s))
	    (new (tokenize s)) x y)
	(cond ((equal old new)
	       T)
	      (t
	       ;;(print s)
	       (warn "[~s] Old: ~s~%[~s] New: ~s" count old count new)
	       (loop
		(when (and (null old)(null new))
		  (return))
		 (if old (setq x (pop old))(setq x NIL))
		 (if new (setq y (pop new))(setq y NIL))
		 (format t "~%~15s ~15s" x y))
	       (setq ok? nil)
	       nil))
	(incf count)))))

(defun compare-tokenizers-file (file)
  (let ((ok? t)(count 0) s)
    (with-open-file (in file :direction :input)
      (loop
       (when (zerop (mod count 100))
	 (print count))
       (setq s (read in nil in))
       (when (eq s in) (return ok?))
       (let ((old (preprocess-word-string s))
	     (new (tokenize s)))
	 (cond ((equal old new)
		T)
	       (t
		(print s)
		(warn "[~s] Old: ~s~%[~s] New: ~s" count old count new)
		(setq ok? nil)
		nil))
	 (incf count))))))
	

;; ----------------------------------------------------------------------
;; Stuff from TOKENIZE.LISP.

(defvar *start-contexts*)

(setq *start-contexts*
      ;; Previous char . Current char contexts;
      ;; used to decide left edges of tokens.
      '(
	(NULL . NONWHITE-CHAR-P)
        (WHITESPACE-P . NONWHITE-CHAR-P)
	(:ANY . #\$)
	(#\$ . NONWHITE-CHAR-P)
	(:ANY . #\()
	(#\( . NONWHITE-CHAR-P)	
	(:ANY . #\:)
	(#\: . NONWHITE-CHAR-P)
	(:ANY . #\%)
	(#\% . NONWHITE-CHAR-P)
	(:ANY . #\!)
	(:ANY . #\+)
	(#\+ . NONWHITE-CHAR-P)
	(#\! . NONWHITE-CHAR-P)
	(:ANY . #\")
	(#\" . NONWHITE-CHAR-P)				
	(:ANY . #\))
	(#\' . NONWHITE-CHAR-P)
	(:ANY . #\')
	(#\) . NONWHITE-CHAR-P)	
	(:ANY . #\&)
	(:ANY . #\<)
	(#\> . NONWHITE-CHAR-P)
	(#\; . NONWHITE-CHAR-P)
	(NONDIGIT-P . #\,)
	(#\, . NONWHITE-CHAR-P)
	)
      )

(defvar *special-start-contexts*)
(setq  *special-start-contexts*
       ;; current, next
       '(
	 (#\, . NONDIGIT-P)))


(defvar *end-contexts*)

(setq *end-contexts*
        
      ;; Token start char, Token end char, exclusive/inclusive;
      ;; used to detect right edges of tokens. 

      '(;; Include the boundary character in the token
	(#\<  #\> :inclusive) 
	(#\&  #\; :inclusive)

	;; Don't include the boundary character in the token
	(#\,  NONDIGIT-P :exclusive)
	(#\( :ANY :exclusive)
	(#\) :ANY :exclusive)	
	(#\' :ANY :exclusive)
	(#\: :ANY :exclusive)
	(#\% :ANY :exclusive)
	(#\! :ANY :exclusive)
	(#\+ :ANY :exclusive)	
	(#\" :ANY :exclusive)
	(#\$ :ANY :exclusive)				
	(NON-NUMBER-OR-SPECIAL-CHAR-P #\, :exclusive)
	(NONSPECIAL-CHAR-P WHITESPACE-P :exclusive)
	(NONSPECIAL-CHAR-P SPECIAL-START-P :exclusive)
	)
      )

(defvar *whitespace-chars*)
(setq *whitespace-chars* '(#\space #\newline #\tab))

;; ----------------------------------------------------------------------
;; CHARACTER PREDICATES

(defun whitespace-p (char)
  (member char *whitespace-chars* :test #'equal))

(defun nonwhite-char-p (char)
  (not (or (whitespace-p char)
	   (null char))))

(defun nonspecial-char-p (char)
  (not (or (whitespace-p char)
	   (null char)
	   (member char '(#\< #\& #\( #\) #\' #\: #\% #\! #\+ #\") :test #'equal))))

(defun non-number-or-special-char-p (char)
  (not (or (whitespace-p char)
	   (digit-char-p char)
	   (null char)
	   (member char '(#\< #\& #\( #\) #\' #\: #\% #\! #\+ #\") :test #'equal))))

(defun special-start-p (char)
  (member char '(#\< #\& #\( #\) #\' #\: #\% #\! #\+ #\") :test #'equal))

(defun PI-start-p (string start)
  (and (stringp string)
       (> (length string) (+ start 5))
       (equal "<?CTE" (substring string start (+ start 4)))))

(defun nondigit-p (char)
  (and (characterp char)
       (not (digit-char-p char))))

;; ----------------------------------------------------------------------
;; FUNCTIONS
	
(defun start-context-match-p (previous current)
  (dolist (context *start-contexts* nil)
	  (when (and (context-char-match previous (first context))
		     (context-char-match current  (rest context)))
		(return t))))

(defun special-start-context-match-p (current next)
  (dolist (context *special-start-contexts* nil)
	  (when (and (context-char-match current (first context))
		     (context-char-match next    (rest context)))
		(return t))))

(defun context-char-match (char cchar)
  (cond ((eq :ANY cchar) t)
	((characterp cchar)
	 (equal char cchar))
	((or (symbol-function cchar)
	     (and (listp cchar)
		  (eq 'lambda (first cchar))))
	 (funcall cchar char))
	(t (warn "TOKENIZE: Unknown context char: ~s" cchar)
	   nil)))

(defun find-token-starts (s &rest known-starts) 
  (let (current previous result next (end (length s)))
    (dotimes (i end result) 
	     (setq current (char s i))
	     (if (< i 1)
		 (setq previous nil)
	       (setq previous (char s (1- i))))
	     (if (>= i (1- end))
		 (setq next nil)
	       (setq next (char s (1+ i))))
	     (when (or (member i known-starts)
		       (start-context-match-p previous current)
		       (special-start-context-match-p current next))
		   (setq result (nconc result (list i)))))))

(defun find-end-chars (start-char)
  (let (result)
    (dolist (end *end-contexts* result)
	    (when (context-char-match start-char (first end))
	      (push (rest end) result)))))


(defun trim-for-comma (boundary-list s)
  ;; ((0 2 3) (2 2 1) (4 6 3))
  ;;       ^^^^^ OVERLAP, trim first token when second token starts with ",".
  (let (result current trim start)
    (setq current (pop boundary-list))
    (dolist (next boundary-list (nconc result (list current)))
      ;;(format t "~%CURRENT: ~s NEXT: ~s" current next)
      (cond ((and (>= (second current)
		      (second next))
		  (char= #\, (char s (second next))))
	     ;;(print "TRIMMING!")
	     (setq trim (1- (second next)))
	     (setq start (first current))
	     (setq result (nconc result (list (list start trim (1+ (- trim start)))))))
	    (t (setq result (nconc result (list current)))))
      (setq current next))))
		

(defun find-token-boundaries (s token-starts period)
  (let (result ends endchars end len startchar)
    (dolist (start token-starts (trim-for-comma result s))
      (setq ends (find-end-chars (char s start)))
      (setq endchars
	    (remove-if #'null
		       (mapcar #'(lambda (x) 
				   (find-char-match s (1+ start) x))
			       ends)))
      (if endchars
	  (setq end (first (sort endchars #'<)))
	(setq end (1- (length s))))
      ;; If the end crosses or includes the final period,
      ;; adjust to the last char before it.
      (when (and (numberp period)
		 (< start period)
		 (>= end period))
	(setq end (1- period)))
      (setq startchar (char s start))
      (setq len (1+ (- end start)))
      (setq result 
	    (nconc result
		   (list (list start
			       end
			       (if (or (char= startchar #\<)
				       (char= startchar #\&))
				   1 ; length in "LE chars" for offsets
				 len)
			       len ; length in chars for highlighting

			       )))))))

(defun find-char-match (s offset cchar&type)
  (let ((end (length s))
	(cchar (first cchar&type))
	(type  (second cchar&type))
	(cur offset)
	cc)
    (loop
     (when (>= cur end)
	   (return nil))
     (setq cc (char s cur))
     (when (context-char-match cc cchar)
	   (case type
		 (:inclusive (return cur))
		 (:exclusive (return (1- cur)))
		 (t (error "Bad token end type: ~s" type))))
     (incf cur))))

(defun find-lechos (boundaries)
  (let (result current previous)
    (loop
     (when (null boundaries)
	   (return result))
     (setq previous current)
     (setq current (cons (calculate-lecho (first boundaries) previous)
			 (first boundaries)))
     (setq result (nconc result (list current)))
     (pop boundaries))))

(defun calculate-lecho (current previous)
  (if (null previous)
      (first current)
    (let* ((plecho (first previous))
	   (plechl (fourth previous))
	   (pend (third previous))
	   (start (first current))
	   gap)
      (setq gap (1- (- start pend)))
      (+ plecho plechl gap))))

(defun find-tokens (s)
  (let* ((period (final-period-offset s))
	 (starts (find-token-starts s period))
	 (boundaries (find-token-boundaries s starts period))
	 (tokendescs (find-lechos boundaries))
	 result (cur-end -1))
    ;; (format t "~%DESCS: ~s" tokendescs)
    ;; (pprint tokendescs)
    (when period
      (setf (aref *chrtrx* period)
	    '(#\* #\P #\E #\R #\I #\O #\D #\*)))
    (dolist (token tokendescs (nreverse result))
      (let* ((start (second token))
	     (end (third token))
	     (lecho (1+ (first token)))
	     (lechl (first (last token)))
	     (tokenchars (get-tokenchars start end))
	     (partokchars (remove #\Space
				  (filter-sgml-tags tokenchars)
				  )
			  )
	     (partok (when partokchars
		       (intern (coerce partokchars 'string)))))
	;; Don't build a token structure if the previous token ended after
	;; the current start point
	;; (format t "~%Start: ~s CurEnd: ~s" start cur-end)
	(cond ((<= start cur-end)
	       nil)
	      (t
	       (push `((rawtok ,(substring s start end))
		       (start ,start)
		       (end ,end)
		       (lecho ,lecho)
		       (lechl ,lechl)
		       (trx ,@tokenchars)
		       (partok ,partok))
		     result)
	       (setq cur-end end)))))))

(defun get-tokenchars (start end)
  (let (result)
    (loop
     (when (> start end)
       (return (nreverse result)))
     (dolist (char (aref *chrtrx* start))
       (push char result))
     (incf start))))

;; ----------------------------------------------------------------------
;; rewrite 'find-final-period' to work on a raw string.

(defun final-period-offset (string)
  "Returns an integer offset in the string for a final period, or NIL if there is none."
  (let* ((len (length string))
	 final-period-position)

    (do* ((i (1- len) (1- i)))
	 
	 ((<= i 0)
	  (setq final-period-position 0)) ; stop if at first char; return i

      ;;(format t "~%I: ~s" i)
      (cond ((char= (char string i) #\>)
	     ;; skip over tag
	     (setq i (or (position #\< string :from-end t :end i)
			 0)))
	    ;; otherwise, it's an input text char
	    ((char= (char string i) #\))
	     ;; skip over closing paren
	     ;(setq i (1- i))
	     nil
	     )
	    ((char= (char string i) #\")
	     ;; skip over closing quote
	     ;(setq i (1- i))
	     nil
	     )
	    ((char= (char string i) #\;)
	     ;; skip over char ent
	     (setq i (or (position #\& string :from-end t :end i)
			 0)))

	    ;; otherwise, it's an input text char
	    ((not (member (char string i) '(#\Space #\Tab) :test #'char=))
	     (setq final-period-position i)
	     (return))))

    ;;(format t "~%Final Period Position: ~s" final-period-position)
    (cond ((char-equal (char string final-period-position) #\.) ; found final period
	   final-period-position)		  
	  (t
	   nil))))

;; ----------------------------------------------------------------------
;; Disambiguator support function
;;
;; Given: indices into *tokval* for start/end of a phrase
;; Return: two values: lecho, lechl of the phrase

(defun range-lechinfo (start end)
  (let ((start-token (nth start *tokval*))
	(end-token (nth end *tokval*))
	start-lecho start-start end-start  end-lechl)
    ;;(format t "~%Start: ~s~%End: ~s" start-token end-token)
    (cond ((eq (second (assoc 'partok end-token))
	       '{/IDIOM})
	   (range-lechinfo
	    (1+ (second (assoc 'opener end-token)))
	    (1- end)))
	  ((not (and start-token end-token))
	   (values 0 0))
	  (t
	   (setq start-lecho (second (assoc 'lecho start-token)))
	   (setq start-start (second (assoc 'start start-token)))	   
	   (setq end-end (second (assoc 'end end-token)))
	   (values start-lecho (1+ (- end-end start-start)))))))


;; ----------------------------------------------------------------------
;;

#| ;; 30-May-96 by EHN -- moved to test-sentence.lisp
(defun adjust-for-idiom-tags (list tokval)
  (let* ((new (insert-phrase-tags list))
	 (new-save (copy-list new))
	 (vals tokval)
	 (i 0)
	 result last-open-tag)
    (loop
     (when (null new)
       (return (values new-save (reverse result))))
     ;;(when (null list)
     ;;  (error "Input shrank while adjusting for idiom tags?"))
     ;;(format t "~%Old: ~s~%New: ~s~%Result: ~s" list new result)
     (cond ((eq (first list) (first new))
	    (setq result (cons (first vals) result))
	    (pop vals)
	    (pop list)
	    (incf i)
	    (pop new))
	   ((eq (first new) '{IDIOM})
	    (setq last-open-tag i)
	    (setq result (cons `((PARTOK ,(first new)))
			       result))
	    (incf i)
	    (pop new))
	   ((eq  (first new) '{/IDIOM})
	    (setq result (cons `((PARTOK ,(first new))
				 (OPENER ,last-open-tag))
			       result))
	    (incf i)
	    (pop new))	   
	   (t (error "Can't continue while adjusting for idiom tags: ~s ~s"
		     new list))))))|#
	    
;; ----------------------------------------------------------------------

(defun build-cid-corpus! (infile outfile)
  (let (file form s)
    (with-open-file (out outfile :direction :output :if-exists :supersede)
      (with-open-file (in infile :direction :input)
	(loop
	 (setq file (read-line in nil in))
	 (when (eq file in) (return))
	 (print file)(force-output)
	 (with-open-file (cid file :direction :input)
           (loop
	     (setq form (read cid nil cid))
	     (when (eq form cid) (return))
	     (handler-case
	      (when (and (eq :sentence (second (assoc :type form)))
			 (setq s (second (assoc :text form))))
		(format out "~s~%" s))
	      (error () nil)))))))))
	     
;; ----------------------------------------------------------------------

(defun find-opening-tag (n)
  "N is a parser token number (1+ index into *tokval*).
   Return a parser token number, too."
  (let* ((curtok (nth (1- n) *tokval*))
	 (curraw (second (assoc 'rawtok curtok)))
	 (opentok (substring curraw 2 -2))
	 (openlen (length opentok))
	 short)
    ;;(format t "~%Looking for: ~s" opentok)
    (unless curtok
      (error "Can't find *tokval* info for token ~s!" n))
    (do*
	((i (- n 2) (1- i)))
	((< i 0)
	 (error "Couldn't find opening tag for token ~s!" n))
      (setq curraw (second (assoc 'rawtok (nth i *tokval*))))
      ;;(format t "~%Checking ~s (~s) against ~s (~s)"
      ;;  opentok openlen curraw (length curraw))
      (when (> (length curraw) openlen)
	(setq short (substring curraw 1 (1- (+ 1 openlen))))
	(when  (equal opentok short)
	  (return (1+ i)))))))

;; ----------------------------------------------------------------------

(defvar *singleton-tokens*)

(setq  *singleton-tokens*
       (list
	"stepref"
	"blankline"
	"figref"
	"ftnref"
	"ieref"
	"tblref"
	"teststepref"))

(defun singleton-token-match-p (token singleton)
  (let ((len (length singleton)))
    (and (> (length token) len)
	 (equal (substring token 1 len)
		singleton))))

;; 23-Jul-96 by EHN -- Obsolete (see disamb-bob.lisp)
;;
;; (defun disamb-highlight-range (toknum cat)
;;   (cond ((and (listp toknum)
;; 	      (eq '*multiple* (first toknum)))
;; 	 (let ((tokrange (sort (rest toknum) #'<)))
;; 	   (range-lechinfo (1- (first tokrange)) (1- (first (last tokrange))))))
;; 	((not (integerp toknum))
;; 	 (values 0 0))
;; 	(t (let* ((raw-token (second (assoc 'rawtok (nth (1- toknum) *tokval*))))
;; 		  (raw-token-length (length raw-token)))
;; 	     (cond ((and
;; 		     (> raw-token-length 2)
;; 		     (eq cat 'tag-element)
;; 		     ;; ignore PIs, singletons
;; 		     (not (find raw-token *singleton-tokens* :test #'singleton-token-match-p))
;; 		     (not (char= #\? (char raw-token 1)))
;; 		     ;; account for gnarly use of TAG-ELEMENT with *DQ* (!!)
;; 		     (char= #\< (char raw-token 0)))
;; 		    (let ((opentoknum (find-opening-tag toknum)))
;; 		      (range-lechinfo (1- opentoknum) (1- toknum))))
;; 		   (t (range-lechinfo (1- toknum) (1- toknum))))))))

;; ----------------------------------------------------------------------

;; NOTE! This should eventually be done by patching the place
;; where preprocess-root-string is defined:
;; ../code/cte-checker/release/checker-functions.lisp

;; 23-Jul-96 by EHN -- Obsolete (see test-sentence.lisp)
;;
;; (defun parse-string (sent)
;;   "Parses SENT string."
;;   (setq *sgml-tags-discarded* nil)  
;;   (let ((token-list (tokenize sent))) ; (preprocess-word-string sent)))
;;   (clrhash *cte-tag-counter*)		;this is for PI processing    
;;     (setq *parser-input-string* sent)
;;     (setq *parser-token-list* token-list)
;;     (cond ((null token-list)
;;            (setq *parser-failed* NIL)
;;            T)
;;           (T				; call the parser
;;            (parse-list token-list)))))

;; ----------------------------------------------------------------------

(defun test-analyzer! ()
  ;; This runs through all sentences in test/data/test-suite-from-cte.txt
  (time (test-cte))
  ;; This runs through the grammar example sentences
  (time (test-examples))
  (time (test-disambiguator 1 nil))
  ;; 07/24/97-igo: updating test-disambiguator with new parameters
  (time (run-acceptance-criteria))
)

;; ----------------------------------------------------------------------
;; From grammar-callouts.lisp

#|
(defun calculate-phrasal-score (input-fs)
  (let ((result 0)
	fs-list root)
    (unless (and input-fs
		 (listp input-fs)
		 (second input-fs))
      (return-from calculate-phrasal-score 0))
    (cond ((eq (first input-fs) '*OR*)
	   ;; 25-Sep-95 by EHN -- make this the same as multiple
	   (setq fs-list (list (second input-fs)))
	   ;;(setq fs-list (cdr input-fs))
	   )
	  ((eq (first input-fs) '*MULTIPLE*)
	   (setq fs-list (cdr input-fs)))
	  (t				; single fs
	   (setq fs-list (list input-fs))))
      
    (dolist (fs fs-list) 
      (when (and fs
		 (listp fs)
		 (listp (first fs)))
	(when (setq root (has-countable-root-p fs))
	  (incf result)
	  ;;(format t "~%ROOT: ~s" root)
	  )
	(dolist (slot fs)
	  (when (and slot
		     (listp slot)
		     (second slot)
		     (listp (second slot)))
	    (incf result (calculate-phrasal-score (second slot)))))))
    (values result)))
|#
(defun has-countable-root-p (fs)
  (let ((raw (second (assoc 'root fs)))
	(cat (second (assoc 'cat fs))))
    (cond ((or (stringp raw)
	       (and (symbolp raw)
		    (not (eq cat 'subclause))))
	   raw)
	  (t nil))))


;; ----------------------------------------------------------------------
;; <fast-n-filthy src="Igo">
(setf *analyzer-fs-complexity-limit* 300)
;; </fast-n-filthy>
;; 11/30/95-igo Hey!

(defvar *disambiguator-non-string-roots*)

(setf *disambiguator-non-string-roots*
      '(BLANKLINE BOOKTITLE BEGIN CALLOUT CC CHEM-FORM CIRCNO
		  CODE CODEDESC COLOR COPYR CPN CPN-ID CPNMOD DATE DAY
		  DIAGCODEDESC
		  DIAG-IEREF EFFECT-ITEM EMPHASIS END ENGLISH FIGREF FORMNO
		  FTNREF FTR-TITLE IDIOM IEREF IESUBTITLE IETITLE IE-TOPIC
		  INLINEITEM INREF INT-ITEM INT-ITEM2 JC LABEL LEVELREF MC
		  MDLDESC MDLGROUP MDLGROUPDESC MEDIA METRIC MODIFIER MONTH
		  NOMEN NONCATPUB PAGENO PARTNO PFX PHONELINE PHONENO PINGROUP
		  PINRANGE PINSNITEM PINSNLIST POSITION PROD-NAME PRODDESC
		  PROD-NAME PRODUSAGE PROPNAME PTYPE PUBDATE PUBTYPE PUBREF
		  PUBTITLE PUBTYPE QUALIFIER QUOTE RANKED-TESTREF REVNO SALESMDL
		  SALESMDLDESC SERIES SIZE SMCSCODE SMCSSECT SNGROUP SNRANGE
		  SPECVALUE STEPREF SUBJECTDESC SUBPRODDESC SUBSCRPT SUPSCRPT
		  TBLREF TC TERM TESTSTEPREF TITLE TMANTITLE TOPIC TRADEMARK
		  UNITSGRP VALTEXT WIRE YEAR
		  ;; 20-Oct-94 by EHN - we need this for internal tagging
		  ;; of fractions, even though it's not in :DTD-1.0
		  FRACTION
		  MIXED-FRACTION
		  ;; 12-Dec-94 by IGO - adding tag sequences - will most
		  ;; likely not be entirely handled in the highlighting
		  ;; code
		  CPN-ID-MODIFIER-SNRANGE-SEQ CPN-ID-SNGROUP-SEQ
		  CPN-ID-SNRANGE-SEQ
		  CPN-ID-TRADEMARK-SALESMDL-SEQ MDLGROUP-PROD-NAME-SEQ
		  MDLGROUP-SALESMDL-PINRANGE-PROD-NAME-SEQ
		  MDLGROUP-SALESMDL-PROD-NAME-SEQ MEDIA-IE-TOPIC-PAGENO-SEQ
		  MEDIA-PUBDATE-COPYR-SEQ MEDIA-PUBDATE-SEQ
		  PINGROUP-PROD-NAME-SEQ PROPNAME-NOMEN-SEQ
		  SALESMDL-PINRAGE-PROD-NAME-SEQ
		  SALESMDL-PINRANGE-PROD-NAME-SEQ SALESMDL-PINRANGE-SEQ
		  SALESMDL-PROD-NAME-SEQ SALESMDL-PROD-NAME-SNGROUP-SEQ
		  SALESMDL-SEQ SALESMDL-SNGROUP-PROD-NAME-SEQ
		  SALESMDL-SNGROUP-SEQ SALESMDL-SNRANGE-PROD-NAME-SEQ
		  SALESMDL-TRADEMARK-PINRANGE-PROD-NAME-SEQ
		  SALESMDL-TRADEMARK-PINRANGE-SEQ SALESMDL-TRADEMARK-SEQ
		  SERIES-PROD-NAME-SEQ TAG1-TAG2-SEQ TRADEMARK-NOMEN-SEQ
		  TRADEMARK-PINGROUP-SEQ TRADEMARK-PINRANGE-SEQ
		  TRADEMARK-PROD-NAME-SEQ TRADEMARK-SALESMDL-SEQ
		  TRADEMARK-SNGROUP-SEQ TRADEMARK-SNRANGE-SEQ))


