;;;-----------------------------------------------------------------------;   
;;;  -*- Syntax: Common-Lisp; Package: USER; Base: 10; Mode: LISP -*-
;;;-----------------------------------------------------------------------;
;;;
;;; Last Modified                                                                                             3-Oct-95, 17:04
;;;
;;;         File: sgml-tags.lisp 
;;; File created: ;;;  8-Jun-92 by amf: 
;;;      Package: USER
;;;       Author: Alex Franz [amf@cs.cmu.edu]
;;;   
;;; File Description: functions to handle sgml tags for CTE checker
;;; 
;;;     See also: cte-checker.lisp, reader.lisp
;;; 
;;;-----------------------------------------------------------------------;
;;;
;;; Change Log:
;;; -----------
;;;
;;;  8-Jun-92 by amf: created
;;;
;;;-----------------------------------------------------------------------;


;;;-----------------------------------------------------------------------;
;;; Documentation:                                                        ;
;;;-----------------------------------------------------------------------;
;;;
;;; The reader turns <> into {}. (Nonterminal symbol char clash with
;;;    the parser.)
;;;
;;; We are given a list of chars. We find SGML tags in it
;;; and filter out unnecessary tags. We return a list of chars
;;; without the unnecessaary tags, and with double spaces removed.
;;;
;;; Good tags are truncated to just the tag name -- internal
;;; parameters are thrown out.
;;;
;;;  4-Sep-94 by EHN -- Note that the disambiguator must use the original
;;; input string to maintain synchonization with the LE. If the parser uses
;;; a different string (say, with tags removed), an error will result,
;;; since any TAG-POSITIONS it generates will be based on the shorter,
;;; changed string rather than the original input string.
;;;   Question: should we exit with no parse if there's an obsolete or
;;; otherwise bad tag in the sentence? In what cases is it profitable to
;;; strip them out and go on?
;;;   For now, add variable *sgml-tags-discarded*, and use it to keep the
;;; tags that are discarded; use in a post-processing step. Added warning
;;; and toggle *sgml-tags-discarded-warn*.

;;;-----------------------------------------------------------------------;
;;; Package Statements:                                                   ;
;;;-----------------------------------------------------------------------;

(in-package 'user)
;;; (use-package 'meister)

;;;-----------------------------------------------------------------------;
;;; Structures etc:                                                       ;
;;;-----------------------------------------------------------------------;

;;;-----------------------------------------------------------------------;
;;; Global Variables:                                                     ;
;;;-----------------------------------------------------------------------;

;;;  4-Sep-94 by EHN -- to hold the obsolete/garbage tags that were seen
;;; during a call to filter-sgml-tags.

(defvar *sgml-tags-discarded* nil)

;;;  4-Sep-94 by EHN -- When non-NIL, warn the user if some tags are
;;; discarded by FILTER-SGML-TAGS.

(defvar *sgml-tags-discarded-warn* nil)

;;; this is set in checker-functions.lisp
(defvar *analyzer-show-warnings*)

;;; tags that we want to keep in the input string

(defparameter *sgml-tags-strings*
  '(
    "?cte"
    "blankline"
    "/blankline"    
    "booktitle" "/booktitle"
    "begin" "/begin"
    "callout" "/callout"
    "cc" "/cc"
    "chem-form" "/chem-form"
    "circno"
    "/circno"
    "code"
    "/code"
    "codedesc" "/codedesc"
    "color"
    "/color"
    "copyr"
    "/copyr"            
    "cpn"
    "/cpn"
    "cpn-id"
    "/cpn-id" 
    "cpnmod"
    "/cpnmod"
    "date"
    "/date"        
    "day" "/day"
    "diagcodedesc" "/diagcodedesc"
    "diag-ieref" "/diag-ieref"
    #-:DTD-2.1"disa-ietitle"
    #-:DTD-2.1"/disa-ietitle"
    "effect-item" "/effect-item"
    "end" "/end"
    "english" "/english"
    "figref"
    "formno" "/formno"
    "ftnref"
    ;; idiom is used internally by AMT Analyzer
    "idiom" "/idiom"    
    "ieref"
    "iesubtitle"
    "/iesubtitle"    
    "ietitle" "/ietitle"
    "ie-topic"
    "/ie-topic"
    "inlineitem" "/inlineitem"
    "inref"    
    "/inref"
    "int-item"    
    "/int-item"        
    "int-item2"    
    "/int-item2"
    "jc" "/jc"
    #-:DTD-2.1"jobtype"
    #-:DTD-2.1"/jobtype"
    "label"
    "/label"    
    "levelref"
    "/levelref"
    "mc" "/mc"
    "mdldesc" "/mdldesc"
    "mdlgroup"
    "/mdlgroup"
    "mdlgroupdesc"
    "/mdlgroupdesc"    
    "media" "/media"
    "metric" "/metric"
    "mktver" "/mktver"
    "modifier" "/modifier"
    "month" "/month"
    "nomen" "/nomen"
    "noncatpub"
    "/noncatpub"        
    "pageno" "/pageno"
    "partno" "/partno"
    "pfx" "/pfx"
    "phoneline"
    "/phoneline"
    "phoneno"
    "/phoneno"    
    "pingroup"
    "/pingroup"
    "pinrange"
    "/pinrange"    
    "pinsnitem"
    "/pinsnitem"
    "pinsnlist"
    "/pinsnlist"
    "position" "/position"
    "prod-name"
    "/prod-name"
    "proddesc"
    "/proddesc"
    "prod-name"
    "/prod-name"
    "produsage"
    "/produsage"    
    "propname" "/propname"
    "ptype"
    "/ptype"    
    "pubdate" "/pubdate"
    "pubtype" "/pubtype"
    "pubref"
    "/pubref"        
    "pubtitle"
    "/pubtitle"
    "pubtype" "/pubtype"
    "qualifier" "/qualifier"
    "quote" "/quote"
    "ranked-testref" "/ranked-testref"
    "revno" "/revno"
    "salesmdl" "/salesmdl"
    "salesmdldesc"
    "/salesmdldesc"    
    "series"
    "/series"
    "size"
    "/size"    
    "smcscode" "/smcscode"
    "sngroup"  "/sngroup"
    "snrange" "/snrange"
    "specvalue" "/specvalue"
    "stepref" "/stepref"
    "subjectdesc"
    "/subjectdesc"
    "subproddesc"
    "/subproddesc"
    "subscrpt" "/subscrpt"
    "supscrpt" "/supscrpt"    
    "tblref"
    "tc" "/tc"
    "term"
    "/term"
    "teststepref"
    "title" "/title"
    "tmantitle"
    "/tmantitle"
    "trademark"
    "/trademark"
    "unitsgrp"
    "/unitsgrp"
    "valtext"
    "/valtext"
    "wire"
    "/wire"
    "year" "/year"
    ))

(defvar *sgml-tags-lists*)

;;; tags where we want to take some action

(defparameter *sgml-actions-strings*
  '(("?cte" . action-cte-tag)))

;;; '(("callpara" . begin-callpara-function)
;;;   ("/callpara" . end-callpara-function)
;;;   ("cause-error" . begin-cause-error-function)
;;;   ("/cause-error" . end-cause-error-function))

(defvar *sgml-actions-lists*)

;;; this is obsolete and not currently used.

(defparameter *sgml-tags-translations*
  '(
;;;    ("isdate" . "pubdate")
;;;    ("/isdate" . "/pubdate")   
;;;    ("revdate" . "pubdate")
;;;    ("/revdate" . "/pubdate")    
;;;    ("compdate" . "pubdate")
;;;    ("/compdate" . "/pubdate")
;;;    ("termdate" . "pubdate")
;;;    ("/termdate" . "/pubdate")
    ))
    
(defvar *sgml-tags-translations-lists*)

;;; character entities that must be broken up into separate tokens

(defparameter *char-ent-token-strings*
  '(
    "lsquo"
    "rsquo"
    "ldquo"
    "rdquo"
    ))

(defvar *char-ent-token-lists*)

;;; these tags have attributes that will be preserved

(defparameter *tags-that-must-be-kept-whole-strings*
  '(
    "allmdl"
    "blankline"
    "code"
    "figref"
    "ftnref"
    "ieref"
    "int-item"
    "itemref"
    "jobtype"
    "levelref"
    "ranked-testref"
    "stepref"
    "tblref"
    "teststepref"
    "title"
    "titleref"
    "wire"
    "?cte"
    ))

(defvar *tags-that-must-be-kept-whole-list*)


;;;------------------------------------------------------------------;
;;; convert tags names to needful format  (urgh)

(defun string-to-char-list (string)
  (let (result)
    (dotimes (i (length string) (nreverse result))
      (setq result (cons (char-upcase (char string i))
			 result)))))

(setq *sgml-tags-lists*
      (mapcar #'string-to-char-list *sgml-tags-strings*))


(setq *sgml-actions-lists*
      (mapcar #'(lambda (x) (cons (string-to-char-list (car x))
				  (cdr x)))
	    *sgml-actions-strings*))


(setq *sgml-tags-translations-lists*
      (mapcar #'(lambda (x) (cons (string-to-char-list (car x))
				  (string-to-char-list (cdr x))))
	    *sgml-tags-translations*))

(setq *char-ent-token-lists*
      (mapcar #'string-to-char-list *char-ent-token-strings*))

(setq *tags-that-must-be-kept-whole-list*
      (mapcar #'string-to-char-list *tags-that-must-be-kept-whole-strings*))

;;;-----------------------------------------------------------------------;
;;; Macros:                                                               ;
;;;-----------------------------------------------------------------------;


;;;-----------------------------------------------------------------------;
;;; Functions:                                                            ;
;;;-----------------------------------------------------------------------;

(defun filter-sgml-tags (charlist)
  ;;  4-Sep-94 by EHN -- reset this for each charlist we process.
  ;; 14-Sep-94 by EHN -- called multiple times for each call to TEST-SENTENCE,
  ;; so reset elsewhere.
  ;; (setq *sgml-tags-discarded* nil)
  (do ((char (car charlist) (car charlist))
       tag-closing-pointer tagname-end-pointer tagname actionpair
       result short-tagname brace-level i delete-current-tag
       ;;  4-Sep-94 by EHN -- to hold stringified tag when saving/warning
       ;; about discarded tags.
       saved-tagname)
      ((null charlist)
       (nreverse result))

    (cond ((char= char #\{)
	   ;; find closing paren
	   (setq brace-level 0)
	   (setq i 0)
	   (setq tag-closing-pointer
		 (dolist (my-char charlist i)
		   (cond
		     ((char= my-char #\{)
		      (incf brace-level))
		     ((char= my-char #\})
		      (setq brace-level (1- brace-level))
		      (if (= brace-level 0)
			  (return i))))
		   (incf i)))
			  
	   ;; (setq tag-closing-pointer (position #\} charlist))
	   ;; find tagname

	   (setq tagname-end-pointer
		 (or (position #\SPACE charlist :end tag-closing-pointer)
		     tag-closing-pointer))

	   (setq tagname (sgml-tag-charlist-to-name charlist tag-closing-pointer))
	   (setq short-tagname (sgml-tag-charlist-to-name charlist tagname-end-pointer))
	   ;; chop off tag from charlist
	   (setq charlist (nthcdr (1+ tag-closing-pointer) charlist))
	   ;; call tag action function, if there is one
	   (if (setq actionpair (member short-tagname *sgml-actions-lists* :key #'car :test #'equalp))
	       (setq delete-current-tag (funcall (cdr (car actionpair)) tagname)))

	   ;; add tagname to result, if tag is kept
	     
	   (cond ((and (not delete-current-tag)
		       (member short-tagname *sgml-tags-lists* :test #'equalp))
		  (if (or (null result) ; first char
			  (not (char= (car result) #\space)))
		      (push #\space result)) ; then
		  (push #\{ result)
		  (dolist (c tagname)
		    (push (char-upcase c) result))
		  (push #\} result)
		  (push #\space result))

		 (t	;; we don't keep this tag
		  (setq delete-current-tag nil)
		  ;;  4-Sep-94 by EHN -- save the tag that was discarded.
		  (setq saved-tagname (coerce short-tagname 'string))
		  (unless (member saved-tagname *sgml-tags-discarded*
				  :test #'equal)
			  (push saved-tagname *sgml-tags-discarded*)
			  (when *sgml-tags-discarded-warn*
				(warn "FILTER-SGML-TAGS: Discarding <~a>"
				      saved-tagname)))
		  (push #\space result))))

	  ((char= char #\&)		; character entity
	   ;; find closing delimiter
	   (setq tagname-end-pointer (or (position #\@ charlist)
					 2))
	   (setq tagname (subseq charlist 1 tagname-end-pointer))
	     
	   ;; chop off char ent from charlist
	   (setq charlist (nthcdr (1+ tagname-end-pointer) charlist))

	   ;; break up char entity into separate token, if necessary
	   (cond ((member tagname *char-ent-token-lists* :test #'equalp)
		  (if (or (null result) ; first char
			  (not (char= (car result) #\space)))
		      (push #\space result)) ; then
		  (push #\& result)
		  (dolist (c tagname)
		    (push (char-upcase c) result))
		  (push #\@ result)
		  (push #\space result))

		 (t			; the char entity is not broken up into separate token
		  (push #\& result)
		  (dolist (c tagname)
		    (push (char-upcase c) result))
		  (push #\@ result))))
	    
	  ((char= char #\space)
	   (if (or (not result)		;  first char
		   (not (char= (car result) #\space)))
	       (push #\space result))	; then
	   (setq charlist (cdr charlist)))
	  (t				; any other character gets copied
	   (push char result)
	   (setq charlist (cdr charlist))))))

;;; find the real tagname:
;;;
;;; 1. if we match something on the translations table, do that.
;;; 2. else, take first contiguous name in tag (i.e. drop all
;;; tag-internal structure)

(defun sgml-tag-charlist-to-name (charlist tag-closing-pointer)
  (let ((whole-tag (subseq charlist 1 tag-closing-pointer))
	tagname-end-pointer result)
    
    (dolist (tags-dotted-pair *sgml-tags-translations-lists*)
      (if (charlist-matches-charlist (car tags-dotted-pair) whole-tag)
	  (return-from sgml-tag-charlist-to-name (cdr tags-dotted-pair))))
    ;; we tried all matches, and they all failed
    
    (setq tagname-end-pointer
	  (or (position #\SPACE charlist :end tag-closing-pointer)
	      tag-closing-pointer))

    (setq result (subseq charlist 1 tagname-end-pointer))
    (if (member result *tags-that-must-be-kept-whole-list* :test #'equalp)
	(setq result
	      (substitute #\_  #\SPACE
			  (subseq charlist 1 tag-closing-pointer))))
    ;; remove duplicate underscores (corresponding to duplicate spaces)
    (values result)))


(defun charlist-matches-charlist (short-list long-list)
  (dotimes (i (length short-list))
    (if (or (null (nth i long-list))
	    (not (char= (nth i short-list)
			(nth i long-list ))))
	(return-from charlist-matches-charlist NIL)))
  
  ;; all characters on the short list match the long list
  (values t))
  
(defun begin-callpara-function ()
  (if *analyzer-show-warnings*
      (format *standard-output* "~& ;;; Entering Callout Paragraph...~%")))


(defun end-callpara-function ()
  (if *analyzer-show-warnings*
      (format *standard-output* "~& ;;; Leaving Callout Paragraph.~%")))
      

(defun begin-cause-error-function ()
  (warn "You have entered the twilight zone..."))

(defun end-cause-error-function ()
  (warn "The parser has caused an intentional error for testing purposes")
  (error "The parser has caused an intentional error for testing purposes"))

(defun action-cte-tag (tag)
  (let (type tag-fs eval)
    ;;(format *standard-output* "~&[eval-cte-eval-tag]: ~A~%" tag)
    ;; parse cte tag
    (setq tag (coerce tag 'string))
    (setq tag (concatenate 'string "{" tag "}"))    
    (setq tag-fs (parse-cte-tag tag))
    (setq type (second (assoc 'cat tag-fs)))
    (cond ((eq type 'cte-eval)
	   ;; action!
	   ;;(pns tag-fs)
	   (setq eval (second (assoc 'eval tag-fs)))
	   (setq eval (replace-string "\"" "*dq*" eval))
	   (setq eval (replace-string "(" "[" eval))
	   (setq eval (replace-string ")" "]" eval))
	   (setq eval (replace-string "#" "*hash*" eval))
	   (setq eval (replace-string "'" "*apostrophe*" eval))
	   (setq eval (replace-string ":" "*colon* " eval))	   
	   ;;(print eval)
	   (eval (read-from-string eval))
	   ;; return t to indicate tag should be deleted
	   (values t))
	  (t
	   ;; return NIL to indicate tag should not be deleted
	   (values NIL)))))
    

	
