;;;; UNIMEM.LISP  -  Lebowitz's incremental conceptual clustering program.
;;;;                 (See Lebowitz's ML2 article and MLJ 2:2 article for further details.)

;;;; This system incrementally processes unclassified examples,
;;;; dynamically modifying a hierarchy of concepts as it does so.
;;;; First, the system determines all of the places in the current
;;;; concept hierarchy that a new instance belongs.  (There may be
;;;; more than one.)  This is done "pragmatically" - matches do not
;;;; have to be exact.  Once the locations for a new instance are determined,
;;;; UNIMEM determines if the new instance is similar enough to any instances
;;;; previously stored.  If so, a new node in the concept hierarchy is formed,
;;;; using commonalities between the matching instances to define the new concept.
;;;; Statistics are kept on the features that describe a concept.  Unreliable features
;;;; are discarded, and if enough features of a concept are discarded, the concept
;;;; itself is also discarded (along with all of the instances stored there).
;;;; (Discarding of features and concepts is not implemented and is left as a homework exercise.)

;;;; Copyright (C) 1988 by Jude William Shavlik and Raymond Joseph Mooney
;;;;       This program may be freely copied, used, or 
;;;;       modified provided that this copyright notice 
;;;;       is included in each copy of this code and parts thereof.

;;;; The following properties of concepts are maintained:
;;;    features            - a conjuction description of the concept (feature/value pairs)
;;;    sub-concepts        - concepts directly below this one in the concept hierarchy
;;;    parent              - the concept directly above
;;;    instances           - instances located at this concept
;;;    feature-confidences - confidence values for the features of this concept
;;;    features-discarded  - features of this concept deemed irrelevant
;;;    frozen-features     - features deemed definitely significant

;;;; Also, features have the property "type" (one of nominal, linear, or structured).
;;;; If a feature does not have a type, it is assumed to be nominal.
;;;; Currently, the system only handles nominal and linear features.

;;;; Sample data for the system can be found in the file DISCOVERY-DATA.
;;;; This file also describes the expected representation of instances.
;;;; A strength of UNIMEM is that is handles inconsistent and incorrect data.
;;;; All of the feature values of an instance do not have to be specified.

;;;; --------------------------------------------------------------------------------------
;;;;                       Global Variables
;;;; --------------------------------------------------------------------------------------

(defvar *root*                        nil "The root of the concept hierarchy.")
(defvar *trace-UNIMEM?*                 t "Report on the progress of the program?")
(defvar *concept-match-threshold*    0.8 "Min partial match score for category membership.")
(defvar *acceptable-match-threshold* 0.75 "Min partial match score between two features.")
(defvar *new-concept-threshold*      0.50 "Min match between two ex's if new concept formed.")
(defvar *missing-feature-score*      0.10 "Partial match score for absent feature.")
(defvar *features-needed-to-gen*        1 "Min common features if new concept is to be created.")
(defvar *delete-confidence-level*      -1 "Confidence level at which a feature is deleted.")
(defvar *freeze-confidence-level*       5 "Confidence level at which a feature is permanent.")
(defvar *keep-concept-threshold*     0.33 "Frac of initial features remaining to keep concept.")

;;;; --------------------------------------------------------------------------------------
;;;;                      The Main Functions
;;;; --------------------------------------------------------------------------------------

(defun unimem (example)
  ;;; Insert this example into memory, altering the structure of memory as necessary.
   (format t "~%Considering ~A, which has features~%  " example)
   (print-list-contents (eval example) t t)
   (dolist (answer (unimem-search *root* (eval example)))
     (update (first answer) example (second answer))))

(defun run-unimem (examples &optional (randomize t))
  ;;; Run a collection of examples through the UNIMEM program.
  ;;; Randomize example order if flag is T
   (format t "~%~%Initiating UNIMEM ... ~%~%")
   (gensym 0)
   (setf *root* (gensym "Concept-"))
   (initialize-unimem-concept *root*)
   (dolist (example (if randomize (permute examples) examples))
     (unimem example))
   (format t "~%Finished processing the provided examples.~%~%The final memory structure:~%~%")
   (report-memory *root*)
   (format t "~%~%")
   *root*)

(defun unimem-search (concept unexplained-features)
  ;;; Determine where in the current tree this concept belongs.
  ;;; For each most specific concepts found, return a list of (<concept> <unexplained-features>)
  (evaluate-concept-features concept unexplained-features)  ; Update feature scores.
  (unless (remove-concept? concept)                         ; Check for deleted concept
    (if (concept-member? concept unexplained-features)        ; Close enough match?
      (let ((remaining-unexplained-features 
	     (collect-unexplained-features concept unexplained-features)))
        (or (mapcan #'(lambda (sub-concept) (unimem-search sub-concept remaining-unexplained-features))
                    (get concept 'sub-concepts))      ; See if concept also fits lower in tree.
            (list (list concept remaining-unexplained-features))))))) ; If not, here's the place.

(defun update (concept new-instance new-features &aux inserted?)
  ;;; Insert this instance into  memory.  Currently it fits here.  However,
  ;;; see if any new concepts should be created.
   (when *trace-UNIMEM?*
      (format t " ~A matches the concept defined by the features~%  " new-instance)
      (print-list-contents (get-all-features concept) t t)
      (format t " Other instances stored here:")
      (print-list-contents (get concept 'instances) t))
   (dolist (old-instance (get concept 'instances)) ; Look at all of the instances already stored here.
     (if (close-enough-to-build-new-concept? new-instance old-instance)
	 (setf inserted? (or inserted?		; Record if the new instance has been inserted.
			     (build-new-concept concept	; See if a new concept possible.
						old-instance new-instance new-features)))))
   (unless inserted?
     (push new-instance (get concept 'instances))
     (if *trace-UNIMEM?* 
        (format t " Inserting ~A into this concept.~%" new-instance))))

(defun build-new-concept (parent old-instance new-instance features-to-consider)
  ;;; Build a new concept under this parent concept. 
  ;;; Put these two instance in it.  The features of the new node are those
  ;;; features in the feature list that have "close-enough" values in the two instances.
  ;;; Make sure there are enough features for the the new concept.
  ;;; (The features to consider are a subset of new-instance's features.)
   (let* ((old-instance-features (eval old-instance))
          (common-features (remove-if-not 
                             #'(lambda (feature) 
				 (>= (feature-match-score feature old-instance-features)
				     *acceptable-match-threshold*))
                             features-to-consider)))
     (if (>= (length common-features) *features-needed-to-gen*) ; Enough common  features?
       (let ((new-concept (gensym "Concept-")))
         (initialize-unimem-concept new-concept)
         (setf (get parent 'instances) (remove old-instance (get parent 'instances)))
         (push new-concept (get parent 'sub-concepts))
         (setf (get new-concept 'parent) parent)
         (setf (get new-concept 'instances) (list old-instance new-instance))
         (setf (get new-concept 'features) 
	       (combine-feature-values common-features old-instance-features)) 
         (setf (get new-concept 'feature-confidences)
               (mapcar #'(lambda (feature-value-pair)  ; Count the two "creating" instances. 
			   (list (first feature-value-pair) 2)) common-features))
         (when *trace-UNIMEM?*
           (format t " New concept formed from ~A and ~A.~%" new-instance old-instance)
           (format t " Its features are~%   ")
           (print-list-contents (get new-concept 'features) t t))
         new-concept))))  ; Indicate a new concept was formed.

(defun evaluate-concept-features (concept unexplained-features)
  ;;; Keep statistics on the value of the features of this concept.
  ;;; Discard low-scoring features and "freeze" high scoring ones.
  ;;; If *trace-UNIMEM?* is set, report deletions and freezings.
   nil) ; TO BE WRITTEN, USING THE RELEVANT GLOBAL VARIABLES DEFINED ABOVE.

(defun remove-concept? (concept)
  ;;; If a concept loses too many of its features, discard the concept.
  ;;; Report deletion if *trace-UNIMEM?* is set.
  ;;; TO BE WRITTEN USING THE RELEVANT GLOBAL VARIABLES DEFINED ABOVE.
  nil)

(defun collect-unexplained-features (concept instance-features 
					     &aux (concept-features  (get concept 'features)))
  ;;; Collect all of the instance features that do NOT match a feature of this concept.
   (remove-if #'(lambda (instance-feature) 
		  (>= (feature-match-score instance-feature concept-features)
		      *acceptable-match-threshold*))
              instance-features))

(defun close-enough-to-build-new-concept? (instance1 instance2 
                                           &aux (features1 (eval instance1)) 
					        (features2 (eval instance2)))
  ;;; Determine if these instances are close enough to merit creating a new concept.
   (>= (min (concept-match-score features1 features2)  ; The match is unsymmetrical when the two 
            (concept-match-score features2 features1)) ; instances have different features 
      *new-concept-threshold*))                        ; (some may be missing).

(defun concept-match-score (concept-features instance-features 
					     &aux (number-of-features (length concept-features)))
  ;;; Calculate a partial match of this instance to this concept.
  ;;; This is done by summing the scores for matching each feature of the concept
  ;;; and then dividing by the maximum possible score (to normalize).
   (if (> number-of-features 0)
      (/ (reduce #'+ (mapcar #'(lambda (concept-feature) 
				 (feature-match-score concept-feature instance-features))
                         concept-features))
         number-of-features)
      1))

(defun feature-match-score (feature possible-matches &aux
			    (match-feature (assoc (first feature) possible-matches)))
  ;;; Return the scoring match of this feature when compared to the possible matches.
   (if match-feature
       (score-two-features feature match-feature)
      *missing-feature-score*))

(defun score-two-features (feature1 feature2)
  ;;; Score the match between these two feature/value pairs.
  ;;; Return a result in [0..1].
  (if (eql (second feature1) (second feature2))
      1						; Perfect match.
      (case (get (first feature1) 'type)
	    (nominal 0)				; No partial score here.
	    (linear     (partially-score-linear-feature    
			  feature1 feature2))	; Allow partial scoring.
	    (structured (partially-score-structured-feature 
			  feature1 feature2))	; Allow partial scoring.
	    (otherwise 0))))			; Nominal is the default.


(defun partially-score-linear-feature (feature1 feature2)
  ;;; Return a number between 0 and 1 indicating the degree to which
  ;;; two linear features match. Calculated as the ratio of the distance
  ;;; between to values over the maximum distance possible based on the
  ;;; min and max values stored on the linear-domain prop of the feature

  (let ((domain (get (first feature1) 'linear-domain)))
    (- 1 (/ (abs (- (second feature1)(second feature2)))
       (- (second domain) (first domain))))))

(defun partially-score-structured-feature (feature1 feature2)
  ;;; Return a number between 0 and 1 indicating the degree to which
  ;;; two structured feature values match
    (error "TO BE WRITTEN."))

(defun combine-feature-values (features1 features2)
  ;;; For the feature/value pairs in features1, merge
  ;;; the values with the corresponding values in features2.
   (mapcar #'(lambda (feature1 &aux (feature2 (assoc (first feature1) features2)))
                (if (equal feature1 feature2)  ; See if an exact match.
                  feature1
                  (case (get (first feature1) 'type)
                    (nominal    (error "Shouldnt occur.")) ; Shouldnt have matched partially.
                    (linear     (list (first feature1)
				      (/ (+ (second feature1)(second feature2)) 2)))
		                 ; Combine two linear values by taking average
                    (structured (error "TO BE WRITTEN."))  ; Combine two structured values.
                    (otherwise  (error "Shouldnt occur."))))) ; Nominal is the default.
           features1))

(defun concept-member? (concept instance-features)
  ;;; Determine if these instance features indicate that 
  ;;; the instance is a member of this concept.
   (>= (concept-match-score (get concept 'features) instance-features) 
       *concept-match-threshold*))

;;;; --------------------------------------------------------------------------------------
;;;;                               UTILITY FUNCTIONS
;;;; --------------------------------------------------------------------------------------

(defun initialize-unimem-concept (concept)
  ;;; Initialize this concept.
   (setf (get concept 'features)            nil) ; A list of feature/value pairs.
   (setf (get concept 'features-discarded)  nil) ; A list of feature names.
   (setf (get concept 'frozen-features)     nil) ; A list of feature names.
   (setf (get concept 'feature-confidences) nil) ; A list ( ... (feature confidence) ... )
   (setf (get concept 'instances)           nil) ; A list of examples names.
   (setf (get concept 'parent)              nil) ; The name of the parent concept.
   (setf (get concept 'sub-concepts)        nil)) ; A list of sub-concepts.

(defun hierarchy-instances (&optional (concept *root*))
  ;;; Construct a nested list illustrating the instances stored in the
  ;;; entire memory under concept.

   (append (get concept 'instances)
	 (mapcar #'(lambda (sub-concept) (hierarchy-instances sub-concept))
		 (get concept 'sub-concepts))))

(defun report-memory (concept &optional (indent 0) &aux (instances (get concept 'instances)))
  ;;; Report the contents of memory.
   (format t "~vT~A " indent concept)
   (if instances
      (progn (format t "has instances") (print-list-contents instances t))
      (format t "has no associated instances.~%"))
   (report-concept-feature-status concept indent)
   (mapc #'(lambda (sub-concept) (report-memory sub-concept (+ 3 indent))) 
         (reverse (get concept 'sub-concepts))))

(defun report-concept-feature-status (concept indent)
  ;;; Report the status of the features of this concept.
  ;;; Describe confidence values and report discarded features.
  ;;; (Frozen features are considered to have infinite confidence
  ;;; values, represented by ^'s.)
   (let ((active      (get concept 'features))
         (discarded   (get concept 'features-discarded))
         (frozen      (get concept 'frozen-features))
         (confidences (get concept 'feature-confidences)))
      (format t "~vT " indent) 
      (format t "Its features [and confidences] are~%")
      (format t "~vT" indent)
      (if active 
         (mapc #'(lambda (pair) 
                   (let ((name (first pair)) (value (second pair)))
                     (format t " ~A=~A[~A]" name value
                              (if (member name frozen) '^ (second (assoc name confidences))))))
               active)
         (format t " <none>"))
      (when discarded 
        (format t "~%~vT Discarded features:" indent)
        (print-list-contents discarded))
      (format t "~%~%")))
       
(defun print-list-contents (list &optional linefeed? feature-value-pairs?)
  ;;; Print this list without the encapsulating parantheses.
   (if list
     (if feature-value-pairs?  ; Use the notation feature=value
        (dolist (pair list) (format t " ~A=~A" (first pair) (second pair)))
        (dolist (item list) (format t " ~A" item)))
     (format t " <none>"))
   (if linefeed? (format t "~%")))
   
(defun get-all-features (concept)
  ;;; Collect all of the features of this concept, removing overridden ones.
   (remove-if-overridden (collect-every-feature concept)))

(defun collect-every-feature (concept)
  ;;; Collect all the features of this concept, including inherited ones.
  ;;; Later features in the list should override earlier ones.
   (append (if (get concept 'parent) (collect-every-feature (get concept 'parent))) 
           (get concept 'features)))

(defun remove-if-overridden (features)
  ;;; Remove all of those features in this list that have another value later down in the list.
   (cond ((null features) nil)
         ((assoc (first (first features)) (rest features)) (remove-if-overridden (rest features)))
         (t (cons (first features) (remove-if-overridden (rest features))))))

(defun permute (list)
  "Randomize the order of the elements in this list."
  (mapcar #'(lambda (pair) (rest pair))
	  (sort (mapcar #'(lambda (item) (cons (random 1000) item)) list)
		#'(lambda (a b) (> (first a) (first b))))))

