;;;; VERSION-SPACE.LISP  January 1988, An implementation of the Version Space Algorithm
;;;; for incremental learning from examples

;;;; Copyright (c) 1988 by Raymond Joseph Mooney. This program may be freely copied, used, or
;;;; modified provided that this copyright notice is included in each copy of this code
;;;; and parts thereof.

;;;; This is a basic version space learning algorithm.  In order to use it for any
;;;; representation of instances and generalizations, the user must define the
;;;; following functions:
;;;;
;;;; equal-generalizations(x ,y):          Returns T iff x and y are equal generalizations.
;;;; match(generalization, instance):      Returns T iff generalization matches instance.
;;;; more-general?(x, y):                  Returns T iff generalization x is strictly more general
;;;;                                         than generalization y.
;;;; initialize-g:                         Returns initial set of most general generalizations.
;;;; specialize-against(generalization,    Returns a list of minimal specializations of the
;;;;                          instance)      given generalization which do not match the instance.
;;;; generalize-to(generalization,         Returns a list of minimal generalizations of the
;;;;                     instance)           given generalization which do match the instance.
;;;;
;;;;  A sample set of these functions for nominal feature vectors represented as ordered lists
;;;;  is included.  A sample data set using this representation is in the file: FIGURE-DATA.

(defvar *s* nil               "The most specific set of generalizations (S)")
(defvar *g* nil               "The most general set of generalizations (G)")
(defparameter *trace-vs* nil  "Produces trace if set to T")

(defun train-version-space (examples)
  (version-space (positive-first examples)))

(defun positive-first (examples)
  ;;; Put positive examples all first
  (nconc (remove-if-not #'(lambda (example) (eq (first example) '+)) examples)
	 (remove-if-not #'(lambda (example) (eq (first example) '-)) examples)))

(defun version-space (examples)
  ;;; This function takes a list of examples where an example is a list whose first element
  ;;; is either + or - to indicate a positive or negative example, and whose second element
  ;;; is a description of that example (an instance).  It uses the version-space algorithm to 
  ;;; determine the sets of most specific and most general generalizations consistent with all of 
  ;;; the examples.  Returns list (S G)

  (setf *g* (initialize-g))
  (let ((first-pos (assoc '+ examples)) (converged nil))
    (setf examples (remove first-pos examples))
    (setf *s*  (list (second first-pos)))              ;initialize S to first positive instance
    (trace-print *trace-vs* "~%~%Example: ~A" first-pos)
    (trace-print *trace-vs* "~%S= ~A ~%G= ~A" *s* *g*)
    ;; process each additional example incrementally
    (dolist (example examples) 
      (trace-print *trace-vs* "~%~%Example: ~A" example)
      (cond ((eq (first example) '-) 
	     ;; if example is a negative example,
	     ;; then remove any members of S which match its instance and update G
	     (setf *s* (remove (second example) *s* :test #'reverse-match))
	     (update-g (second example)))
	    ((eq (first example) '+)
	     ;; if example is a positive example
	     ;; then remove any members of G which do not match its instance and update S
	     (setf *g* (remove (second example) *g* :test-not #'reverse-match))
	     (update-s (second example))))
      (trace-print *trace-vs* "~%S= ~A ~%G= ~A" *s* *g*)
      (cond ((or (null *g*)(null *s*))
	     ;; if either S or G is empty then concept cannot be described
	     (format t "~%Langauage is insufficient to describe the concept")
	     (return nil))
	    ((and (eq (length *s*) 1) (eq (length *g*) 1)
		  (equal-generalizations (first *s*)(first *g*)) (null converged))
	     ;; if the version space has just converged then say so
	     ;; but continue checking remaining examples for consistency
	     (format t "~%~%Convergence. Concept must be: ~A" (format-generalization (first *s*)))
	     (setf converged t))))
    (unless converged
      (if (and *s* *g*) (format t "~%~%Did not converge~%S= ~A~%G= ~A"
				(mapcar #'format-generalization *s*)
				(mapcar #'format-generalization *g*)))))
  (list *s* *g*))

(defun update-g (instance)
  ;;; Specializes generalizations in G so that none match the given instance for a negative example.

  ;; For each generalization in G which matches the instance compute minimal specializations which do
  ;; not match and remove those which are not more general than (or equal to) some element of S.
  (setf *g* (mapcan #'(lambda (generalization) 
			(if (match generalization instance)
			    (remove-if-not #'(lambda (specialization) 
					       (member specialization *s* :test 
						  #'(lambda (a b) (or (more-general? a b)
								      (equal-generalizations a b)))))
					   (specializations-against generalization instance))
			    (list generalization)))
		    *g*))
  ;; Remove from G those elements which are more specific than some other element in G
  (setf *g* (clean-g *g*)))

(defun clean-g (g)
  ;;; Remove from the list of generalizations g, any gens which are more specific or equal to some
  ;;; other gen in g.
  (dolist (generalization1 g)  
    (dolist (generalization2 (rest (member generalization1 g)))
      (cond ((more-specific? generalization2 generalization1)
	     (setf g (remove generalization2 g)))
	    ((or (more-specific? generalization1 generalization2)
		 (equal-generalizations generalization1 generalization2))
	     (setf g (remove generalization1 g))))))
  g)


(defun update-s (instance)
  ;;; Generalizes generalizations in S so that all match the given instance for a positive example

  ;; For each generalization in S which doesn't match the instance compute minimal generalizations
  ;; which do match and remove those which are not more specific  than (or equal to) some element of G.
  (setf *s* (mapcan #'(lambda (generalization) 
			(if (not (match generalization instance))
			    (remove-if-not #'(lambda (generalized) 
					       (member generalized *g* :test 
						  #'(lambda (a b) (or (more-specific? a b)
								      (equal-generalizations a b)))))
					   (generalizations-to generalization instance))
			    (list generalization)))
		    *s*))
  ;; Remove from S those elements which are more general than some other element of S
  (setf *s* (clean-s *s*)))

(defun clean-s (s)
  ;;; Remove from the list of generalizations s, any gens which are more general or equal to some
  ;;; other gen in g.
  (dolist (generalization1 s)  
    (dolist (generalization2 (rest (member generalization1 s)))
      (cond ((more-general? generalization2 generalization1)
	     (setf s (remove generalization2 s)))
	    ((or (more-general? generalization1 generalization2)
		 (equal-generalizations generalization1 generalization2))
	     (setf s (remove generalization1 s))))))
  s)

(defun reverse-match (instance generalization)
  ;;; Matches instance to generalization instead of vice-versa

  (match generalization instance))


(defun more-specific? (a b)
  ;;; Returns T iff generalization a is strictly more specific than generalization b

  (more-general? b a))



;;;;=================================================================================================
;;;; Functions specific to nominal feature vectors represented as ordered lists of feature values.
;;;;=================================================================================================

(defparameter *print-with-feature-names* nil "Print out generalizations with feature names")

(defun equal-generalizations (x y)
  ;;; Equivalence function for simple feature vector representation

  (equal x y))


(defun match (generalization instance)
  ;;; Match function for a simple feature vector representation where "?" is a wildcard
  
  (or (equal generalization instance)
      (and (or (equal (first generalization)(first instance))
	       (eq (first generalization) '?))
	   (match (rest generalization) (rest instance)))))


(defun more-general?  (x y)
  ;;; Returns T iff generalization x is strictly more general than generalization y for
  ;;; a simple feature vector representation. For x to be more general than y,  they must match 
  ;;; and x must have a "?" where y has a specific value; however y must never have a "?" where
  ;;; x has a specific value

  (cond ((or (null x)(null y)) nil)
	((and (eq (first x) '?) (not (eq (first y) '?)) (or (equal (rest x)(rest y)) 
							    (more-general? (rest x) (rest y)))) 
	 t)
	((equal (first x) (first y)) 
	 (more-general? (rest x) (rest y)))))


(defun initialize-g ()
  ;;; Initialize G to a set containing the all "?" feature vector

  (list (mapcar #'(lambda (feature) (declare (ignore feature)) '?) *domains*)))


(defun generalizations-to (generalization instance)
  ;;; Generalizes the given generalization in all ways just enough to match the instance.
  ;;; For simple feature vectors there is only one possible least generalization in which 
  ;;; differing feature values are changed to "?"s

  (list (if (or (null generalization)(null instance))
	    nil
	    (cons (if (equal (first generalization)(first instance))
		      (first generalization)
		      '?)
		  (first (generalizations-to (rest generalization)(rest instance)))))))


(defun specializations-against (generalization instance)
  ;;; Specializes the given generalization in all ways just enough so it doesn't match the instance.
  ;;; For simple feature vectors, for each "?" in the generalization there is a least specialization
  ;;; for each possible value in the domain for that feature which is different from the value
  ;;; in the instance.

  (do ((gen-rest generalization (rest gen-rest))
       (inst-rest instance (rest inst-rest))
       (gen-bef  nil (append gen-bef (list (first gen-rest))))
       (domain-rest *domains* (rest domain-rest))
       (specializations nil))
      ((or (null gen-rest)(null inst-rest)) specializations)
    (if (eq (first gen-rest) '?)
	(setf specializations (append (mapcar #'(lambda (value) (append gen-bef (list value)
									(rest gen-rest)))
					(remove (first inst-rest) (first domain-rest)))
				      specializations)))))


(defun format-generalization (gen)
  ;;; Format a generalization into a prettier form for output
  ;;; ( (feature-name value) ...) for features with constrained values
  (if *print-with-feature-names*
      (do ((gen-rest gen (rest gen-rest))
	   (feature-rest *feature-names* (rest feature-rest))
	   (formated-gen nil))
	  ((null gen-rest) (nreverse formated-gen))
	(unless (eq (first gen-rest) '?)
	  (push (list (first feature-rest) (first gen-rest)) formated-gen)))
      gen))



;;;; ==========================================================================================
;;;; Test and consistency checking functions
;;;; ==========================================================================================


(defun test-version-space (example s-g)
  ;;; Test an exmaple against results of version-space. Considers an example positive if
  ;;; if it matches the majority of the generalizations in S and G.  As a second value
  ;;; returns the fraction of S and G matching as a 'match score'

  (let ((count 0)
	(s (first s-g))
	(g (second s-g))
	(match-fraction 0))
    (dolist (generalization (append s g))
      (if (match generalization (second example))
	  (incf count)))
    (if (or s g) (setf match-fraction (/ count (+ (length s) (length g)))))
    (values (if (> match-fraction 0.5) '+ '-)
	    match-fraction)))


(defun check-consistency (examples)
  ;;; Checks the consistency of each generalization in S and G with the given list of examples.
  ;;; Prints an error message for each error found.  Can be used to check correctness after
  ;;; running version-space on a list of examples.

  (dolist (generalization (append *s* *g*))
    (dolist (example examples)
      (if (and (eq (first example) '-)(match generalization (second example)))
	  (format t "~%~%Error: ~A matches ~A" generalization example))
      (if (and (eq (first example) '+)(not (match generalization (second example))))
	  (format t "~%~%Error: ~A doesn't match ~A" generalization example)))))


;;;; ==========================================================================================
;;;; The following functions are for multiple category problems 
;;;; ==========================================================================================


(defun train-multi-version-space (examples)
  "Version space for multi-category data.  Return a list of (category S G)'s for
   for each category, where (S G) is the  version space result for the given
   category examples as + and all others as -"

  (dolist (cat *categories*)
    (setf (get cat 'training-examples) nil))
  (dolist (example examples)
    (push (rest example) (get (first example) 'training-examples)))
  (mapcar #'(lambda (category)
	      (let ((training-examples
		      (nconc (label-examples category '+)
			     (mapcan #'(lambda (other-category)
					 (label-examples other-category '-))
				     (remove category *categories*))))
		    (*categories* '(+ -)))
		(format t "~%~%Category: ~A" category)
		(cons category (train-version-space training-examples))))
	  *categories*))


(defun label-examples (category label)
  "Relabel the training-examples of a given category with the given class label"
  (mapcar #'(lambda (inst) (cons label inst)) (get category 'training-examples)))


(defun test-multi-version-space (example result-alist)
  "Test example for matching S-G result of each category and return category
   with the mighest fraction of matching S-G elements"

  (let (best-category (max-score 0))
    (dolist (result result-alist)
      (multiple-value-bind (class score)
	  (test-version-space example (rest result))
	(when (> score max-score)
	  (setf best-category (first result))
	  (setf max-score score))))
    best-category))
