;;;; LISP interface to Quinlan's C4.5 C code for use with the UNIVERSAL-TESTER.
;;;; Provides systems for C4.5, C4.5-UNPRUNED, and C4.5-RULES.  It is best to
;;;; put all three in the given order in the system list, in which case
;;;; the trees pruduced by C4.5 are re-used by the other two.

;;;; Copyright (c) 1993 by Raymond Joseph Mooney. This program may be freely 
;;;; copied, used, or modified provided that this copyright notice is included
;;;; in each copy of this code and parts thereof.

;;;; Uses LUCID function "shell" to access C4.5 executables which must be in your 
;;;; UNIX path. Quinlan's code has been modified to allow separate training and
;;;; testing by making executables for C4.5test and C4.5rtest for testing trees and 
;;;; rules respectively on a set of data. LISP communicates to C through a number of 
;;;; files including those needed and generated by C4.5 (stem.names, stem.data,
;;;; stem.test, stem.tree, stem.unpruned, stem.rules) plus stem.out and stem.time
;;;; to store output and time information.  The UNIVERSAL-TESTER output file name
;;;; is used as the stem.

(in-package 'user)
(provide 'c4.5)

(setf (get 'c4.5 'external) t)   ; declare C4.5 to be defined outside LISP
(setf (get 'c4.5 'parameters) '(*c4.5-prune-tree* *c4.5-min-examples* *c4.5-prune-confidence*
				*c4.5-group-values*))
(setf (get 'c4.5 'expect-training-error) t)   

(defparameter *c4.5-prune-tree* t        "Use pruned decision tree")
(defparameter *c4.5-prune-confidence* 25 "Percent confidence factor for pruning")
(defparameter *c4.5-min-examples* 2      "Minimum number of examples at some branch for a split")
(defparameter *c4.5-group-values* nil    "Tries to create groups of values on same branch")

(defun prepare-c4.5 (&optional (file-stem *output-file*))
  (with-open-file (output (merge-pathnames ".names" file-stem) :direction :output)
      (format output "~A" (first *categories*))
      (dolist (category (rest *categories*))
	(format output ", ~A" category))
      (format output ".~%~%")
      (dolist (feature *feature-names*)
	(format output "~A:~20T" feature)
	(if (linear-feature-p feature)
	    (format output "continuous.~%")
	    (progn
	      (let ((domain (feature-domain feature)))
		(format output "~A" (first domain))
		(dolist (value (rest domain))
		  (format output ", ~A" value))
		(format output ".~%")))))))


(defun prepare-trial-c4.5 (total-training-examples test-examples)
  (declare (ignore total-training-examples))
  (write-c4.5-datafile nil)              ; empty out training data file
  (write-c4.5-datafile (make-ordered-examples test-examples) t)) ; write test data file
  

(defun train-and-test-c4.5 (training-examples new-training-examples test-examples)
  (let ((num-train (length training-examples))
	(num-test (length test-examples))
	(result-data (make-list (length *data-format*))))
    (format t "~%~%Training C4.5...")
    (train-c4.5 new-training-examples)
    (set-field result-data 'train-time (extract-c4.5-time))
    (format t "Testing Training Data...")
    (multiple-value-bind (tree-size train-errors)
	(test-c4.5 t)
      (set-field result-data 'train-accuracy (percent-correct train-errors num-train))
      (set-field result-data 'concept-complexity tree-size))
    (format t "Testing Test Data...")
    (multiple-value-bind (tree-size test-errors)
	(test-c4.5)
      (declare (ignore tree-size))
      (set-field result-data 'test-accuracy (percent-correct test-errors num-test))
      (set-field result-data 'test-time (extract-c4.5-time)))
    result-data))

(defun percent-correct (errors total)
  (if (zerop total)
      100
      (* 100.0 (/ (- total errors) total))))

(defun train-c4.5 (new-examples)
  (write-c4.5-datafile (make-ordered-examples new-examples) nil t)
  (unix-run-c4.5 (concatenate 'string "c4.5 -x"
			      (unless (= *c4.5-min-examples* 2)
				(format nil " -m ~A" *c4.5-min-examples*))
			      (unless (= *c4.5-prune-confidence* 25)
				(format nil " -c ~A" *c4.5-prune-confidence*))
			      (if *c4.5-group-values* " -s"))))


(defun test-c4.5 (&optional eval-training)
  (unix-run-c4.5 (concatenate 'string "c4.5test"
			      (if eval-training " -t")
			      (unless *c4.5-prune-tree* " -u")))
  (extract-c4.5-results))


(defun write-c4.5-datafile (examples &optional as-test append? (file-stem *output-file*))
    (with-open-file (output (merge-pathnames (if as-test ".test" ".data") file-stem)
			    :direction :output
			    :if-exists (if append? :append :supersede)
			    :if-does-not-exist :create)
      (let ((feature-num (length *feature-names*)))
	(dolist (example examples)
	  (dotimes (i feature-num)
	    (format output "~A, " (elt (second example) i)))
	  (format output "~A.~%" (first example))))))


(defun unix-run-c4.5 (command &optional (file-stem (namestring *output-file*)))
  (shell (concatenate 'string "echo `time "
		      command " -f " file-stem " > " file-stem ".out` > "
                      file-stem ".time")))


(defun extract-c4.5-time (&optional (file (concatenate 'string (namestring *output-file*) ".time")))
  "Take the first number up to the letter u from the time file to indicate user time expended"
  (let ((last-line (last-file-line file)))
    (read-from-string (subseq last-line 0 (position #\u last-line)))))


(defun extract-c4.5-results (&optional (file (concatenate 'string (namestring *output-file*) ".out")))
  "Return the first two numbers on the last line of the output file for c4.5test as the tree-size
   and error"
  (let ((last-line (last-file-line file)))
    (multiple-value-bind (tree-size index)
	(read-from-string last-line)
      (values  tree-size (read-from-string (subseq last-line index))))))


(defun last-file-line (file)
  (with-open-file (in file :direction :input)
    (let (line next-line)
      (loop while (setf next-line (read-line in nil nil))
	    do (setf line next-line)
	    finally (return line)))))

;;;;--------------------------------------------------------------------------
;;; Allow running unpruned version by stealing training from pruned version

(setf (get 'c4.5-unpruned 'external) t)   ; declare C4.5 to be defined outside LISP
(setf (get 'c4.5-unpruned 'parameters) '(*c4.5-min-examples* *c4.5-prune-confidence*))
(setf (get 'c4.5-unpruned 'expect-training-error) t)   


(defun c4.5-run-before (system)
  (member system (member 'c4.5 *systems*)))


(defun prepare-c4.5-unpruned ()
  (unless (c4.5-run-before 'c4.5-unpruned)
    (prepare-c4.5)))


(defun prepare-trial-c4.5-unpruned (total-training-examples test-examples)
    (unless (c4.5-run-before 'c4.5-unpruned)
    (prepare-trial-c4.5 total-training-examples test-examples)))


(defun train-and-test-c4.5-unpruned (training-examples new-training-examples test-examples)
  (let ((*c4.5-prune-tree* nil))
    (if (c4.5-run-before 'c4.5-unpruned)
	(let ((result-data (copy-list (get 'c4.5 'result-data)))
	      (num-train (length training-examples))
	      (num-test (length test-examples)))
	  (format t "~%~%C4.5-UNPRUNED taking training from C4.5...Testing Training Data...")
	  (multiple-value-bind (tree-size train-errors)
	      (test-c4.5 t)
	    (set-field result-data 'train-accuracy (percent-correct train-errors num-train))
	    (set-field result-data 'concept-complexity tree-size))
	  (format t "Testing Test Data...")
	  (multiple-value-bind (tree-size test-errors)
	      (test-c4.5)
	    (declare (ignore tree-size))
	    (set-field result-data 'test-accuracy (percent-correct test-errors num-test))
	    (set-field result-data 'test-time (extract-c4.5-time)))
	  result-data)
	; else
	(train-and-test-c4.5 training-examples new-training-examples test-examples))))
    

;;;;--------------------------------------------------------------------------
;;; Rule version of C4.5. Allows stealing tree training from pruned version

(setf (get 'c4.5-rules 'external) t)   ; declare C4.5 to be defined outside LISP
(setf (get 'c4.5-rules 'parameters) '(*c4.5-rules-prune-confidence* *c4.5-rules-fisher-test*
				      *c4.5-rules-redundancy*))
(setf (get 'c4.5-rules 'expect-training-error) t)

(defparameter *c4.5-rules-prune-confidence* 25 "Confidence level for pruning rules")
(defparameter *c4.5-rules-fisher-test* nil "Use Fisher exact test to prune rules")
(defparameter *c4.5-rules-redundancy* 1  "Data redundancy for minimal encoding")


(defun prepare-c4.5-rules ()
  (unless (c4.5-run-before 'c4.5-rules)
    (prepare-c4.5)))


(defun prepare-trial-c4.5-rules (total-training-examples test-examples)
    (unless (c4.5-run-before 'c4.5-rules)
    (prepare-trial-c4.5 total-training-examples test-examples)))


(defun train-and-test-c4.5-rules (training-examples new-training-examples test-examples)
  (declare (ignore training-examples test-examples))
  (let ((train-time
	  (if (c4.5-run-before 'c4.5-rules)
	      (progn (format t "~%~%C4.5-RULES taking training from C4.5...")
		(get-field (get 'c4.5 'result-data) 'train-time))
	      (progn (format t "~%~%Training C4.5...")
		     (train-c4.5  new-training-examples)
		     (extract-c4.5-time))))
	(result-data (make-list (length *data-format*))))
    (format t "Extracting Rules...")
    (unix-run-c4.5 (concatenate 'string "c4.5rules -x"
			      (unless (= *c4.5-rules-prune-confidence* 25)
				(format nil " -c ~A" *c4.5-rules-prune-confidence*))
			      (if *c4.5-rules-fisher-test*
				  (format nil " -F"))
			      (unless (= *c4.5-rules-redundancy* 1)
				(format nil " -r ~A" *c4.5-rules-redundancy*))))
    (set-field result-data 'train-time (+ train-time (extract-c4.5-time)))
    (multiple-value-bind (literals accuracy)
	(extract-c4.5-rules-results)
      (set-field result-data 'train-accuracy accuracy)
      (set-field result-data 'concept-complexity literals))
    (format t "Testing Test Data...")
    (unix-run-c4.5 "c4.5rtest")
    (multiple-value-bind (literals accuracy)
	(extract-c4.5-rules-results)
      (declare (ignore literals))
      (set-field result-data 'test-accuracy accuracy)
      (set-field result-data 'test-time (extract-c4.5-time)))
    result-data
    ))


(defun extract-c4.5-rules-results (&optional (file (concatenate 'string (namestring *output-file*) ".out")))
  (multiple-value-bind (next-last-line last-line)
      (last-2-file-lines file)
    (let ((list (read-from-string (concatenate 'string "(" (delete #\, last-line) ")"))))
      (values (read-from-string next-last-line) (percent-correct (fourth list) (second list))))))

(defun last-2-file-lines (file)
  (with-open-file (in file :direction :input)
    (let (line next-line prev-line)
      (loop while (setf next-line (read-line in nil nil))
	    do (setf prev-line line)
	       (setf line next-line)
	    finally (return (values prev-line line))))))
