;;;; -*- Scheme -*-
;;;; $Id: char-set.scm,v 1.4 1992/01/26 13:43:46 bevan Exp $

;;;+file-overview
;;;
;;; A simple implementation of Icon csets i.e. sets specialised for
;;; holding characters.  As always, this implementation is meant to be
;;; portable rather than very efficient.
;;;
;;;-file-overview
;;;+copyright
;;;
;;; Copyright (c) 1991 Department of Computer Science, University of Manchester
;;; All rights reserved.
;;;
;;;-copyright
;;;+author
;;;
;;; Stephen J. Bevan <bevan@cs.man.ac.uk>
;;;
;;;-author
;;;+implementation
;;;
;;; The character sets are represented as vectors of the form :-
;;;
;;;    #(number-of-characters characters     )
;;;      <      0           > < 1 to 128/256 >
;;;
;;; The first element contains the number of characters in the set,
;;; the rest  one element for each character i.e. it requires at least
;;; 128 or 256 bytes to hold a character set.
;;; This format provides constant time lookup, but wastes a lot of
;;; memory.  The best way to implement char-sets is via bit-vectors
;;; where one bit represents each character.  Using this method a
;;; whole 256 character set takes up only 32 bytes. (assuming 8 bit
;;; bytes).  If you Scheme has bit-vectors, or even bit operations on
;;; integers (fixnums), then I suggest you re-code the following using
;;; them. 
;;;
;;; Note this code has been written with too much knowledge of vectors
;;; built in.  Sometime, I should re-write this so that it has an
;;; abstract interface.  This would allow you to (fairly) easily
;;; change the implementation without having to modify all the
;;; functions.
;;;
;;;-implementation
;;;+extensions
;;;
;;; provide/require
;;;   a la old CommonLisp.  Just remove the line if you Scheme doesn't use
;;;   provide/require
;;;
;;; error
;;;   Most Schemes have one of these, if your doesn't then just
;;;   uncomment the one below
;;;
;;;-extensions

;;; If your system doesn't have "error", then try the following.  It should
;;; work ok for any errors produced in this file
;(define error
;  (lambda (function-error-occurs-in error-message-to-print . any-arguments)
;    (display "ERROR : ")
;    (display function-error-occurs-in)
;    (display " ")
;    (display error-message-to-print)
;    (for-each
;     (lambda (argument)
;       (display " ")
;       (display argument))
;     any-arguments)
;    (newline)
;    (let loop ()			; infinite loop
;      (loop))))			; to stop processing continuing.


;;; The number of characters that can be in a char-set + 1
;;; If you only want small char sets, you could alter this to 128.

(define char-set::vector-size 257)



;;; Returns true/false depending on whether there is a character at
;;; the given `position-to-look-at' in the `char-set-to-look-in'

(define char-set::at
  (lambda (char-set-to-look-in position-to-look-at)
    (vector-ref char-set-to-look-in position-to-look-at)))



;;;+fs
;;;
;;; Returns the number of characters in the `char-set'
;;;
;;;-fs

(define char-set:size
  (lambda (char-set)
    (vector-ref char-set 0)))



;;; Update the size of the char-set in place.

(define char-set::size!
  (lambda (char-set new-count)
    (vector-set! char-set 0 new-count)))



;;; Subract 1 (in place) from the number of characters in the `char-set'

(define char-set::1-size!
  (lambda (char-set)
    (char-set::size! char-set (- (char-set:size char-set) 1))))



;;; Add 1 (in place) to the number of characters in the `char-set'

(define char-set::1+size!
  (lambda (char-set)
    (char-set::size! char-set (+ (char-set:size char-set) 1))))



;;;+fs
;;;
;;; Returns a new char-set containing the complement of the
;;; `char-set-to-complement'.  This is equivalent do doing 
;;;
;;; > (char-set:difference char-set:all char-set-to-complement)
;;;
;;;-fs
;;; As you can see the above is how it is acutally implemented!

(define char-set:complement
  (lambda (char-set-to-complement)
    (char-set:difference char-set:all char-set-to-complement)))



;;;+fs
;;;
;;; Creates a copy of the `char-set-to-copy'
;;;
;;;-fs

(define char-set:copy
  (lambda (char-set-to-copy)
    (let ((new-vector (char-set:make)))
      (let loop ((current-position 0))
	(if (= current-position char-set::vector-size)
	    new-vector
	    (begin
	      (vector-set! new-vector
			   current-position
			   (char-set::at char-set-to-copy current-position))
	      (loop (+ 1 current-position))))))))



;;;+fs
;;;
;;; Returns a new char-set containing the difference of the
;;; `char-set-to-subtract-from' and the `char-set-to-subtract'
;;;
;;;-
;;;+fe
;;;
;;; For example if you difference all the alphanumeric characters and
;;; the alphabetic ones, you are left with just the digits.
;;;
;;; > (char-set:as-string
;;; >  (char-set:difference char-set:alphanumeric char-set:alphabetic)
;;; "0123456789"
;;;
;;;-

(define char-set:difference
  (lambda (char-set-to-subtract-from char-set-to-subtract)
    (let ((new-char-set (char-set:copy char-set-to-subtract-from)))
      (char-set:difference! new-char-set char-set-to-subtract)
      new-char-set)))



;;;+fs
;;;
;;; The same as `difference' except that it modifies its first
;;; argument.  Also there result of this is undefined.
;;;
;;;-fs
;;;+fe
;;;
;;; The following shows that the first argument is altered, but that
;;; the second is not.
;;; > (let ((a (char-set:from-range #\a #\d))
;;; >       (b (char-set:from-range #\b #\c)))
;;; >   (char-set:difference! a b)
;;; >   (list (char-set:to-string a) (char-set:to-string b)))
;;; ("ad" "bc")
;;;
;;;-fe

(define char-set:difference!
  (lambda (char-set-to-subtract-from char-set-to-subtract)
    (let loop ((position-in-sets (- char-set::vector-size 1)))
      (if (not (zero? position-in-sets))
	  (begin
	    (if (and (char-set::at char-set-to-subtract position-in-sets)
		     (char-set::at char-set-to-subtract-from position-in-sets))
		(begin
		  (vector-set! char-set-to-subtract-from position-in-sets #f)
		  (char-set::1-size! char-set-to-subtract-from)))
	    (loop (- position-in-sets 1)))))))



;;;+fs
;;;
;;; Returns #t if the `char-set' and the `other-char-set' contain the
;;; same characters.
;;;
;;;-fs

(define char-set:equal?
  (lambda (char-set other-char-set)
    (let loop ((position-in-sets (- char-set::vector-size 1))
	       (result (= (char-set:size char-set)
			  (char-set:size other-char-set))))
      (cond
       ((zero? position-in-sets) result)
       ((not result) result)
       (else
	(loop (- position-in-sets 1)
	      (eq? (char-set::at char-set position-in-sets)
		   (char-set::at other-char-set position-in-sets))))))))


;;;+fs
;;;
;;; Turns the `char-list-to-convert' into a char-set.
;;;
;;;-fs

(define char-set:from-list
  (lambda (char-list-to-convert)
    (let ((new-char-set (char-set:make)))
      (let loop ((l char-list-to-convert))
	(if (null? l)
	    new-char-set
	    (begin
	      (char-set:insert! new-char-set (car l))
	      (loop (cdr l))))))))



;;;+fs
;;;
;;; Turns the `integer-list-to-convert' into a char-set.
;;;
;;;-fs
(define char-set:from-integer-list
  (lambda (integer-list-to-convert)
    (let ((new-char-set (char-set:make)))
      (let loop ((l integer-list-to-convert))
	(if (null? l)
	    new-char-set
	    (begin
	      (char-set:insert! new-char-set (integer->char (car l)))
	      (loop (cdr l))))))))



;;;+fs
;;;
;;; Create a char-set using the given `lower-bound' and `upper-bound'
;;; as the range of characters to create.  Both of the bounds should
;;; be integers.  This allows you to specify characters that cannot be
;;; portably represented by the normal character representation i.e.
;;; the #\ notation.  (At least this is how I read the P1178/D4
;;; standard, ... you know, the one you are not supposed to claim
;;; conformance to :-)
;;;
;;;- +fe
;;;
;;; > (char-set:to-string (char-set:from-integer-range 48 57))
;;; "0123456789"
;;;
;;;-

(define char-set:from-integer-range
  (lambda (lower-bound upper-bound)
    (let ((new-char-set (char-set:make))
	  (end-position (+ 2 upper-bound)))
      (let loop ((current-position (+ 1 lower-bound)))
	(if (= current-position end-position)
	    new-char-set
	    (begin
	      (vector-set! new-char-set current-position #t)
	      (char-set::1+size! new-char-set)
	      (loop (+ 1 current-position))))))))



;;;+fs
;;;
;;; Create a char-set using the given `lower-bound' and `upper-bound'
;;; as the range of characters to create.  Both of the bounds should
;;; be characters and `lower-bound' <= `higher-bound'
;;;
;;;-fs
;;; +fe
;;;
;;; > (char-set:to-string (char-set:from-range #\a #\d))
;;; "abcd"
;;;
;;;-
(define char-set:from-range
  (lambda (lower-bound upper-bound)
    (let ((new-char-set (char-set:make))
	  (end-position (+ 2 (char->integer upper-bound))))
      (let loop ((current-position (+ 1 (char->integer lower-bound))))
	(if (= current-position end-position)
	    new-char-set
	    (begin
	      (vector-set! new-char-set current-position #t)
	      (char-set::1+size! new-char-set)
	      (loop (+ 1 current-position))))))))



;;;+fs
;;;
;;; Trun the `string-to-convert' into a char-set.
;;;
;;;-fs

(define char-set:from-string
  (lambda (string-to-convert)
    (let ((new-char-set (char-set:make))
	  (length-of-string-to-convert (string-length string-to-convert)))
      (let loop ((current-position 0))
	(if (= current-position length-of-string-to-convert)
	    new-char-set
	    (begin
	      (char-set:insert!
	       new-char-set
	       (string-ref string-to-convert current-position))
	      (loop (+ current-position 1))))))))



;;;+fs
;;;
;;; Returns a new character set containing all the characters in
;;; `char-set-to-add-to' as well as `char-to-add'
;;;
;;;- +fe
;;;
;;; The following example shows that `char-set-to-add-to' is not
;;; altered by the insertion of a character.
;;;
;;; > (let ((a (char-set:from-range #\a #\z)))
;;; >   (let ((b (char-set:insert a #\A)))
;;; >     (list (char-set:to-string a) (char-set:to-string b))))
;;; ("abcdefghijklmnopqrstuvwxyz" "Aabcdefghijklmnopqrstuvwxyz")
;;;
;;;-fs

(define char-set:insert
  (lambda (char-set-to-add-to char-to-add)
    (let ((new-char-set (char-set:copy char-set-to-add-to)))
      (char-set:insert! new-char-set char-to-add)
      new-char-set)))



;;;+fs
;;;
;;; The same ast `insert' except that it modifies `char-set-to-add-to'
;;; whilst adding `char-to-add'
;;; Returns : unspecified
;;;
;;;-fs

(define char-set:insert!
  (lambda (char-set-to-add-to char-to-add)
    (let ((position-of-char (+ 1 (char->integer char-to-add))))
      (if (not (char-set::at char-set-to-add-to position-of-char))
	  (char-set::1+size! char-set-to-add-to))
      (vector-set! char-set-to-add-to position-of-char #t))))



;;;+fs
;;;
;;; Create a new character set containg no characters.
;;; Returns : char-set
;;;
;;;-fs

(define char-set:make
  (lambda ()
    (let ((new-char-set (make-vector char-set::vector-size)))
      (let loop ((current-position (- char-set::vector-size 1)))
	(if (zero? current-position)
	    (begin
	      (char-set::size! new-char-set 0)
	      new-char-set)
	    (begin
	      (vector-set! new-char-set current-position #f)
	      (loop (- current-position 1))))))))



;;;+fs
;;;
;;; Returns true if the `char-to-look-for' is in the `char-set-to-look-in'
;;;
;;;-fs

(define char-set:member?
  (lambda (char-set-to-look-in char-to-look-for)
    (char-set::at char-set-to-look-in (+ 1 (char->integer char-to-look-for)))))



;;;+fs
;;;
;;; Turns the `char-set-to-convert' into a list of characters.
;;; The order of the characters in the list is undefined.
;;;
;;;-fs

(define char-set:to-list
  (lambda (char-set-to-convert)
    (let loop ((position-in-set (- char-set::vector-size 1))
	       (char-set-as-list '()))
      (if (zero? position-in-set)
	  char-set-as-list
	  (loop (- position-in-set 1)
		(if (char-set::at char-set-to-convert position-in-set)
		    (cons (integer->char (- position-in-set 1))
			  char-set-as-list)
		    char-set-as-list))))))

;;;+fs
;;;
;;; Returns a string containing the characters in the `char-set-to-convert'
;;;
;;;-fs

(define char-set:to-string
  (lambda (char-set-to-convert)
    (let* ((size-of-set (char-set:size char-set-to-convert))
	   (char-set-as-string (make-string size-of-set)))
      (let loop ((position-in-set (- char-set::vector-size 1))
		 (position-in-string (- size-of-set 1)))
	(if (zero? position-in-set)
	    char-set-as-string
	    (if (char-set::at char-set-to-convert position-in-set)
		(begin
		  (string-set! char-set-as-string
			       position-in-string
			       (integer->char (- position-in-set 1)))
		  (loop (- position-in-set 1) (- position-in-string 1)))
		(loop (- position-in-set 1) position-in-string)))))))



;;;+fs
;;;
;;; Create a new char-set that is a union of the `first-char-set' and
;;; the `second-char-set' i.e. the new char-set will contain all the
;;; characters that were in both of the input sets.
;;;
;;;-fs

(define char-set:union
  (lambda (first-char-set second-char-set)
    (let ((new-char-set (char-set:copy first-char-set)))
      (char-set:union! new-char-set second-char-set)
      new-char-set)))



;;;+fs
;;;
;;; The same as `union' except that it modifies its first argument
;;; i.e. all the characters in the `second-char-set' are added to the
;;; `first-char-set' in place.
;;; Returns : unspecified.
;;;
;;;-fs

(define char-set:union!
  (lambda (first-char-set second-char-set)
    (let loop ((position-in-sets (- char-set::vector-size 1)))
      (if (not (zero? position-in-sets))
	  (begin
	    (if (and (char-set::at second-char-set position-in-sets)
		     (not (char-set::at first-char-set position-in-sets)))
		(begin
		  (vector-set! first-char-set position-in-sets #t)
		  (char-set::1+size! first-char-set)))
	    (loop (- position-in-sets 1)))))))



;;; Some standard characters sets that you might find useful.

;;; All the characters in the character set.
(define char-set:all
  (char-set:from-integer-range 0 255))

(define char-set:digits
  (char-set:from-range #\0 #\9))

(define char-set:lower-case
  (char-set:from-range #\a #\z))

(define char-set:upper-case
  (char-set:from-range #\A #\Z))

(define char-set:alphabetic
  (char-set:union char-set:lower-case char-set:upper-case))

(define char-set:alphanumeric
  (char-set:union char-set:alphabetic char-set:digits))

(define char-set:whitespace
  (char-set:from-integer-list '(9 10 13 32)))

;(provide 'char-set)

;;;+file-examples

;;; A test for all (some?) of the functions defined in the file.
;;;
(define (char-set:test:all)

  (define (char-set:test:s-eq test-name a b)
    (if (not (string=? a b))
	(error test-name a b)))
  (define (char-set:test:n-eq test-name a b)
    (if (not (= a b))
	(error test-name a b)))
  (define (char-set:test:b-eq test-name a b)
    (if (not (eq? a b))
	(error test-name a b)))
  (define (char-set:test:set-eq test-name a b)
    (if (not (char-set:equal? a b))
	(error test-name a b)))
  (define (char-set:test:l-eq test-name comparison a b)
    (cond
     ((null? a) (or (null? b) (error test-name a b)))
     ((null? b) (or (null? a) (error test-name a b)))
     (else (and (comparison (car a) (car b))
		(char-set:test:l-eq test-name comparison (cdr a) (cdr b))))))
  (define (char-set:test:nl-eq test-name a b)
    (char-set:test:l-eq test-name = a b))
  (define (char-set:test:sl-eq test-name a b)
    (char-set:test:l-eq test-name string=? a b))

  (char-set:test:s-eq 'to-string-a
   (char-set:to-string char-set:digits) "0123456789")

  (char-set:test:b-eq 'equal-a
    (char-set:equal? char-set:digits (char-set:copy char-set:digits))
    #t)

  (char-set:test:set-eq 'from-range-a
   char-set:digits (char-set:from-range #\0 #\9))
   
  (char-set:test:set-eq 'difference-a
    (char-set:difference char-set:alphanumeric char-set:alphabetic)
    char-set:digits)

  (char-set:test:sl-eq 'difference!-a
    (let ((a (char-set:from-range #\a #\d))
	  (b (char-set:from-range #\b #\c)))
      (char-set:difference! a b)
      (list (char-set:to-string a) (char-set:to-string b)))
    '("ad" "bc"))

  (char-set:test:sl-eq 'insert-a
    (let ((a (char-set:from-range #\a #\z)))
      (let ((b (char-set:insert a #\A)))
       (list (char-set:to-string a) (char-set:to-string b))))
    '("abcdefghijklmnopqrstuvwxyz" "Aabcdefghijklmnopqrstuvwxyz"))

  (char-set:test:b-eq 'member-a
    (char-set:member? char-set:digits #\0) #t)
  (char-set:test:b-eq 'member-b
    (char-set:member? char-set:digits #\a) #f)

  (display "all tests passed") (newline))

;;;-file-examples
