(* 15-150, Spring 2024 *)
(* Michael Erdmann & Dilsun Kaynar *)
(* Code for Lecture 14: Regular Expressions *)
(************************************************************************)
(* Regular Expressions *)
(* regexp: abstract syntax of regular expressions over characters.
   In the comments below, L(r) is the set of strings described by r. *)
datatype regexp =
Char of char (* Char(a): exactly the one-character string consisting of a *)
| Zero (* the empty language: no string matches *)
| One (* the language containing only the empty string *)
| Plus of regexp * regexp (* Plus(r1,r2): alternation, a string in L(r1) or in L(r2) *)
| Times of regexp * regexp (* Times(r1,r2): concatenation, a string in L(r1) followed by one in L(r2) *)
| Star of regexp (* Star(r): zero or more strings of L(r), concatenated *)
(* match : regexp -> char list -> (char list -> bool) -> bool
REQUIRES: k is total.
ENSURES: match r cs k returns true,
if cs can be split as cs == p@s,
with p representing a string in L(r)
and k(s) evaluating to true;
match r cs k returns false, otherwise.
COMMENTS:
* Sometimes it is convenient to think of the ENSURES as:
match r cs k returns true
iff
cs can be split as cs == p@s,
with p representing a string in L(r)
and k(s) evaluating to true.
This is equivalent to the given ENSURES, so long as one knows
that (match r cs k) reduces to a value (which one can prove).
* There are some subtleties to proving that (match r cs k) reduces
to a value. For instance, a weaker REQUIRES for k than full
totality can simplify the proof. All one needs to know is that
k(cs') reduces to a value for all values cs' that are suffixes
of cs.
* Observe the strong similarity to our prefix function from
Lecture 12. Conceptually the specs for prefix and match are
very similar and you should take advantage of that similarity to
guide your intuition. However, match will actually do some
backtracking search, whereas prefix never needed to.
*)
(* We give three implementations of match below. *)
(* WARNING!
The first and second implementations below have an intentional error!
They may loop forever given a regular expression that contains a
subexpression of the form Star(r) where L(r) contains the empty string.
Ex: match (Star(One)) [#"a"] List.null
will loop forever.
There are two ways to fix this:
(1) We can require that all regular expressions be in standard form.
Doing so would exclude the counterexample.
[See definition of "standard form" below.]
(2) We can check that the cs' passed to the continuation in the
clause for Star(r) gets smaller, i.e., is a proper suffix of cs.
We choose this option for our third implementation below.
[A regular expression r is in "standard form" iff for any occurrence
of Star(r1) within r, L(r1) does not contain the empty string.
It turns out: one can convert any regular expression r into a
different regular expression r' such that r' is in standard form
and L(r) = L(r'). See the notes by Bob Harper in regexp.pdf.]
*)
(* First implementation: *)
(* match r cs k: continuation-passing matcher.
   Tries to split cs into a prefix in L(r) and a remainder accepted by k,
   backtracking through the alternatives via orelse.
   NOTE: intentionally unguarded in the Star case. If the starred
   expression can match the empty string, the recursive call is made on
   the same input and the search may never terminate,
   e.g. match (Star One) [#"a"] List.null. *)
fun match r cs k =
  case r of
    Char a =>
      (* Consume one character equal to a, then hand the rest to k. *)
      (case cs of
         [] => false
       | first :: rest => (first = a) andalso k rest)
  | Zero => false
  | One => k cs
  | Plus (left, right) => match left cs k orelse match right cs k
  | Times (front, back) =>
      (* Match front; its continuation matches back and then runs k. *)
      match front cs (fn remaining => match back remaining k)
  | Star inner =>
      (* Either stop here (zero copies), or match one copy of inner and
         then the whole starred expression again on what is left. *)
      k cs orelse match inner cs (fn remaining => match r remaining k)
(* Second implementation:
This implementation is extensionally equivalent to the first one, but
binds an identifier to the continuation used within the Star(r) clause.
*)
(* match r cs k: same contract and same behaviour as the first
   implementation, including the possible non-termination on Star(r)
   when L(r) contains the empty string. The only difference is that the
   continuation used for Star is bound to a name (loop). *)
fun match (Char _) [] _ = false
  | match (Char a) (c :: rest) k = (a = c) andalso k rest
  | match Zero _ _ = false
  | match One cs k = k cs
  | match (Plus (left, right)) cs k = match left cs k orelse match right cs k
  | match (Times (front, back)) cs k =
      match front cs (fn remaining => match back remaining k)
  | match (Star body) cs k =
      let
        (* loop suffix: accept suffix with k right away, or match one more
           copy of body and come back to loop on whatever remains. *)
        fun loop suffix = k suffix orelse match body suffix loop
      in
        loop cs
      end
(* Third implementation: *)
(* properSuffix : char list * char list -> bool
   REQUIRES: cs' is a suffix of cs
   ENSURES:  properSuffix (cs', cs) returns true if cs' is a proper
             suffix of cs, and returns false otherwise.
   COMMENTS: Only the lengths of the two lists are compared. Given the
             REQUIRES, a suffix of cs is proper exactly when it is
             strictly shorter than cs.
*)
fun properSuffix (cs' : char list, cs : char list) : bool =
  let
    (* shorter (xs, ys) is true iff xs has strictly fewer elements than ys. *)
    fun shorter (_, []) = false
      | shorter ([], _ :: _) = true
      | shorter (_ :: xs, _ :: ys) = shorter (xs, ys)
  in
    shorter (cs', cs)
  end
(* match r cs k: continuation-passing matcher.
   Tries to split cs into a prefix in L(r) and a remainder accepted by k.
   In the Star case the continuation refuses to recurse unless the
   remaining input is strictly shorter than cs, so a starred expression
   that matches the empty string cannot cause an infinite loop. *)
fun match r cs k =
  case (r, cs) of
    (Char _, []) => false
  | (Char a, c :: rest) => (a = c) andalso k rest
  | (Zero, _) => false
  | (One, _) => k cs
  | (Plus (left, right), _) => match left cs k orelse match right cs k
  | (Times (front, back), _) =>
      let
        (* After front has consumed a prefix, match back and then run k. *)
        fun thenBack remaining = match back remaining k
      in
        match front cs thenBack
      end
  | (Star body, _) =>
      let
        (* Repeat the starred expression only if body made progress,
           i.e. what is left is a proper suffix of cs. *)
        fun again remaining =
          properSuffix (remaining, cs) andalso match r remaining k
      in
        k cs orelse match body cs again
      end
(* accept : regexp -> string -> bool
   REQUIRES: With the first or second implementation of match given
             above, r must be in standard form.
             With the third implementation there is no such requirement.
   ENSURES:  accept r s returns true, if s is in L(r);
             accept r s returns false, otherwise.
*)
fun accept r s =
  let
    val chars = String.explode s
  in
    (* List.null is the final continuation: the match succeeds only if
       no characters are left over. *)
    match r chars List.null
  end
(* Examples *)
(* Single-character regular expressions used to build the examples below. *)
val Ca = Char #"a"
and Cb = Char #"b"
(* Below are five regular expression acceptors:
Regular Expression r1: aa
Regular Expression r2: (a+b)*
Regular Expression r3: (a+b)*aa(a+b)*
Regular Expression r4: (a+1)(b+ba)*
Regular Expression r5: (a + ab)(a + b)
Note: the language L(r3) consists of all strings over the alphabet
{a, b} that contain at least two consecutive a's, while the language
L(r4) consists of all strings over {a, b} that do not contain two
consecutive a's.
*)
local
  (* (a+b)* -- used by both acceptor2 and acceptor3 *)
  val anyAB = Star (Plus (Ca, Cb))
in
  (* aa *)
  val acceptor1 = accept (Times (Ca, Ca))

  (* (a+b)* *)
  val acceptor2 = accept anyAB

  (* (a+b)*aa(a+b)* *)
  val acceptor3 = accept (Times (Times (Times (anyAB, Ca), Ca), anyAB))

  (* (a+1)(b+ba)* *)
  val acceptor4 =
    accept (Times (Plus (Ca, One), Star (Plus (Cb, Times (Cb, Ca)))))

  (* (a + ab)(a + b) *)
  val acceptor5 = accept (Times (Plus (Ca, Times (Ca, Cb)), Plus (Ca, Cb)))
end
(* These can be used, for example, as follows: *)
(* Each binding below doubles as a check: if an acceptor returned the
   other boolean, the pattern would fail to match and Bind would be raised. *)

(* acceptor1: aa *)
val true = acceptor1 "aa" (* returns true *)
val false = acceptor1 "ab" (* returns false *)

(* acceptor2: any string over {a, b}; the c below is rejected *)
val true = acceptor2 "abababb" (* returns true *)
val false = acceptor2 "abacabb" (* returns false *)

(* acceptor3: contains two consecutive a's *)
val true = acceptor3 "ababbbbabaaabbb" (* returns true *)
val false = acceptor3 "ababbbbabababbb" (* returns false *)

(* acceptor4: does not contain two consecutive a's *)
val false = acceptor4 "ababbbbabaaabbb" (* returns false *)
val true = acceptor4 "ababbbbabababbb" (* returns true *)

(* acceptor5: (a + ab)(a + b).
   "aba" needs backtracking: choosing a for the first factor leaves "ba",
   which the second factor cannot finish, so the matcher falls back to
   choosing ab and then matches the final a. *)
val true = acceptor5 "ab" (* returns true *)
val true = acceptor5 "aba" (* returns true *)
val false = acceptor5 "a" (* returns false *)
(************************************************************************)