%-----------------------------------------------
Implemented by Geoff Langdale, 1995.
This file contains an implementation of the algorithm described in:
Uzi Vishkin.
Optimal parallel matchin in strings.
Information and Control, 67, pages 91--113, 1985.
and presented in
Joseph Jaja.
An Introduction to Parallel Algorithms
Addison Wesley, 1992
-------------------------------------------------- %
% bfw takes a pattern and a index and returns the witness value,
by shifting a copy of the pattern by the index and comparing
each character position. %
function bfw(P,j) =
let
len = #P-j;
a = subseq(P, 0, len);
b = subseq(P, j, j+len);
in
find(F, { a == b: a; b });
% Takes a pattern P, i and j st. j>i, a witness array W such that
W[j-i] is filled in. Works out either witness(i) or witness(j)
and returns ((winner, value), loser). Evil incantations, really. %
function witness_duel(P, W, i, j) =
let
k = W[j-i];
in
if (k <= #P-j) then (
if (P[k+j] /= P[k]) then
((j, k) , i)
else
((i, k+j-i), j)
) else (
if (P[k-i] /= P[k+j-i]) then
((j, k-i), i)
else
((i, k-i), j)
);
% Witness_rec returns a completed witness array and the length of
the period if the string is periodic; otherwise -1 %
function witness_rec(P, W, unfinished) =
let
j = unfinished[0];
wit_j = bfw(P,j);
W = W<-[(j,wit_j)];
in
if (wit_j == -1) then % P is periodic %
(W, j)
else (
if (#unfinished == 1) then % P is not periodic, out of candidates %
(W, -1)
else
let
unfinished = drop(unfinished, 1); % drop j %
% pair everthing off, drop off an odd-man-out %
odds = odd_elts(unfinished);
evens = even_elts(unfinished);
(odd_one_out, evens) =
if (#evens /= #odds) then
([evens[#evens-1]],take(evens, #evens-1))
else
([] int, evens);
triples = { witness_duel(P,W,e,o) : e in evens; o in odds };
% merge in the sucessfully calculated witness
values from the witness_duels %
W = W <- { wv_pair : (wv_pair, loser) in triples };
% recurse with the indices that do not yet have
witness values calculated %
unfinished = { loser : (wv_pair,loser) in triples } ++ odd_one_out;
in
witness_rec(P, W, unfinished));
% Calculate the witness function for P %
function witness(P) =
let
W = dist(0,(#P+1)/2+1); % Pad this a bit to avoid annoying cases %
(W, period) = witness_rec(P, W, [1:#W:1]);
periodic = if ((#P == 1) or (period == -1)) then f else t;
in
(W, period, periodic);
function test_witness(P,W) = % testing only %
drop ({ if W[i] == -1 then f else ( P[W[i]] == P[i+W[i]] ) : i in [0:#W:1] } , 1);
% Use brute force to eliminate possible matches %
% depth is constant, but work is number of candidates times size of P %
function brute_force(S, P, candidates) =
{ c in candidates | if ((c+#P)<=#S) then eql(P, subseq(S, c, c+#P)) else f};
% duel: given two possible locations for P in T (i and j) eliminate one %
function duel(S, P, W, i, j) =
let
k = W[j-i];
in
if (j+k<#S) then (if S[j+k] /= P[k] then i else j) else i;
% Take a witness array, a text, a pattern, start and end points %
% and return the potential location of a match (if any) %
function mass_duel(S, P, W, start, end) =
let
size = end-start;
in
if (size == 1) then
start
else
duel(S, P, W, mass_duel(S, P, W, start, (start+end)/2),
mass_duel(S, P, W, (start+end)/2, end));
% String is not periodic. Return array of starting points.%
function real_non_periodic(S, P, W) =
let
bstart = [0:#S:#P/2];
bend = subseq(bstart, 1, #bstart) ++ [#S];
candidates = { mass_duel(S, P, W, bstart, bend): bstart ; bend };
in
brute_force(S, P, candidates);
function non_periodic(S, P, W) =
if (#P == 1) then
{ i in [0:#S:1] ; s | s == p[0]}
else
real_non_periodic(S,P,W);
% String is periodic. Return array of starting points.%
% This one is dirtier. First we use non_periodic to find the largest non-
periodic prefix of P in S. %
function do_periodic(S, P, period, W) =
let
cands = non_periodic(S, subseq(P, 0, 2*period-1), W);
u = subseq(P, 0, period);
v = subseq(P, #P - rem(#P,period) , #P);
k = #P/period;
nump = if (rem(#S,period) == 0) then #S/period else #S/period+1;
u2v = u ++ u ++ v;
% look for u2v at each location %
cands = brute_force(S, u2v, cands);
% M becomes a vector with 1 at every index which has a `uuv' sequence. %
% M is larger than #S by period - this is for convenience using
plus_scan later on (plus_scan is a prescan, and we want to see the
results of adding the last element %
M = dist(0, nump*period+period) <- { (c,1): c in cands };
% p_apart is a series of lists along the lines of
[M[0] M[p] M[2p] ... ] [M[1] M[p+1] M[2p+1] ... ] ... ]
sums is a plus_scan on these p_aparts %
p_apart = transpose(partition(M, dist(period, nump+1)));
sums = { plus_scan(p_apart) : p_apart };
% We find the runs of k-1 1's by comparing a the value at a given index
i the the one at i+k - if the difference is k, then all values were
1: therefore there is a string of solid ones. We then transform the
set of lists back from [ [ 0 p 2p 3p ...] [1 p+1 2p+1 ...] ... ] to
[0 1 2 3 ...] form %
k_runs = flatten(transpose( {
{ (ma-mb) == k - 1 : ma in m4els; mb in m5els }:
m4els in { drop(sums,k-1): sums };
m5els in { subseq(sums, 0, #sums-(k-1)) : sums} }));
in
% and then return the values which have a run of k uuv's (in other words,
u....u (k times) followed by v, which is the periodic string we're
looking for. %
{ i in [0:#k_runs:1] ; M in k_runs | M};
% Analyse text. Return an array of the starting points of P in string S.%
function text_analysis(S, P) =
let
(witness, period, periodic) = witness(P);
in
if (#S < #P) then
[] int
else (
if (periodic) then
do_periodic(S, P, period, witness)
else
non_periodic(S, P, witness));