%----------------------------------------------- Implemented by Geoff Langdale, 1995. This file contains an implementation of the algorithm described in: Uzi Vishkin. Optimal parallel matchin in strings. Information and Control, 67, pages 91--113, 1985. and presented in Joseph Jaja. An Introduction to Parallel Algorithms Addison Wesley, 1992 -------------------------------------------------- % % bfw takes a pattern and a index and returns the witness value, by shifting a copy of the pattern by the index and comparing each character position. % function bfw(P,j) = let len = #P-j; a = subseq(P, 0, len); b = subseq(P, j, j+len); in find(F, { a == b: a; b }); % Takes a pattern P, i and j st. j>i, a witness array W such that W[j-i] is filled in. Works out either witness(i) or witness(j) and returns ((winner, value), loser). Evil incantations, really. % function witness_duel(P, W, i, j) = let k = W[j-i]; in if (k <= #P-j) then ( if (P[k+j] /= P[k]) then ((j, k) , i) else ((i, k+j-i), j) ) else ( if (P[k-i] /= P[k+j-i]) then ((j, k-i), i) else ((i, k-i), j) ); % Witness_rec returns a completed witness array and the length of the period if the string is periodic; otherwise -1 % function witness_rec(P, W, unfinished) = let j = unfinished[0]; wit_j = bfw(P,j); W = W<-[(j,wit_j)]; in if (wit_j == -1) then % P is periodic % (W, j) else ( if (#unfinished == 1) then % P is not periodic, out of candidates % (W, -1) else let unfinished = drop(unfinished, 1); % drop j % % pair everthing off, drop off an odd-man-out % odds = odd_elts(unfinished); evens = even_elts(unfinished); (odd_one_out, evens) = if (#evens /= #odds) then ([evens[#evens-1]],take(evens, #evens-1)) else ([] int, evens); triples = { witness_duel(P,W,e,o) : e in evens; o in odds }; % merge in the sucessfully calculated witness values from the witness_duels % W = W <- { wv_pair : (wv_pair, loser) in triples }; % recurse with the indices that do not yet have witness values calculated % unfinished = { loser : (wv_pair,loser) in triples } ++ odd_one_out; in witness_rec(P, W, unfinished)); % Calculate the witness function for P % function witness(P) = let W = dist(0,(#P+1)/2+1); % Pad this a bit to avoid annoying cases % (W, period) = witness_rec(P, W, [1:#W:1]); periodic = if ((#P == 1) or (period == -1)) then f else t; in (W, period, periodic); function test_witness(P,W) = % testing only % drop ({ if W[i] == -1 then f else ( P[W[i]] == P[i+W[i]] ) : i in [0:#W:1] } , 1); % Use brute force to eliminate possible matches % % depth is constant, but work is number of candidates times size of P % function brute_force(S, P, candidates) = { c in candidates | if ((c+#P)<=#S) then eql(P, subseq(S, c, c+#P)) else f}; % duel: given two possible locations for P in T (i and j) eliminate one % function duel(S, P, W, i, j) = let k = W[j-i]; in if (j+k<#S) then (if S[j+k] /= P[k] then i else j) else i; % Take a witness array, a text, a pattern, start and end points % % and return the potential location of a match (if any) % function mass_duel(S, P, W, start, end) = let size = end-start; in if (size == 1) then start else duel(S, P, W, mass_duel(S, P, W, start, (start+end)/2), mass_duel(S, P, W, (start+end)/2, end)); % String is not periodic. Return array of starting points.% function real_non_periodic(S, P, W) = let bstart = [0:#S:#P/2]; bend = subseq(bstart, 1, #bstart) ++ [#S]; candidates = { mass_duel(S, P, W, bstart, bend): bstart ; bend }; in brute_force(S, P, candidates); function non_periodic(S, P, W) = if (#P == 1) then { i in [0:#S:1] ; s | s == p[0]} else real_non_periodic(S,P,W); % String is periodic. Return array of starting points.% % This one is dirtier. First we use non_periodic to find the largest non- periodic prefix of P in S. % function do_periodic(S, P, period, W) = let cands = non_periodic(S, subseq(P, 0, 2*period-1), W); u = subseq(P, 0, period); v = subseq(P, #P - rem(#P,period) , #P); k = #P/period; nump = if (rem(#S,period) == 0) then #S/period else #S/period+1; u2v = u ++ u ++ v; % look for u2v at each location % cands = brute_force(S, u2v, cands); % M becomes a vector with 1 at every index which has a `uuv' sequence. % % M is larger than #S by period - this is for convenience using plus_scan later on (plus_scan is a prescan, and we want to see the results of adding the last element % M = dist(0, nump*period+period) <- { (c,1): c in cands }; % p_apart is a series of lists along the lines of [M[0] M[p] M[2p] ... ] [M[1] M[p+1] M[2p+1] ... ] ... ] sums is a plus_scan on these p_aparts % p_apart = transpose(partition(M, dist(period, nump+1))); sums = { plus_scan(p_apart) : p_apart }; % We find the runs of k-1 1's by comparing a the value at a given index i the the one at i+k - if the difference is k, then all values were 1: therefore there is a string of solid ones. We then transform the set of lists back from [ [ 0 p 2p 3p ...] [1 p+1 2p+1 ...] ... ] to [0 1 2 3 ...] form % k_runs = flatten(transpose( { { (ma-mb) == k - 1 : ma in m4els; mb in m5els }: m4els in { drop(sums,k-1): sums }; m5els in { subseq(sums, 0, #sums-(k-1)) : sums} })); in % and then return the values which have a run of k uuv's (in other words, u....u (k times) followed by v, which is the periodic string we're looking for. % { i in [0:#k_runs:1] ; M in k_runs | M}; % Analyse text. Return an array of the starting points of P in string S.% function text_analysis(S, P) = let (witness, period, periodic) = witness(P); in if (#S < #P) then [] int else ( if (periodic) then do_periodic(S, P, period, witness) else non_periodic(S, P, witness));