#!/usr/local/bin/perl

# Remove duplicates
# Sort into
# 1. All structural - with changes
# 2. All structural - no changes, passes up structure
# 3. Mixed
# 4. All lexical - use a lexicon

die "Missing name of grammar" if @ARGV == 0;

$srcgrammar = $ARGV[0];


open(IN, $srcgrammar) or die $!;
while ($line = <IN>) {
    if ($line =~ m/^\s*;/) {
	$comments .= $line;
    } elsif ($line =~ m/\{(\w+,\d+)\}/) {
	$ruleid = $1;

    } elsif ($line =~ m/->/) {
	$line =~ m/\[([^\]]+)\]\s*\-\>\s*\[([^\]]+)\]/;
	$src = $1; 
	$tgt = $2; 
	$src =~ s/\s*$//; $tgt =~ s/\s*$//;
	@srcs = split(/\s+/, $src);
	@tgts = split(/\s+/, $tgt);
	#print "Src $src Tgt $tgt\n";

	$alllex = 1;
	$allconst = 1;
	foreach $srctoken (@srcs) {
	    if ($srctoken =~ m/^\"/) {
		$allconst = 0;
	    } else {
		$alllex = 0;
	    }
	}

	foreach $tgttoken (@tgts) {
	    if ($tgttoken =~ m/^\"/) {
		$allconst = 0;
	    } else {
		$alllex = 0;
	    }
	}

	$allmatch = 1;
	if ($allconst == 0) {
	    $allmatch = 0;
	} elsif ($allconst == 1 and scalar(@srcs) == scalar(@tgts)) {
	    for ($i = 0; $i < scalar(@srcs); $i++) {
		if ($srcs[$i] ne $tgts[$i]) {
		    $allmatch = 0;
		}
	    }
	} elsif ($allconst == 1 and scalar(@srcs) != scalar(@tgts)) {
	    $allmatch = 0;
	}
	#print "$ruleid All match $allmatch, all const $allconst, ", scalar(@srcs), " ", scalar(@tgts), "\n";

	# get the rest of rule
	$rule = $line;
	$line = <IN>;
	while (!eof(IN)) {
	    $rule .= $line;
	    if ($line =~ m/^\)/) {
		last;
	    }
	    $line = <IN>;
	}

	if ($seenrules{$rule} == 1) {
	    # skip
	} else {
	    $seenrules{$rule} = 1;
	    $id2rule{$ruleid} = $comments . "{$ruleid}\n" . $rule;
	    if ($alllex == 1) {
		$alllex{$ruleid} = 1;
	    } elsif ($allmatch == 1 and $allconst == 1) {
		$allmatch{$ruleid} = 1;
	    } elsif ($allconst == 1) {
		$allconst{$ruleid} = 1;
	    } else {
		$mixed{$ruleid} = 1;
	    }
	}
	$comments = "";
    } else {

    }

}
close(IN);

sub byid {
    my($atype, $anum) = split(/,/, $a);
    my($btype, $bnum) = split(/,/, $b);
    $atype cmp $btype or
	$anum <=> $bnum;
}

open(SORT, "> sortedids.txt") or die $!;
print SORT "# All constituents, alters structure " . scalar(keys %allconst) . "\n\n";
foreach $ruleid (sort byid keys %allconst) {
    print SORT "$ruleid\n";
}
print SORT "\n# All constituents, structure maintained " . scalar(keys %allmatch) . "\n\n";
foreach $ruleid (sort byid keys %allmatch) {
    print SORT "$ruleid\n";
}
print SORT "\n# All lexical " . scalar(keys %alllex) . "\n\n";
foreach $ruleid (sort byid keys %alllex) {
    print SORT "$ruleid\n";
}
print SORT "\n# Constituents lexical mix " . scalar(keys %mixed) . "\n\n";
foreach $ruleid (sort byid keys %mixed) {
    print SORT "$ruleid\n";
}
close(SORT);


open(SORT, "> sortedrules.txt") or die $!;
print SORT "; -*- coding:cn-gb -*-\n";
print SORT "# All constituents, alters structure " . scalar(keys %allconst) . "\n\n";
foreach $ruleid (sort byid keys %allconst) {
    print SORT "$id2rule{$ruleid}\n";
}
print SORT "\n# All constituents, structure maintained " . scalar(keys %allmatch) . "\n\n";
foreach $ruleid (sort byid keys %allmatch) {
    print SORT "$id2rule{$ruleid}\n";
}
print SORT "\n# All lexical " . scalar(keys %alllex) . "\n\n";
foreach $ruleid (sort byid keys %alllex) {
    print SORT "$id2rule{$ruleid}\n";
}
print SORT "\n# Constituents lexical mix " . scalar(keys %mixed) . "\n\n";
foreach $ruleid (sort byid keys %mixed) {
    print SORT "$id2rule{$ruleid}\n";
}
close(SORT);
