#!/usr/local/bin/perl -w

# Nothing to do without at least one argument.  The complaint goes to STDERR
# and the exit status is non-zero so that a calling shell script or makefile
# can tell the run failed.
unless (@ARGV) {
    print STDERR "Need one or more lexicon file names\n";
    exit 1;
}

# Extract the lexicon file name from one "loadlex <file>" line of an init file.
#   $line - the raw line (may carry a trailing ";" comment and a newline)
# Returns the file name with a trailing ".ids" removed, or undef when the line
# does not have exactly one whitespace-free argument after "loadlex".
sub lexfile_from_loadlex {
    my ($line) = @_;
    $line =~ s/;[^\"\n]*$//;    # drop a trailing ";" comment
    return unless $line =~ m/^\s*loadlex\s+(\S+)\s*$/;
    my $lexfile = $1;
    $lexfile =~ s/\.ids$//;
    return $lexfile;
}

# Build @lexfiles from the command line.  A plain argument is a lexicon file
# name; "-i FILE" names an init file whose "loadlex" lines list lexicon files.
@lexfiles = ();
for (my $argno = 0; $argno < @ARGV; $argno++) {
    if ($ARGV[$argno] ne "-i") {
        push @lexfiles, $ARGV[$argno];
        next;
    }

    # "-i" consumes the following argument as the init file name.
    my $inifile = $ARGV[++$argno];
    die "Option -i requires an init file name" unless defined $inifile;

    # Three-argument open: the name is never interpreted as a mode or pipe.
    open(my $ini, '<', $inifile)
        or die "Cannot open init file $inifile: $!";
    while (my $line = <$ini>) {
        next unless $line =~ m/^\s*loadlex/;
        my $lexfile = lexfile_from_loadlex($line);
        if (defined $lexfile) {
            push @lexfiles, $lexfile;
        } else {
            # Previously an unparsable line pushed undef onto @lexfiles.
            warn "Skipping malformed loadlex line $. in $inifile\n";
        }
    }
    close($ini);
}

# Counters reported in the final summary line.  $lexskipped must start at 0:
# it is otherwise only created by "++" when a LEX duplicate is skipped, and
# the summary would interpolate an undefined value when none is.
$duplicates = 0; $rules = 0; $emptysource = 0; $lexskipped = 0;

# State of the entry being assembled by the second pass.  These are read
# there before any rule line has assigned them, so give them defined values.
$compressed = "";
$entry = "";
$ruleid = "";
$skip = 0;

# Reduce one rule header line (a line containing "|:") to the key stored in
# %haspos.
#   $line - the raw header line
# The line has its line ending and trailing ";" comment removed, whitespace
# collapsed and trimmed, and the first "---..." run removed.  A leading "::"
# is read as "LEX::LEX".  Returns undef for lines that are empty, already
# start with "LEX::", or contain '|: [""]'; otherwise the leading
# "<type>::<word>" is replaced by "LEX::LEX" and the upper-cased line returned.
sub haspos_key {
    my ($line) = @_;
    $line =~ s/[\r\n]*$//;
    $line =~ s/;[^\"\n]*$//;
    $line =~ s/\s+/ /g;
    $line =~ s/^\s*//;
    $line =~ s/\s*$//;
    $line =~ s/\-\-\-[^\"\]]+//;
    $line =~ s/^\:\:/LEX\:\:LEX/;

    return if $line =~ m/^\s*$/
           or $line =~ m/^LEX\:\:/
           or $line =~ m/\|\: \[\"\"\]/;

    $line =~ s/^([^\:]+)\:\:(\S+)/LEX\:\:LEX/;
    return uc($line);
}

# Do a first pass to record, in %haspos, every non-LEX rule header rewritten
# as if it were a LEX::LEX header.  The second pass looks its own headers up
# in %haspos to decide which LEX entries to skip.
foreach my $lexicon (@lexfiles) {
    print "Processing $lexicon\n";

    # Three-argument open: the name is never interpreted as a mode or pipe.
    open(my $dict, '<', $lexicon)
        or die "Cannot open lexicon $lexicon: $!";
    while (my $line = <$dict>) {
        next if $line =~ m/^\s*;/;          # whole-line comment
        next unless $line =~ m/\|:/;        # only rule headers matter here
        my $key = haspos_key($line);
        $haspos{$key} = 1 if defined $key;
    }
    close($dict);
}

# Snapshot the keys gathered in %haspos and report how many there are.
@haspos = keys(%haspos);
printf("Total entries with POS: %d\n", scalar(@haspos));

#exit;

# Build the duplicate-detection key for a complete entry (header line plus
# continuation lines): upper-cased, joined onto one line, whitespace collapsed
# and trimmed, and with the first "---..." run removed.
#   $text - the entry text; undef is treated as empty
sub compress_entry {
    my ($text) = @_;
    my $key = uc(defined $text ? $text : "");
    $key =~ s/;[^\"\n]*$//g;
    $key =~ s/[\r\n]+/ /g;
    $key =~ s/\s+/ /g;
    $key =~ s/^\s*//;
    $key =~ s/\s*$//;
    $key =~ s/\-\-\-[^\"\]]+//;
    return $key;
}

# Classify the entry currently held in $entry and update the global counters.
# A blank entry is ignored.  Otherwise, in order of precedence:
#   - a LEX entry whose source side is '[""]' counts as empty source;
#   - an entry flagged by $skip counts as a skipped LEX duplicate;
#   - an entry whose key is new is appended to $entries under $ruleid;
#   - anything else counts as a duplicate.
# $entry and $skip are left untouched; the caller resets them.
sub flush_entry {
    $compressed = compress_entry($entry);
    return if $compressed =~ m/^\s*$/;

    #$compressed =~ s/([\(\)\-\/\<\>\$\|\*])/\\$1/g;
    if ($compressed =~ m/\|\: \[\"\"\]/ and $compressed =~ m/^\s*LEX/) {
        $emptysource++;
    } elsif ($skip == 1) {
        #print "$compressed\n";
        $lexskipped++;
    } elsif (!defined($compentries{$compressed})) {
        $rules++;
        $entries .= $ruleid . "\n" . $entry . "\n";
        $compentries{$compressed} = 1;
    } else {
        $duplicates++;
    }
}

# Second pass: rewrite each lexicon as "<lexicon>.ids", giving every kept
# entry a "{TYPE,N}" id line and dropping empty-source LEX entries, LEX
# entries whose header is in %haspos, and entries already seen.
# %compentries and %rulenumber deliberately persist across lexicons.
foreach my $lexicon (@lexfiles) {
    # Per-file state.  Resetting it here keeps a skip flag or rule id left
    # over from the previous lexicon from being applied to this one, and
    # avoids reading undefined values before the first rule header.
    $entries = "";
    $entry = "";
    $ruleid = "";
    $skip = 0;
    print "Processing $lexicon\n";

    # Three-argument open: the name is never interpreted as a mode or pipe.
    open(my $dict, '<', $lexicon)
        or die "Cannot open lexicon $lexicon: $!";
    while (my $line = <$dict>) {
        next if $line =~ m/^\s*;/;          # whole-line comment
        $line =~ s/;[^\"]*$//;              # trailing comment
        $line =~ s/\s+/ /g;                 # also turns the line ending into a space
        $line =~ s/^\s*//;
        $line =~ s/\s*$//;

        if ($line =~ m/\|:/) {
            # A new rule header closes the entry collected so far.
            flush_entry();
            $skip = 0;

            # A header starting with "::" is written out as a LEX::LEX header.
            $line =~ s/^\:\:/LEX\:\:LEX/;
            $skip = 1 if defined($haspos{uc($line)});

            # The text before the first "::" names the rule type; a header
            # without "::" is numbered under the empty type.
            my ($ruletype) = ($line =~ m/^\s*([^\:]+)::/);
            $ruletype = defined($ruletype) ? uc($ruletype) : "";
            $rulenumber{$ruletype}++;
            $ruleid = "{" . $ruletype . "," . $rulenumber{$ruletype} . "}";
            $entry = $line . "\n";
        } elsif ($line !~ m/^\s*$/) {
            # Continuation line: drop a leading "{TYPE,N}" id so ids written
            # by an earlier run are not carried into the new output.
            $line =~ s/^\s*\{[^,]+,[^\}]+\}//;
            $entry .= $line . "\n";
        }
    }
    close($dict);

    # The last entry of the file has no following header to trigger a flush.
    flush_entry();
    $entry = "";
    $skip = 0;

    my $newname = $lexicon . ".ids";
    open(my $new, '>', $newname)
        or die "Cannot write $newname: $!";
    print $new $entries;
    # Buffered write errors only surface at close, so check it.
    close($new) or die "Cannot close $newname: $!";
}

# Guard against $lexskipped being undefined (it is only incremented when a
# LEX duplicate is skipped) so the summary always prints a number.
$lexskipped = 0 unless defined $lexskipped;
print "Unique: $rules; Duplicates: $duplicates; Empty source $emptysource; LEX dups $lexskipped\n";
