#!/usr/local/bin/perl58

# Extract the words in the transfer lexicon for use in the segmenter.

# Pick the transfer-engine init file: the first command-line argument when
# one is supplied, otherwise the project's default configuration.
$initfile = (@ARGV == 0)
    ? "/afs/cs.cmu.edu/project/avenue-1/Avenue/Transfer/Chinese/xfer3.ini"
    : $ARGV[0];

# Collect the lexicon file names listed by "loadlex" directives in the init
# file.  Each directive has the form
#     loadlex <path>    ; optional trailing comment
# @lexfiles is deliberately a package variable: the script runs without
# "use strict" and the code further down reads it.
@lexfiles = ();

# Three-argument open so that characters in the (possibly user-supplied)
# file name can never be interpreted as an open mode.
open(my $init_fh, "<", $initfile)
    or die "Cannot open init file '$initfile': $!";
while (my $line = <$init_fh>) {
    $line =~ s/[\r\n]*$//;              # strip LF and CRLF line endings
    next unless $line =~ m/^loadlex/;
    $line =~ s/\s*;.*$//;               # drop trailing ';' comment

    # Non-greedy capture so trailing blanks are not part of the file name;
    # only accept the directive when a non-empty name was actually found.
    if ($line =~ m/^loadlex\s+(.*?)\s*$/ && $1 ne "") {
        push @lexfiles, $1;
    } else {
        warn "Ignoring malformed loadlex directive at $initfile line $.: $line\n";
    }
}
close($init_fh);

# Open the two output files in the current directory, written as UTF-8.
# The bareword handles LEX and ENG are used by the code that follows.
# Fail immediately if either cannot be created, rather than silently
# producing no output.
open(LEX, ">:utf8", "xferlexicon.txt")
    or die "Cannot open xferlexicon.txt for writing: $!";
open(ENG, ">:utf8", "xferEnglish.txt")
    or die "Cannot open xferEnglish.txt for writing: $!";

# Read every lexicon file (decoded from GBK) and, for each entry line that
# contains "|:", pull out the first bracketed source-side token and the
# first bracketed token after "->".  Each pair is written to LEX as
# "source<TAB>target", and the target alone is written to ENG.
my $count = 0;    # number of source/target pairs written

foreach my $file (@lexfiles) {
    print "Reading $file\n";
    open(my $lex_fh, "<:encoding(gbk)", $file)
        or die "Cannot open lexicon file '$file': $!";
    while (my $line = <$lex_fh>) {
        next if $line =~ m/^\s*;/;          # skip comment lines
        next unless $line =~ m/\|:/;        # only lines with "|:" are entries

        # Source side: token inside [...] after "|:", optionally quoted.
        # A failed match leaves $src undefined, which means "skip the line".
        my ($src) = ($line =~ m/\|:\s*\[(?:\")?([^\"\] ]+)(?:\")?\]/);
        next unless defined $src && $src ne "";

        # Target side: token inside [...] after "->", optionally quoted.
        my ($tgt) = ($line =~ m/\->\s*\[(?:\")?([^\"\] ]+)(?:\")?\]/);
        next unless defined $tgt && $tgt ne "";

        print LEX "$src\t$tgt\n";
        print ENG "$tgt\n";
        $count++;
    }
    close($lex_fh);
}

# Buffered write errors (e.g. disk full) only surface at close time, so
# check the output handles explicitly.
close(LEX) or die "Error closing xferlexicon.txt: $!";
close(ENG) or die "Error closing xferEnglish.txt: $!";

# The summary used to sit after "exit" and referenced variables that were
# never set; report the real count and output file before exiting.
print "Total words $count written to xferlexicon.txt\n";

exit;
