#!/usr/local/bin/perl58

# Extract the words in transfer lexicon for use in segmenter

use strict;
use warnings;

# Usage: <this script> [initfile]
#
# Reads a transfer-engine init file, collects every lexicon file named by a
# "loadlex" line, pulls the bracketed token that follows each "|:" marker
# out of those lexicon files (read as GBK), and writes the tokens made up
# solely of Han characters, sorted and de-duplicated, one per line, to
# lexlist-all.txt (UTF-8) in the current directory.

# Init file used when no path is given on the command line.
my $DEFAULT_INIT_FILE =
    "/afs/cs.cmu.edu/project/avenue-1/Avenue/Transfer/Chinese/xfer3.ini";

# Name of the generated word list.
my $wordfile = "lexlist-all.txt";

# Return the lexicon path named by a "loadlex <path>" init-file line, or
# undef (in scalar context) if the line is not a well-formed loadlex line.
# Anything from the first ';' onward is treated as a trailing comment, and
# surrounding whitespace is trimmed from the path.
sub _lexfile_from_init_line {
    my ($line) = @_;

    $line =~ s/[\r\n]*$//;
    return unless $line =~ m/^loadlex/;

    $line =~ s/\s*;.*$//;

    # Non-greedy capture so that trailing blanks are not kept as part of
    # the file name; \S forces at least one real character.
    return unless $line =~ m/^loadlex\s+(\S.*?)\s*$/;
    return $1;
}

# Return the token found in a lexicon line of the form
#     ... |: [token]    or    ... |: ["token"]
# or undef (in scalar context) when the line is a ';' comment or has no such
# token.  Bracket contents containing spaces do not match and are skipped.
sub _chinese_from_lex_line {
    my ($line) = @_;

    return if $line =~ m/^\s*;/;
    return unless $line =~ m/\|:\s*\[(?:\")?([^\"\] ]+)(?:\")?\]/;
    return $1;
}

my $initfile = @ARGV ? $ARGV[0] : $DEFAULT_INIT_FILE;

# --- Pass 1: collect lexicon file names from the init file ---------------
my @lexfiles;

# Three-argument open: the path is never interpreted as a mode string.
open(my $init_fh, "<", $initfile)
    or die "Cannot open init file '$initfile': $!";
while (my $line = <$init_fh>) {
    my $lexfile = _lexfile_from_init_line($line);
    if (defined $lexfile) {
        push @lexfiles, $lexfile;
    }
    elsif ($line =~ m/^loadlex\s/) {
        # Looked like a loadlex directive but carried no file name.
        warn "Ignoring malformed loadlex line $. in '$initfile'\n";
    }
}
close($init_fh);

# --- Pass 2: harvest entries from every lexicon file ---------------------
my %centries;    # token => 1; the hash de-duplicates across files

foreach my $file (@lexfiles) {
    print "Reading $file\n";
    open(my $lex_fh, "<:encoding(gbk)", $file)
        or die "Cannot open lexicon file '$file': $!";
    while (my $line = <$lex_fh>) {
        my $chinese = _chinese_from_lex_line($line);
        next unless defined $chinese;
        $centries{$chinese} = 1;
    }
    close($lex_fh);
}

# --- Pass 3: write the Han-only word list --------------------------------
open(my $out_fh, ">:utf8", $wordfile)
    or die "Cannot open '$wordfile' for writing: $!";
my $count = 0;
foreach my $entry (sort keys %centries) {
    # Drop any entry that contains at least one non-Han character.
    next if $entry =~ m/\P{Han}/;
    print {$out_fh} "$entry\n"
        or die "Cannot write to '$wordfile': $!";
    $count++;
}
# Buffered write errors only surface at close, so check it.
close($out_fh) or die "Cannot close '$wordfile': $!";

print "Total words $count written to $wordfile\n";
