#!/usr/local/bin/perl

require 'f:/tides/segmenter/segmenter.pl';

# Load the LDC Chinese-English lexicon into %ldcgloss (headword => gloss).
# Each line is split at its first whitespace character: the text before it
# is the Chinese headword, the remainder of the line is the English gloss.
# Only the first $max_entries lines are loaded.
# NOTE(review): the file name suggests the lexicon is sorted by frequency,
# so the cut-off presumably keeps the most common entries -- confirm.
my $lexicon_path = 'd:/cmu/cebmt/lexicons/ldc-dic.v3beta.withfreq.sorted';
my $max_entries  = 10000;

open(my $ldc_fh, '<', $lexicon_path)
    or die "Cannot open $lexicon_path: $!";
my $entries_read = 0;
while (my $entry = <$ldc_fh>) {
    last if $entries_read >= $max_entries;

    # Strip any mix of trailing CR/LF so DOS and Unix files both work.
    $entry =~ s/[\n\r]*$//;

    # A line with no whitespace yields an undefined gloss; such headwords
    # are later treated the same as words missing from the lexicon.
    my ($chinese, $english) = split(/\s/, $entry, 2);
    $ldcgloss{$chinese} = $english;

    $entries_read++;
}
close($ldc_fh);

# Scan the segmented Mandarin text and count, in %transwords, every token
# that (1) has length 4 or more, (2) is accepted by allforeign(), and
# (3) has no defined gloss in %ldcgloss.
# NOTE(review): allforeign() is not defined in this file; presumably it
# comes from the required segmenter.pl -- confirm its exact semantics there.
# NOTE(review): for GB-encoded input a length of 4 presumably corresponds
# to two double-byte characters -- confirm.
#open(SEG, "mt02_chinese_evlset_v0.seg-Mandarin.gb") or die $!;
my $seg_path = "993.seg-mandarin.gb";
open(my $seg_fh, '<', $seg_path) or die "Cannot open $seg_path: $!";
while (my $sentence = <$seg_fh>) {
    # Drop trailing line terminators and spaces before tokenising.
    $sentence =~ s/[\r\n ]*$//;

    for my $token (split(/\s+/, $sentence)) {
        # Guards are tested in the same order as the original compound
        # condition, so allforeign() is only called for long-enough tokens.
        next if length($token) < 4;
        next unless allforeign($token);
        next if defined($ldcgloss{$token});

        $transwords{$token}++;
    }
}
close($seg_fh);

# Print every collected word (sorted by string order) to STDOUT as
# "word<TAB>gloss", and the total number of occurrences to STDERR.
# NOTE(review): words are only added to %transwords when they have no
# defined gloss in %ldcgloss, so the gloss column is always empty here;
# the format is kept unchanged in case downstream tools expect it.

# Start from 0 so an empty result prints "Total 0" rather than "Total ".
$total = 0;
foreach my $word (sort keys %transwords) {
    $total += $transwords{$word};
    print "$word\t$ldcgloss{$word}\n";
}

print STDERR "Total $total\n";
