#!/usr/bin/perl -w

# Aug 1, 2007
# Look at the lexical probability file and see how well it covers the
# words in a corpus and the words in the segmentation list

# Encode everything printed to STDOUT as UTF-8 (the word types printed later
# are decoded text).
binmode(STDOUT, ":utf8");

# Load the segmentation word list (UTF-8, one entry per line) into %cseg.
# $llcount counts the lines read, so a repeated entry is counted every time
# it occurs.  It starts at 0 so that the final report prints a number rather
# than an uninitialized value when the list is empty.
$llcount = 0;
my $seglist_file = "lexlist-withwiki.txt";
open(my $seglist_fh, "<:utf8", $seglist_file)
    or die "Cannot open $seglist_file: $!";
while (my $entry = <$seglist_fh>) {
    chomp($entry);
    $llcount++;
    $cseg{$entry} = 1;
}
close($seglist_fh);


# Load the lexical probability table and remember which words occur in its
# second column (the Chinese word, going by the original variable naming).
# Each distinct second-column value becomes a key of %clexprobs.
#
# Alternative inputs that have been used with this script:
#open(LEX, "<:utf8", "/shared/data/Chinese/giza-large/GIZA.fge") or die $!;
#open(LEX, "<:encoding(gbk)", "/shared/data/Chinese/GIZA.fge") or die $!;
#open(LEX, "<:encoding(gbk)", "/shared/data/Chinese/GIZA.egf") or die $!;
my $lex_file = "/shared/data/Chinese/giza-large/GIZA.egf";
open(my $lex_fh, "<:utf8", $lex_file) or die "Cannot open $lex_file: $!";
while (my $entry = <$lex_fh>) {
    # Drop empty fields so a line that starts with whitespace does not shift
    # the columns by one.
    my @fields = grep { length } split(/\s+/, $entry);

    # Skip blank or truncated lines instead of recording an undefined key.
    next if @fields < 2;

    $clexprobs{$fields[1]} = 1;
}
close($lex_fh);

# Number of distinct words found in the lexical probability table.
$probcount = scalar(keys %clexprobs);

# Count the lexical-probability words that consist solely of Han characters
# and are missing from the segmentation list (%cseg).
# The counter starts at 0 so the final report prints "0", not an
# uninitialized value, when no such word exists.
# NOTE(review): the final report labels this number "Seg types not in lex
# probs", which is the opposite direction of what is counted here -- confirm
# which of the two is intended.
$diffcount = 0;
foreach my $lexword (keys %clexprobs) {
    next if defined($cseg{$lexword});
    next unless $lexword =~ m/\A\p{Han}+\z/;
    #print $lexword, "\n";
    $diffcount++;
}

# Read the segmented corpus (decoded from GBK by the open layer) and count
# how often each whitespace-separated token occurs, in %srcwords.
my $corpus_file = "mt03_chinese_evlset_v0.seg";
open(my $corpus_fh, "<:encoding(gbk)", $corpus_file)
    or die "Cannot open $corpus_file: $!";
while (my $sentence = <$corpus_fh>) {
    chomp($sentence);
    # A line that starts with whitespace makes split return an empty first
    # field; drop it so the empty string is not counted as a word type.
    foreach my $token (grep { length } split(/\s+/, $sentence)) {
        $srcwords{$token}++;
    }
}
close($corpus_fh);

# Walk every distinct word type seen in the corpus and print to STDOUT, one
# per line, those that are absent from the segmentation list (%cseg) but
# present in the lexical probability table (%clexprobs).
# Both counters start at 0 so the final report prints numbers even when the
# corpus is empty.
$hypcount     = 0;
$hypdiffcount = 0;
foreach my $ctype (keys %srcwords) {
    $hypcount++;
    next if defined($cseg{$ctype});
    next unless defined($clexprobs{$ctype});
    $hypdiffcount++;
    print "$ctype\n";
}


=comment
$hypfile = "/shared/data/Chinese/mt03_chinese_evlset_v0-xfer3-070731-1.txt";
open(HYP, "<:encoding(gbk)", $hypfile) or die $!;
while ($line = <HYP>) {
    if ($line =~ m/^\(\s*\d+\s+\d+\s+\"[^\"]*\"\s+\S+\s+\"([^\"]*)\"/) {
	$chinese = $1; $chinese =~ s/\s*$//;
	@cwords = split(/\s+/, $chinese);
	foreach $cword (@cwords) {
	    $ctypes{$cword} = 1;
	}
    }
}
close(HYP);

foreach $ctype (keys %ctypes) {
    $hypcount++;
    if (!defined($clexprobs{$ctype})) {
	$hypdiffcount++;
	print "$ctype\n";
    }
}
=cut

# Summary statistics go to STDERR so they stay separate from the word list
# printed on STDOUT.
# NOTE(review): $diffcount is incremented for lexical-probability words that
# are missing from the segmentation list, while the label below reads
# "Seg types not in lex probs" -- confirm which direction is intended.
my $lex_summary = "Lex prob $probcount, Seg count $llcount, Seg types not in lex probs $diffcount\n";
print STDERR $lex_summary;

my $hyp_summary = "Hyp type count $hypcount, Hyp types not in xfer lex but in lex probs: $hypdiffcount\n";
print STDERR $hyp_summary;
