#!/usr/local/bin/perl

# Input file holding the candidate translations, their scores and traces;
# it is parsed record by record by the main loop of this script.
$transfile = "/usr0/eepeter/Chinese/ctb_np_uniq2.out";

# Chinese source text and English reference text. The main loop uses the
# index of a matching line in the Chinese file to pick the line at the same
# index in the English file, so the two files are expected to be
# line-aligned.
$chinese = "/afs/cs.cmu.edu/project/avenue-1/Avenue/Transfer/Chinese/small-june02.gb";
$english = "/afs/cs.cmu.edu/project/avenue-1/Avenue/Transfer/Chinese/small-june02.en";

# load_sentences($path)
#
# Reads the file at $path and returns its lines as a list, in file order,
# with any trailing carriage-return / line-feed characters removed.
# Dies with a message naming the file if it cannot be opened.
sub load_sentences {
    my ($path) = @_;

    # Three-argument open with a lexical handle: the path is never
    # interpreted as a mode, and the handle is scoped to this sub.
    open(my $fh, '<', $path) or die "Cannot open $path: $!";

    my @sentences;
    while (my $sentence = <$fh>) {
        $sentence =~ s/[\r\n]*$//;
        push @sentences, $sentence;
    }
    close($fh);

    return @sentences;
}

# Load Chinese source
@chinesesents = load_sentences($chinese);

# Load English reference translation
@englishsents = load_sentences($english);

# Read in translations, scores and traces.
#
# Each record in $transfile is consumed as:
#   - one translation line,
#   - one scores line, expected to contain "Prob: <value>,",
#   - zero or more trace lines, one per arc, with the arc's source text
#     sitting between ": " and " (",
#   - a blank line that ends the trace,
#   - one further line, which is read and discarded.
#
# A record is kept (pushed onto @nps as "$prob\t$trans\n\t$src") only when
# its source side has more than one word, its translation contains no
# high-bit bytes, its source text occurs in some line of @chinesesents, and
# its n-gram score against the English line at the same index is >= 0.5.
open(my $np_fh, '<', $transfile) or die "Cannot open $transfile: $!";
while (!eof($np_fh)) {
    my $trans = <$np_fh>;
    chomp($trans);

    # A truncated final record may lack the scores line; treat it as empty.
    my $scores = <$np_fh>;
    $scores = "" unless defined $scores;
    chomp($scores);

    # Capture in list context so that $prob is undef when the line has no
    # "Prob:" field, rather than inheriting $1 from an earlier match.
    my ($prob) = $scores =~ m/Prob: ([^,]+)/;

    # Read in trace, arc by arc, collecting the source text of each arc.
    my @srcparts;
    my $trace = <$np_fh>;
    while (defined $trace and $trace !~ m/^\s*$/) {
        # Use the capture only when this trace line actually matched.
        if ($trace =~ m/: ([^\(]+) \(/) {
            push @srcparts, $1;
        }
        $trace = <$np_fh>;
    }

    # Consume the line that follows the blank trace terminator.
    my $separator = <$np_fh>;

    # The whole record has been consumed, so skipping is safe from here on.
    if (!defined $prob) {
        warn "Skipping record without a Prob field: $trans\n";
        next;
    }

    # Normalise whitespace: trim both ends and collapse every run of
    # whitespace (of any length) to a single space.
    my $src = join(" ", @srcparts);
    $src =~ s/^\s+//;
    $src =~ s/\s+$//;
    $src =~ s/\s+/ /g;

    # Skip single-word sources and translations that still contain
    # high-bit (non-ASCII) bytes.
    next if $src !~ m/\s/;
    next if $trans =~ m/[\x80-\xff]/;

    # Calculate an n-gram precision match against reference translation
    # First, find matching Chinese source. index() does a literal substring
    # search, so bytes in $src are never interpreted as regex syntax.
    my $chinesematch = -1;
    for (my $i = 0; $i < @chinesesents; $i++) {
        if (index($chinesesents[$i], $src) >= 0) {
            $chinesematch = $i;
            last;
        }
    }
    next if $chinesematch == -1;

    # Do n-gram match of translation against the English reference at the
    # same index as the matched Chinese line; a missing line counts as empty.
    my $reference = $englishsents[$chinesematch];
    $reference = "" unless defined $reference;
    my $ngscore = scoreArc($trans, $reference);
    next if $ngscore < 0.5;

    push @nps, "$prob\t$trans\n\t$src";
}
close($np_fh);

# Sort comparator for entries of the form "prob\ttranslation\n\tsource".
# Orders by probability, highest first; entries with equal probability are
# ordered by translation text in ascending string order.
sub byprob {
    my ($left_prob,  $left_trans)  = split(/\t/, $a);
    my ($right_prob, $right_trans) = split(/\t/, $b);

    # Operands are swapped in the numeric comparison to get descending order.
    return ($right_prob <=> $left_prob)
        || ($left_trans cmp $right_trans);
}

# Rank the collected entries with byprob, then write each one to standard
# output followed by a newline.
@sortednps = sort byprob @nps;
print map { "$_\n" } @sortednps;

# scoreArc($arc, $ref)
#
# Scores a candidate translation against a reference string.
#
#   $arc - candidate translation; split into words on whitespace.
#   $ref - reference text to look the n-grams up in.
#
# Every n-gram of $arc of order 1 up to 3 (words joined by single spaces) is
# looked up in $ref, case-insensitively. Returns the number of n-grams found
# divided by the number of n-grams tried, i.e. a value between 0 and 1.
# Returns 0 when $arc contains no words. An undefined argument is treated as
# the empty string.
#
# NOTE: the lookup is a plain substring search on $ref, not a token-aligned
# comparison, so an n-gram also counts when it occurs inside longer words.
sub scoreArc {
    my($arc, $ref) = @_;
    $arc = "" unless defined $arc;
    $ref = "" unless defined $ref;

    # Highest n-gram order taken into account.
    my $HNGram = 3;

    # split ' ' skips leading whitespace, so no empty first word is produced
    # (an empty n-gram would match every reference).
    my @arcwords  = split(' ', $arc);
    my $ArcLength = scalar(@arcwords);

    my $MatchedNGrams = 0;
    my $TotalNGrams   = 0;

    for my $n (1 .. $HNGram) {
        # The range is empty when the arc has fewer than $n words.
        for my $i (0 .. $ArcLength - $n) {
            my $ngram = join(" ", @arcwords[$i .. $i + $n - 1]);
            $TotalNGrams++;

            # \Q...\E makes the n-gram match literally: words such as "("
            # or "a.b" must not be interpreted as regex syntax.
            $MatchedNGrams++ if $ref =~ m/\Q$ngram\E/i;
        }
    }

    return 0 if $TotalNGrams == 0;
    return $MatchedNGrams / $TotalNGrams;
}
