#We read in the lattice and the reference translation(s) # Then we match each arc in the lattice against the reference # translations and assign a score, based on weighted n-gram match # Finally, we output a histogram of how many lattices you have above # given thresholds (1, 0.9, 0.8, 0.7, etc.) if (@ARGV != 4){ die "Usage: perl LatticeEval.pl Lattice RefTranslations(inSGML) HighestNGram ModifiedLattice\n"; } ##-------------- data items ----------------## $FinalSumScore = 0; $NumRefTranslations = 0; $Sent = 0; ##-------------- data items ----------------## ##-------------- subroutines ---------------## #sub GetAddScoreForSentence(); ##-------------- subroutines ---------------## ##-------------- files ---------------------## open(LatticeFILE,$ARGV[0]) or die "Couldn't open lattice file.\n"; open(RefTransFILE,$ARGV[1]) or die "Couldn't open reference translations file.\n"; $HNGram = $ARGV[2]; # the n of the highest n-gram to be matched open(ModLatticeFILE,">".$ARGV[3]) or die "Couldn't open modified lattice file for writing.\n"; ##-------------- files ---------------------## ##----- read in reference translations -----## while(){ #print "from ref translation file:$_\n"; #must split it up by 1) sysid and 2) sentid #put it in hashes, one for each reference translation if ($_ =~ /sysid/){ $NumRefTranslations++; #create Hash $CurHashName = "RefXs" . $NumRefTranslations; #print "CurHashName:" . $CurHashName . "\n"; $Sent = 1; } elsif ($_ =~ /seg id/){ #then keep this reference translation in the appropriate hash while($_ =~ /\s$/){ chop; } $_ =~ s/^\//; $_ =~ s/\<\/seg\>$//; $_ = " " . $_ . " "; $_ =~ tr/A-Z/a-z/; #print "inserting:|" . $_ . "|\n"; $$CurHashName{$Sent} = $_; #print $$CurHashName{$Sent} . "\n"; $Sent++; } } ##----- read in reference translations -----## ##----- set lambda based on number of reference translations ----## #print "NumRefTranslations:$NumRefTranslations\n"; $lambda = 1 / $HNGram; #print "lambda:$lambda\n"; ##----- set lambda based on number of reference translations ----## ######### for debugging ############################# #then print them all #for ($i = 1; $i <= $NumRefTranslations; $i++){ # $CurHashName = "RefXs" . $i; # print "----- now printing ref translations for ref|" . $CurHashName . "|-------\n"; # $n = keys(%$CurHashName); # print "num keys:" . $n . "\n"; # foreach $key (sort keys %$CurHashName){ # print "key:|" . $key . "|, value:|" . $$CurHashName{$key} . "|\n"; # } #} ######### for debugging ############################# ##------ then read in the lattice ------## # split on the )\n $Sent = 0; while(){ while($_ =~ /\s$/){ chop; } #print "|$_|\n"; ##------ begin reading arcs for a new sentence -------## if ($_ =~ /^$\s*$/){ print ModLatticeFILE $_ . "\n"; if ($Sent != 0){ #for debugging, print the arc representation #foreach $key (sort keys %ThisSentArcs){ # print $key . "\n"; #} #now what? #now call a subroutine that will allow you to find all # the scores for the arcs in this sentence (not arc scores, # but the scores that will go into the final sum). GetAddScoreForSentence(); } $Sent++; #print "------New Sentence, now # $Sent----\n"; } ##------ begin reading arcs for a new sentence -------## ##------ within sentence processing of arcs -------## elsif ($_ =~ /^$\s*$/){ print ModLatticeFILE $_ . "\n"; } else{ #print "."; #then get the part of the arc that is the actual translation $CurMaxArcScore = 0; # a sample arc looks as follows: # (8 10 "IN FRONT OF" 1 "L PN $LI" "@PREP") # we will split by the ", then our guy is the second entry @Parts = split(/\"/,$_); if (@Parts > 1){ $PartTransl = $Parts[1]; $PartTransl =~ tr/A-Z/a-z/; #print "Partial translation:$PartTransl\n"; #now we split this and get n-gram matches against the # reference translation(s) @ArcWords = split(/ /,$PartTransl); #$HNGram $ArcLength = @ArcWords; #print "ArcLength:$ArcLength\n"; #now, match this against the reference translation(s) for ($m = 1; $m <= $NumRefTranslations; $m++){ #get the appropriate hash name $CurHashName = "RefXs" . $m; #print "CurHashName:$CurHashName\n"; $CurRefTr = $$CurHashName{$Sent}; #print "CurRefTr:|$CurRefTr|\n"; @RefWords = split(/\s+/,$CurRefTr); $RefLength = @RefWords - 1; #print "RefLength:$RefLength\n"; for ($i = 1; $i <= $HNGram; $i++){ #first clear the score for all the $i-grams from #the previous arc $CurIGramMatch = $i . "grammatch"; $$CurIGramMatch = 0; if ($i <= $ArcLength){ #print "Now looking at $i-grams\n"; #the sliding window.... $CurIGramMax = $RefLength - $i + 1; for ($j = 0; $j <= $ArcLength-$i; $j++){ #get an i-gram starting at position j $CurNGram = " "; for ($k = 0; $k < $i; $k++){ #j is starting position, k is offset $CurNGram = $CurNGram . $ArcWords[$j+$k] . " "; } #print "CurNGram:|$CurNGram|\n"; if ($CurRefTr =~ $CurNGram){ #match!!! # then what...? $Bonus = 1/$CurIGramMax; ###$Bonus = sprintf("%.10f",$Bonus); #print "Bonus for this match:$Bonus\n"; $$CurIGramMatch += $Bonus; #print "CurIGramMatch is now:$$CurIGramMatch\n"; #also find out what indices in the sentence # are matched (beware: there might be duplicates, e.g. # "the" might match in different positions # we know what i is, so ... # build a sliding window over the reference translation for ($refindbeg = 1; $refindbeg < @RefWords-$i+1; $refindbeg++){ #build an igram starting from this position $MatchIGramRef = " "; for ($build = 0; $build < $i; $build++){ $MatchIGramRef = $MatchIGramRef . $RefWords[$refindbeg+$build] . " "; } if ($CurNGram eq $MatchIGramRef){ #print "MatchIGramRef:|$MatchIGramRef|\n"; #print "matched\n"; #then we know the indices $SpanBeg = $refindbeg; #print "SpanBeg is:|$SpanBeg|\n"; $SpanEnd = $refindbeg + $i; #print "SpanEnd (exclusive) is:|$SpanEnd|\n"; for ($acc = $SpanBeg; $acc < $SpanEnd; $acc++){ $IndsMatchedInSent{$acc} = 1; } } } } } } } # now that we have all the i-gram scores, do the weighted # average $ArcScore = 0; for ($i = 1; $i <= $HNGram; $i++){ $IGramMatch = $i . "grammatch"; #print "Bonus for all $i-gram matches:$$IGramMatch\n"; $$IGramMatch = $$IGramMatch * $lambda; #print "Adjusted bonus for all $i-gram matches:$$IGramMatch\n"; $ArcScore += $$IGramMatch; } #print "ArcScore is:$ArcScore\n"; # then replace the current winner if this arc matches # this reference translation better if ($ArcScore > $CurMaxArcScore){ $CurMaxArcScore = $ArcScore; } } #print "MaxArcScore (over all ref translations):$CurMaxArcScore\n"; #ok, now we've got a score for this arc!!!! print ModLatticeFILE $_ . " : " . $CurMaxArcScore . "\n"; #store this in a hash just for this sentence: # score -> spanned inds (as string: 1,2,3) # we have the matched indices in IndsMatchedInSent -- print them $MatchedIndsString = ""; foreach $key (sort keys %IndsMatchedInSent){ #print "matched index:$key\n"; $MatchedIndsString = $MatchedIndsString . $key . ","; } #print "-------------------------------\n"; #now insert those in a hash JUST FOR THIS SENTENCE $EntryKey = $CurMaxArcScore . "___" . $PartTransl . "___" . $MatchedIndsString; $ThisSentArcs{$EntryKey} = 1; %IndsMatchedInSent = (); if (exists $ScoresHash{$CurMaxArcScore}){ #print "Updating score in ScoresHash\n"; $ScoresHash{$CurMaxArcScore}++; } else{ #print "Inserting new entry in ScoresHash\n"; $ScoresHash{$CurMaxArcScore} = 1; } } } ##------ within sentence processing of arcs -------## } ## ----- process final sentence ----- ## #for debugging, print the arc representation # print "Number of arcs for this sentence:|" . keys(%ThisSentArcs) . "|\n"; # #foreach $key (sort { $b <=> $a } keys %ThisSentArcs){ # foreach $key (sort SortAlphabetically keys %ThisSentArcs){ # print $key . "\n"; # } #then call a subroutine that will add whatever score sum you need to # add for this sentence. GetAddScoreForSentence(); #finally, we can output the histogram of all the scores #print "Now printing entire histogram:\n"; foreach $key (sort keys %ScoresHash){ #print "key:".$key . "\t" . "Num times occurred:" . $ScoresHash{$key} . "\n"; $LatticeHistogramScore = $LatticeHistogramScore + ($key*$ScoresHash{$key}); } #also, we output a count for each of 1, 0.9, 0.8, etc. $LatticeCount = 0; print "Now printing counts for cutoff values:\n"; for ($i = sprintf("%2f",0); $i <= sprintf("%2f",1); $i = sprintf("%2f",$i+0.01)){ $Count = 0; foreach $key (keys %ScoresHash){ if ($key >= $i){ $Count += $ScoresHash{$key}; } #print "key is:$key, i is:$i\n"; if (($key >= $i)&&($key < sprintf("%2f",$i-0.01))){ #print "----falls in the range.\n"; if($BinCounts{$i}){ #print "---existed\n"; $BinCounts{$i} += $ScoresHash{$key}; } else{ #print "----didn't exist\n"; $BinCounts{$i} = $ScoresHash{$key}; } #print "----BinCounts{key} is now: $BinCounts{$i}\n"; } } $LatticeScore = $LatticeScore + ($i * $Count); $LatticeCount = $LatticeCount + $Count; } #now, finally, print out the score of this lattice print "Final sum score for lattice:" . $FinalSumScore . "\n"; ## ------------------------------------------------- ## ## ----------- begin subroutines -------------- ## ## ------------------------------------------------- ## sub GetAddScoreForSentence(){ ## ---- for debugging ---- ## $ScoreForSent = 0; ## ---- for debugging ---- ## #here, we should have access to %ThisSentArcs #foreach $key (sort { $b <=> $a } keys %ThisSentArcs){ foreach $key (sort SortAlphabetically keys %ThisSentArcs){ #print "Now considering arc:$key\n"; #first split it up into: a) score, b) translation, c) indices covered # We only need the translation to split the score into n parts, # where n is the number of words in the translation @Parts = split(/___/,$key); $Score = $Parts[0]; #print "Score is:|$Score|\n"; #partial translation @Words = split(/ /,$Parts[1]); $n = @Words; #print "n:|$n|\n"; @Indices = split(/\,/,$Parts[2]); $NumMatched = @Indices; if ($NumMatched > 0){ $PartScore = $Score / $NumMatched; #print "PartScore is:$PartScore\n"; #print "AllInds is: $Parts[2]\n"; foreach $Index (@Indices){ #print "\tOne covered index:$Index\n"; if ($Index ne ""){ #then find out how many times this index has already been # accounted for by other arcs if (exists ($ThisSentCovIndCounts{$Index})){ $ThisSentCovIndCounts{$Index}++; } else{ $ThisSentCovIndCounts{$Index} = 1; } $AccNum = $ThisSentCovIndCounts{$Index}; #print "Index $Index has already been accounted for $AccNum times.\n"; #$SumScoreArc = $SumScoreArc + e^($AccNum-1) #####$AddScore = $PartScore * (1/(exp(1)**($AccNum-1))); $AddScore = $PartScore * (1/(2**($AccNum-1))); #print "AddScore is:$AddScore\n"; $FinalSumScore = $FinalSumScore + $AddScore; $ScoreForSent += $AddScore; #print "ScoreForSent is:$ScoreForSent\n"; } } } } #print HistOutFILE "----- Sent $Sent -----\n"; #foreach $key (sort keys (%ThisSentCovIndCounts)){ # print HistOutFILE $key . "\t" . $ThisSentCovIndCounts{$key} . "\n"; #} print "Final score for sentence $Sent: $ScoreForSent\n"; %ThisSentCovIndCounts = (); %ThisSentArcs = (); } sub SortAlphabetically{ @aParts = split(/___/,$a); $aScore = $aParts[0]; $aTr = $aParts[1]; $aInds = $aParts[2]; @bParts = split(/___/,$b); $bScore = $bParts[0]; $bTr = $bParts[1]; $bInds = $bParts[2]; if ($aScore > $bScore){ return -1; } elsif ($aScore == $bScore){ if ($aTr gt $bTr){ return -1; } else{ return 1; } } else{ return 1; } }