use mStageMatcher; # the actual matcher module
use Getopt::Long; # for processing the command line
use WordNet::QueryData; # to interface with WordNet
# Check for no arguments
if($#ARGV == -1) {
print "Usage: perl evaluate.pl ...
[--stop FILE]
[--maxComputations N]
[--detailOut FILE [--minFMeasureForDetailOut N]]\n";
exit;
}
# save the command line for printing later on
my $commandLine = join " ", @ARGV;
# now get the options!
GetOptions("stop=s", "maxComputations=i", "detailOut=s", "minFMeasureForDetailOut=f");
# get the input file name
my $inputFile = shift;
# get the output file name
my $outputFile = shift;
open (OUT, ">$outputFile") || die "Couldn't open output file $outputFile for writing\n";
# output the command line to a text file
open (COMMANDLINE, ">${outputFile}_commandLine.txt") || die "Couldn't open command line output file {$outputFile}_commandLine.txt for writing\n";
print COMMANDLINE "Program run at: ", (scalar localtime()), "\n";
print COMMANDLINE "Command line: $commandLine\n";
close (COMMANDLINE);
# anything left on the command line is a set of modules
my @modules = @ARGV;
# initialize hash to be input to match subroutine
my %inputHash = ();
# if maxComputations has been specified, set it up
if (defined $opt_maxComputations) {
$inputHash{"maxComputations"} = $opt_maxComputations;
}
# check if we need details output
if (defined $opt_detailOut) {
$inputHash{"details"} = 1;
open (DETAILS, ">$opt_detailOut") || die "Couldn't open details output file $opt_detailOut for writing\n";
# set the minimum f-measure for printing details to 0 (this will be
# over-ridden by the --minFMeasureForDetailOut switch if used)
$minFMeasureForDetailOut = 0;
}
if (defined $opt_minFMeasureForDetailOut) {
die "ERROR: Switch --minFMeasureForDetailOut used without --detailOut switch. Please use --detailOut switch to specify output file for details\n" unless (defined $opt_detailOut);
$minFMeasureForDetailOut = $opt_minFMeasureForDetailOut;
}
# put in the modules array
@{$inputHash{"modules"}} = @modules;
# get stop words if provided
if (defined $opt_stop) {
open (STOP, "$opt_stop") || die "Couldn't open stop file $opt_stop\n";
# create global stop hash
%stopHash = ();
while () {
chomp;
$stopHash{$_} = 1;
}
close(STOP);
}
# Check the list of modules to see if we need to initialize WordNet
foreach (@modules) {
if (/^wn_/) { # WordNet based modules are assumed to start with "wn_"
$inputHash{"wn"} = WordNet::QueryData->new;
last;
}
}
# Make sure pruning is set to "on"
$inputHash{"prune"} = 1;
# now go through the input file, take pairs of successive lines and
# get match scores etc
open (INPUT, $inputFile) || die "Couldn't open $inputFile\n";
my $index = 0;
my @totalMatches = 0;
my @totalFMeasure = 0;
my $totalChunks = 0;
my $totalAvgChunkLength = 0;
# print headings
print OUT "Index\t";
foreach (@modules) {
print OUT "# matches after $_\tF--measure after $_\t";
}
print OUT "# chunks\tAverage words per chunk\n";
while (my $firstString = ) {
# normalize the text and remove stop words if requested
chomp $firstString;
normalizeText(\$firstString);
removeStopWords(\$firstString) if (defined $opt_stop);
$inputHash{"firstString"} = $firstString;
my @tempArray = split /\s+/, $firstString;
my $numWordsInFirstString = $#tempArray + 1;
my $secondString = ;
# normalize the text and remove stop words if requested
chomp $secondString;
normalizeText(\$secondString);
removeStopWords(\$secondString) if (defined $opt_stop);
$inputHash{"secondString"} = $secondString;
@tempArray = split /\s+/, $secondString;
my $numWordsInSecondString = $#tempArray + 1;
# Now do the actual matching!
match(\%inputHash);
print OUT "$index\t";
# Print the # of matches and f--measure at the end of each stage
my $totMatchesSoFar = 0;
for (my $i = 0; $i <= $#{$inputHash{"matchScore"}}; $i++) {
$totMatchesSoFar += ${$inputHash{"matchScore"}}[$i][0];
my $fMeasure = sprintf "%0.2f", computeFMeasure($totMatchesSoFar, $numWordsInFirstString, $numWordsInSecondString);
print OUT "$totMatchesSoFar\t";
print OUT "$fMeasure\t";
# add the fmeasure into the input hash for printing in the details later on if needed
${$inputHash{"fMeasures"}}[$i] = $fMeasure if (defined $opt_detailOut);
$totalMatches[$i] += $totMatchesSoFar;
$totalFMeasure[$i] += $fMeasure;
}
# Print num chunks and avg words per chunk
print OUT $inputHash{"numChunks"}, "\t", $inputHash{"avgChunkLength"}, "\n";
$totalChunks += $inputHash{"numChunks"};
$totalAvgChunkLength += $inputHash{"avgChunkLength"};
# process the details and output to details file if last fMeasure less then threshold
print DETAILS processDetails(%inputHash)
if ((defined $opt_detailOut) && (${$inputHash{"fMeasures"}}[$#{$inputHash{"fMeasures"}}] < $minFMeasureForDetailOut));
$index++;
}
close(INPUT);
close(DETAILS) if (defined $opt_detailOut);
# print the averages
print OUT "Averages\t";
for (my $i = 0; $i <= $#modules; $i++) {
$totalMatches[$i] /= $index;
$totalFMeasure[$i] /= $index;
printf OUT "%0.2f\t%0.2f\t", $totalMatches[$i], $totalFMeasure[$i];
}
$totalChunks /= $index;
$totalAvgChunkLength /= $index;
printf OUT "%0.2f\t%0.2f\n", $totalChunks, $totalAvgChunkLength;
sub processDetails {
my %inputHash = @_;
# first print the two strings
my $outputString = "First string: " . $inputHash{"firstString"} . "\n";
$outputString .= "Second string: " . $inputHash{"secondString"} . "\n";
# then print the fmeasures
$outputString .= "Fmeasures: ";
for (my $i = 0; $i <= $#{$inputHash{"modules"}}; $i++) {
$outputString .= ${$inputHash{"modules"}}[$i] . " " . ${$inputHash{"fMeasures"}}[$i] . " ";
}
$outputString .= "\n";
# output the details
$outputString .= $inputHash{"detailString"} . "\n\n\n";
return($outputString);
}
sub computeFMeasure {
my $totMatchesSoFar = shift;
my $numWordsInFirstString = shift;
my $numWordsInSecondString = shift;
my $recall = $totMatchesSoFar / $numWordsInFirstString;
my $precision = $totMatchesSoFar / $numWordsInSecondString;
my $fMeasure = ($recall && $precision) ? (2 * ($recall * $precision) / ($recall + $precision)) : 0;
return($fMeasure);
}
# subroutine to make text more "well behaved". Subroutine copied out of Alon's Meteor script 8/8/2004.
# Changes to the code: 1) Removed commented code, 2) Removed conditional call to Porter Stemmer,
# 3) Normalized ;-) the name of the subroutine from "NormalizeText" to "normalizeText"
sub normalizeText {
my $strPtr = shift;
# language-independent part:
$$strPtr =~ s/^\s+//;
$$strPtr =~ s/\n/ /g; # join lines
$$strPtr =~ s/(\d)\s+(\d)/$1$2/g; #join digits
# language-dependent part (assuming Western languages):
$$strPtr =~ tr/[A-Z]/[a-z]/;
$$strPtr =~ s/\-/ /g;
$$strPtr =~ s/[\(\)\"\'\.\$,\%\;\/]/ /g; #KS 200204
$$strPtr =~ s/[^a-z0-9 ]/ /g; #KS 200204
$$strPtr =~ s/\s+/ /g; # one space only between words
$$strPtr =~ s/^\s+//; # no leading space
$$strPtr =~ s/\s+$//; # no trailing space
}
# subroutine to remove stop words from the given sentence
sub removeStopWords {
$strPtr = shift;
my @inWords = split /\s+/, $$strPtr;
my @outWords = ();
foreach (@inWords) {
push @outWords, $_ unless (defined $stopHash{$_});
}
$$strPtr = join " ", @outWords;
}