use mStageMatcher; # the actual matcher module use Getopt::Long; # for processing the command line use WordNet::QueryData; # to interface with WordNet # Check for no arguments if($#ARGV == -1) { print "Usage: perl evaluate.pl ... [--stop FILE] [--maxComputations N] [--detailOut FILE [--minFMeasureForDetailOut N]]\n"; exit; } # save the command line for printing later on my $commandLine = join " ", @ARGV; # now get the options! GetOptions("stop=s", "maxComputations=i", "detailOut=s", "minFMeasureForDetailOut=f"); # get the input file name my $inputFile = shift; # get the output file name my $outputFile = shift; open (OUT, ">$outputFile") || die "Couldn't open output file $outputFile for writing\n"; # output the command line to a text file open (COMMANDLINE, ">${outputFile}_commandLine.txt") || die "Couldn't open command line output file {$outputFile}_commandLine.txt for writing\n"; print COMMANDLINE "Program run at: ", (scalar localtime()), "\n"; print COMMANDLINE "Command line: $commandLine\n"; close (COMMANDLINE); # anything left on the command line is a set of modules my @modules = @ARGV; # initialize hash to be input to match subroutine my %inputHash = (); # if maxComputations has been specified, set it up if (defined $opt_maxComputations) { $inputHash{"maxComputations"} = $opt_maxComputations; } # check if we need details output if (defined $opt_detailOut) { $inputHash{"details"} = 1; open (DETAILS, ">$opt_detailOut") || die "Couldn't open details output file $opt_detailOut for writing\n"; # set the minimum f-measure for printing details to 0 (this will be # over-ridden by the --minFMeasureForDetailOut switch if used) $minFMeasureForDetailOut = 0; } if (defined $opt_minFMeasureForDetailOut) { die "ERROR: Switch --minFMeasureForDetailOut used without --detailOut switch. Please use --detailOut switch to specify output file for details\n" unless (defined $opt_detailOut); $minFMeasureForDetailOut = $opt_minFMeasureForDetailOut; } # put in the modules array @{$inputHash{"modules"}} = @modules; # get stop words if provided if (defined $opt_stop) { open (STOP, "$opt_stop") || die "Couldn't open stop file $opt_stop\n"; # create global stop hash %stopHash = (); while () { chomp; $stopHash{$_} = 1; } close(STOP); } # Check the list of modules to see if we need to initialize WordNet foreach (@modules) { if (/^wn_/) { # WordNet based modules are assumed to start with "wn_" $inputHash{"wn"} = WordNet::QueryData->new; last; } } # Make sure pruning is set to "on" $inputHash{"prune"} = 1; # now go through the input file, take pairs of successive lines and # get match scores etc open (INPUT, $inputFile) || die "Couldn't open $inputFile\n"; my $index = 0; my @totalMatches = 0; my @totalFMeasure = 0; my $totalChunks = 0; my $totalAvgChunkLength = 0; # print headings print OUT "Index\t"; foreach (@modules) { print OUT "# matches after $_\tF--measure after $_\t"; } print OUT "# chunks\tAverage words per chunk\n"; while (my $firstString = ) { # normalize the text and remove stop words if requested chomp $firstString; normalizeText(\$firstString); removeStopWords(\$firstString) if (defined $opt_stop); $inputHash{"firstString"} = $firstString; my @tempArray = split /\s+/, $firstString; my $numWordsInFirstString = $#tempArray + 1; my $secondString = ; # normalize the text and remove stop words if requested chomp $secondString; normalizeText(\$secondString); removeStopWords(\$secondString) if (defined $opt_stop); $inputHash{"secondString"} = $secondString; @tempArray = split /\s+/, $secondString; my $numWordsInSecondString = $#tempArray + 1; # Now do the actual matching! match(\%inputHash); print OUT "$index\t"; # Print the # of matches and f--measure at the end of each stage my $totMatchesSoFar = 0; for (my $i = 0; $i <= $#{$inputHash{"matchScore"}}; $i++) { $totMatchesSoFar += ${$inputHash{"matchScore"}}[$i][0]; my $fMeasure = sprintf "%0.2f", computeFMeasure($totMatchesSoFar, $numWordsInFirstString, $numWordsInSecondString); print OUT "$totMatchesSoFar\t"; print OUT "$fMeasure\t"; # add the fmeasure into the input hash for printing in the details later on if needed ${$inputHash{"fMeasures"}}[$i] = $fMeasure if (defined $opt_detailOut); $totalMatches[$i] += $totMatchesSoFar; $totalFMeasure[$i] += $fMeasure; } # Print num chunks and avg words per chunk print OUT $inputHash{"numChunks"}, "\t", $inputHash{"avgChunkLength"}, "\n"; $totalChunks += $inputHash{"numChunks"}; $totalAvgChunkLength += $inputHash{"avgChunkLength"}; # process the details and output to details file if last fMeasure less then threshold print DETAILS processDetails(%inputHash) if ((defined $opt_detailOut) && (${$inputHash{"fMeasures"}}[$#{$inputHash{"fMeasures"}}] < $minFMeasureForDetailOut)); $index++; } close(INPUT); close(DETAILS) if (defined $opt_detailOut); # print the averages print OUT "Averages\t"; for (my $i = 0; $i <= $#modules; $i++) { $totalMatches[$i] /= $index; $totalFMeasure[$i] /= $index; printf OUT "%0.2f\t%0.2f\t", $totalMatches[$i], $totalFMeasure[$i]; } $totalChunks /= $index; $totalAvgChunkLength /= $index; printf OUT "%0.2f\t%0.2f\n", $totalChunks, $totalAvgChunkLength; sub processDetails { my %inputHash = @_; # first print the two strings my $outputString = "First string: " . $inputHash{"firstString"} . "\n"; $outputString .= "Second string: " . $inputHash{"secondString"} . "\n"; # then print the fmeasures $outputString .= "Fmeasures: "; for (my $i = 0; $i <= $#{$inputHash{"modules"}}; $i++) { $outputString .= ${$inputHash{"modules"}}[$i] . " " . ${$inputHash{"fMeasures"}}[$i] . " "; } $outputString .= "\n"; # output the details $outputString .= $inputHash{"detailString"} . "\n\n\n"; return($outputString); } sub computeFMeasure { my $totMatchesSoFar = shift; my $numWordsInFirstString = shift; my $numWordsInSecondString = shift; my $recall = $totMatchesSoFar / $numWordsInFirstString; my $precision = $totMatchesSoFar / $numWordsInSecondString; my $fMeasure = ($recall && $precision) ? (2 * ($recall * $precision) / ($recall + $precision)) : 0; return($fMeasure); } # subroutine to make text more "well behaved". Subroutine copied out of Alon's Meteor script 8/8/2004. # Changes to the code: 1) Removed commented code, 2) Removed conditional call to Porter Stemmer, # 3) Normalized ;-) the name of the subroutine from "NormalizeText" to "normalizeText" sub normalizeText { my $strPtr = shift; # language-independent part: $$strPtr =~ s/^\s+//; $$strPtr =~ s/\n/ /g; # join lines $$strPtr =~ s/(\d)\s+(\d)/$1$2/g; #join digits # language-dependent part (assuming Western languages): $$strPtr =~ tr/[A-Z]/[a-z]/; $$strPtr =~ s/\-/ /g; $$strPtr =~ s/[\(\)\"\'\.\$,\%\;\/]/ /g; #KS 200204 $$strPtr =~ s/[^a-z0-9 ]/ /g; #KS 200204 $$strPtr =~ s/\s+/ /g; # one space only between words $$strPtr =~ s/^\s+//; # no leading space $$strPtr =~ s/\s+$//; # no trailing space } # subroutine to remove stop words from the given sentence sub removeStopWords { $strPtr = shift; my @inWords = split /\s+/, $$strPtr; my @outWords = (); foreach (@inWords) { push @outWords, $_ unless (defined $stopHash{$_}); } $$strPtr = join " ", @outWords; }