#!/usr/local/bin/perl -w
#new evaluation metric that takes into account both precision and recall and 
#scores based on overall word order

#########################################
##         Usage                       ##
#########################################
$scriptName = $0;
$scriptName =~ s,^.*/,,;
$scriptName =~ s,^.*\\,,;

&PrintUsage if (@ARGV < 4);

#########################################
##         Process Arguments           ##
#########################################
# Stand-alone script; no require's and no use's
$argstr = " @ARGV ";
print STDOUT "$scriptName $argstr\n";
$dbgLevel = 0;
if ($argstr =~ /\s*-d\s+(.*?)\s+/) {
    $dbgLevel = $1;
}

if ($argstr =~ /\s*-t\s+(.*?)\s+/) {
    $testsgmFileName = $1;
} else {
    print STDERR "test file is not specified.\n";
    &PrintUsage();
}

if ($argstr =~ /\s*-r\s+(.*?)\s+/) {
    $refsgmFileName = $1;
} else {
    print STDERR "reference file is not specified.\n";
    &PrintUsage();
}

if ($argstr =~ /\s*-s\s+(.*?)\s+/) {
    $evalSysId = $1;
} else {
    print STDERR "system id is not specified.\n";
    &PrintUsage();
}

#########################################
##     Open log files                  ##
#########################################
open logFilePerSent,">$evalSysId.scorePerSent.log" if $dbgLevel > 0;
open debug,">debug.txt";

#########################################
##     Initialization of variables     ##
#########################################
my $hypObj;       # a candidate translation unit (corresponds a *source* unit)
my @refObjs;      # reference translations parallel to the candidate translation unit

my $hypNum = 0;   # ID of the hypothesis unit; sequential for now. sgml markup should have this?
my $score = 0;

my $docId;        # ID of the current doc
my $lineNum = 0;  # Unit number of the current hypothesis unit within the current doc
my @hyplines;
my @reflines;

my @refids;     # names of all references
my @testids;    # names of all test systems; only one of these will be scored.
my @hypdocids;  

my %evalDoc;
my %hypDoc;
my %refDoc;

my $precision;
my $recall;
my $f1;
my $matches = 0;
my $hyplength = 0;
my $reflength = 0;
my $flips = 0;
my $totFlips = 0;
my $penalty;

##############################################
##           The Main Loop:                 ##
##############################################
&ReadAllSGMFiles();

while (&GetHypRefs()) {
    &ScoreHyp();
}

$precision = $matches/$hyplength;
$recall = $matches/$reflength;
$f1 = 2 * $precision * $recall/($precision + $recall);
$penalty = 2**(-$flips/$totFlips);
$score = $f1 * $penalty;
$precision = &Trunc($precision);
$recall = &Trunc($recall);
$f1 = &Trunc($f1);
$score = &Trunc($score);
$penalty = &Trunc($penalty);

#report scores
print STDOUT "\n\nSystem,$evalSysId\nSegsScored,$hypNum\nPrecision,$precision\nRecall,$recall\n1-Factor,$f1\nSortPenalty,$penalty\nScore,$score\n";

exit 0;

##############################################
##           Subroutines                    ##
##############################################
sub ScoreHyp {

 my $precision = 0;
 my $recall = 0;
 my $f1 = 0;
 my $segmatches = 0;
 my $seghyplength = 0;
 my $segreflength = 0;
 my $segflips = 0;
 my $segTotFlips = 0;
 my $penalty;
 my $bestmatches = 0;
 my $besthyplength = 0;
 my $bestreflength = 0;
 my $bestflips = 0;
 my $bestTotFlips = 0;
 my $bestscore = 0;
 my $score = 0;
 my $bestref;

 &NormalizeText(\$hypObj);

 @hypwords = split(/ /,$hypObj);
 $seghyplength = @hypwords;

 foreach $reftrans (@refObjs) {
     &NormalizeText(\$reftrans);

     open input,">input.txt";
     print input "$hypObj\n";
     print input "$reftrans\n";
     print debug "$hypObj\n";
     print debug "$reftrans\n\n";


     @output = `java TextMatcher input.txt`;

     print debug $output[0];
     print debug $output[1];
     chop($output[0]);
     chop($output[1]);

     @refwords = split(/ /,$reftrans);
     $segreflength = @refwords; 
  
#     $segmatches = 6;
#     $segflips = 3;

     if ($output[0] == 0){
	 $score = 0;
     }

     elsif ($output[0] == 1){
	 $segflips = 0;
         $segTotFlips = 1;
         $segmatches = 1;
         $precision = $segmatches/$seghyplength;
         $recall = $segmatches/$segreflength;
         $f1 = 2 * $precision * $recall/($precision + $recall);
         $penalty = 1;
         $score = $f1 * $penalty;         
     }

     else {
     $segmatches = $output[0];
     $segflips = $output[1];
     $segTotFlips = $segmatches * ($segmatches - 1) / 2; 

     #get precision and recall
     $precision = $segmatches/$seghyplength;
     $recall = $segmatches/$segreflength; 


     #calculate score
     $f1 = 2 * $precision * $recall/($precision + $recall);
     $penalty = 2**(-$segflips/$segTotFlips);
     $score = $f1 * $penalty;
     }      

     #update info for best score
     if ($score >= $bestscore) {
         $bestscore = $score;
	 $bestmatches = $segmatches;
         $besthyplength = $seghyplength;
         $bestreflength = $segreflength;
         $bestflips = $segflips;
         $bestTotFlips = $segTotFlips;
         $bestref = $reftrans;
     }

  }

  if ($bestscore == 0){
     $score = 0;
     $precision = 0;
     $recall = 0;
     $penalty = 0;
     $bestflips = 0;
     $bestTotFlips = 0;
  }
  
  else {  
    $precision = $bestmatches/$besthyplength;
    $recall = $bestmatches/$bestreflength; 
    $f1 = 2 * $precision * $recall/($precision + $recall);
    $penalty = 2**(-$bestflips/$bestTotFlips);
    $score = $f1 * $penalty;
    $precision = &Trunc($precision);
    $recall = &Trunc($recall);
    $f1 = &Trunc($f1);
    $score = &Trunc($score);
    $penalty = &Trunc($penalty);
  }   

  #print best score to log file
  print logFilePerSent "doc_Id,$docId\n";
  print logFilePerSent "Segment ID,$lineNum\n";
  print logFilePerSent "Matches,$bestmatches\n";
  print logFilePerSent "SentenceLength,$besthyplength\nPrecision,$precision\nRecall,$recall\n1-Factor,$f1\nSortPenalty,$penalty\nScore,$score\n";
  print logFilePerSent "ref: $bestref\n\n";

  #update aggregate statistics
  $matches += $bestmatches;
  $hyplength += $besthyplength;
  $reflength += $bestreflength;
  $flips += $bestflips;
  $totFlips += $bestTotFlips;

}

# Read all SGM files
# Each sgm file is a sequence of documents. Each document is a
# sequence of units such as "paragraphs".
# A unit is enclosed in <seg> and </seg>
# <seg> can have optional attribute called seg_Id. E.g.
# <seg seg_ID="23"> blahblah </seg>
# If there is a mismatch in the number of lines in parallel files, the unit becomes 
# the whole file. 
# Recommendation while preparing references and test material: 
# Select paragraphs as units. 
# Reason: translators usually honor paragraph boundaries but occasionally break or
# merge sentences within a paragraph. Aligning references by sentences is painful
# and is not worth it since scoring BLEU at the paragraph level is just fine.
sub ReadAllSGMFiles {
    my $sys_Id = &ReadAllDocs($testsgmFileName, \%hypDoc, \@testids);
    if ($evalSysId eq "") {
       print STDERR "Systems seen: @testids. Evaluating $sys_Id\n";
       print STDERR "If you want to evaluate another system, specify the system by the -s option\n" if @testids > 1;
       $evalSysId = $sys_Id unless $evalSysId;
    }

    %evalDoc = %{$hypDoc{$evalSysId}};
    @hypdocids = sort keys %evalDoc;
    
    #print "DocIds found ",$#hypdocids+1,"\n";
    
    &ReadAllDocs($refsgmFileName, \%refDoc, \@refids);
    return;
}

sub ReadAllDocs {
    my ($fn, $hashptr, $arrptr) = @_;
    open(FIL, $fn) or die "Cannot read file $fn: $!";
    
    #read till the first <DOC"; #"
    while (<FIL>) {
	last if /<DOC/i;
    }
    
    my $doc_Id;
    my $sys_Id;
    if(/docid=\"(.*?)\".*sysid=\"(.*?)\"/i){
	    $doc_Id  = $1;
	    $sys_Id  = $2;
    }
    else{		#for ISI system, where the format is reversed
    	m/sysid=\"(.*?)\".*docid=\"(.*?)\"/i;
    	$sys_Id  = $1;
	$doc_Id  = $2;
    }
 
    my $docstr = "";
    # read each doc into a single line and store 
    my $numDocs = 0;
    while (<FIL>) {
	s/[\n\r]+/ /g;  
	if (/<DOC\s+docid=\"(.*?)\".*sysid=\"(.*?)\"/i) { #"
	    ${$$hashptr{$sys_Id}}{$doc_Id} = $docstr; #
	    $doc_Id = $1;
            $sys_Id = $2;
            $docstr = "";
            $numDocs++;
            next;
         }
         elsif(/<DOC\s+sysid=\"(.*?)\".*docid=\"(.*?)\"/i){
            ${$$hashptr{$sys_Id}}{$doc_Id} = $docstr; #
	    $doc_Id = $2;
            $sys_Id = $1;
            $docstr = "";
            $numDocs++;
            next;
         }         
         
         $docstr .= "$_ ";
     }
     close(FIL);
     
     #print "DocString is $docstr\n";
     ${$$hashptr{$sys_Id}}{$doc_Id} = $docstr; #
     $numDocs++;
     

     @$arrptr = sort keys %$hashptr;
     foreach my $id (@$arrptr) {
        my %thissysdocs      = %{$$hashptr{$id}};
	@thissysdocids = sort keys %thissysdocs;
	print STDERR "$id: Number of docs = ", 0 + @thissysdocids, "\n" if $dbgLevel > 1;
	print STDERR "@thissysdocids\n\n"  if $dbgLevel > $ngrSize + 1;
     }

   return $sys_Id;
}


# Subroutine to get the next unit of the test and the reference translations.
sub GetHypRefs {         
        
    if ($lineNum < @hyplines) {
	$hypObj = $hyplines[$lineNum];
	if ($hypObj =~ /^\s*$/) {
	    $lineNum++;
	    print STDERR "Skipped a blank line, 0-based line number = $lineNum\n" if $dbgLevel > 1;
	    return &GetHypRefs();
	}
	@refObjs = ();
	for ($i = 0; $i < @reflines; $i++) {
	    push @refObjs, $reflines[$i][$lineNum];
	}
	$lineNum++;
	$hypNum++;
	return 1;
    }

    return 0 unless &SegmentNextParallelDoc();
    if (@hyplines == &QEveryRowHasSameNumberOfColumns(\@reflines)) {
	return &GetHypRefs();
    } 

    print STDERR "Line number mismatch in files. Will treat each file as a single unit\n";
    $hypObj = join("\n", @hyplines);
    @refObjs = ();
    for ($i = 0; $i < @reflines; $i++) {
	push @refObjs, join("\n", @{$reflines[$i]});
    }

    @hyplines = ();
    @reflines = ();
    $hypNum++;
    return 1;
}

# Given a matrix as an array of arrays, determine if every row has the
# same number of columns or not. Used in this script to check if parallel files
# have the same number of lines.
sub QEveryRowHasSameNumberOfColumns {

    # return the number of columns, if all rows have the same number of columns
    # return 0 else

    my $matPtr   = shift;
    my $numRows  = @$matPtr;
    my $prevNumCols = 0 + @{$$matPtr[0]};
    for ($i = 1; $i < $numRows; $i++) {
	return 0 if ($prevNumCols != 0 + @{$$matPtr[$i]});
    }
    return $prevNumCols;
}

# For pretty printing of numbers.
sub Trunc {
    my $num = shift;
    $num += 0.00005 if $num != int($num);
    $num =~ s/\.(\d{4}).*/.$1/ unless $num =~ /e/;
    return $num;
}

sub PrintUsage {
    print "\nUSAGE: perl $scriptName  -s <system2test> -t <test_file> -r <referece_file> [-d dbglevel] [-n ngram_size] \n";
    print "\t   By default ngram_size is 4 so that 1-, 2-, 3-, and 4-grams are matched.\n";
    exit;
}

# This function supports sgm-marked up docs in one single file
# The parallel docs are already read in, each doc as a single string.
# Break up the string into units and complain if mismatch in units.
sub SegmentNextParallelDoc {
    $lineNum = 0;
	
    #print "\n++ In SegmentNextParallelDoc\nDocId has ", @hypdocids,"\n";
    return 0 unless @hypdocids > 0;

    # what does "next" mean?
    $docId = shift(@hypdocids);
    #print "Current docId is $docId\n";
    
    
    my $hypstr = $evalDoc{$docId};    
    #print "HypStr is ----$hypstr\n";
    
    @hyplines = &GetUnits($hypstr);	# get each segments in plain text format

    $refNum = 0;
    @reflines = ();

    foreach $refId (@refids) {
    	#print "RefId = $refId\n";
        my %thisrefdocs      = %{$refDoc{$refId}};
	if (defined($thisrefdocs{$docId})) {
	    my $thisrefstr = $thisrefdocs{$docId};
	    my @lines = &GetUnits($thisrefstr);
	    push @reflines, [ @lines ];
	    $refNum++;
	    print STDERR "Mismatched lines for doc $docId: $refId:test = ", 0 + @lines, ":", 0 + @hyplines, "\n" unless @hyplines == @lines;
  	}
    }    
    print STDERR "$refNum reference translations found for doc $docId\n" if $dbgLevel > 1;
    if ($refNum == 0) {
	die "ERROR: No reference translations for Hypothesis $docId\n";
    }
    
    return 1;
}

sub GetUnits {
    my $str = $_[0];
    my @units;
    
    #modified by Joy, joy@cs.cmu.edu
    #March 6, 2003
    #to adapt to NIST SGML where <seg id= ..> is allowed        
    #print "---\t$str\n";
    $str =~s/<seg\s+id\s*=\s*[0-9]+\s*>/<seg>/ig;  
    #print "===\t$str\n";          
    
    
    while ($str =~ s,<seg>(.*?)</seg>,,) {
	push(@units, $1);
    }
    
    return @units;
}

# Some simple processing of the translations. Lowercasing is the main aspect.
# (Are we being too liberal by comparing translations after lowercasing?)
# Numbers in numeric form are rendered differently by some commercial systems, 
# so normalize the spacing conventions in numbers.
# There is no end to text normalization. For example, "1" and "one" are the same,
# aren't they? Before going ballistic, let us recall the "Keep It Simple" Principle.
sub NormalizeText {
    my $strPtr = shift;

# language-independent part:
    $$strPtr =~ s/^\s+//;
    $$strPtr =~ s/\n/ /g; # join lines
    $$strPtr =~ s/(\d)\s+(\d)/$1$2/g;  #join digits

# language-dependent part (assuming Western languages):
    $$strPtr =~ tr/[A-Z]/[a-z]/;
    $$strPtr =~ s/\-/ /g;
    $$strPtr =~ s/[\(\)\"\'\.\$,\%\;\/]//g;

#    $$strPtr =~ s/([^a-z0-9\-\'\.,])/ $1 /g; # tokenize punctuation (except for alphanumerics, "-", "'", ".", ",")
#    $$strPtr =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
#    $$strPtr =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
#    $$strPtr =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
    $$strPtr =~ s/\s+/ /g; # one space only between words
    $$strPtr =~ s/^\s+//;  # no leading space
    $$strPtr =~ s/\s+$//;  # no trailing space
}

close(debug);
close(logFilePerSent);
close(input);


