#We read in the lattice and the reference translation(s)
# Then we match each arc in the lattice against the reference
# translations and assign a score, based on weighted n-gram match
# Finally, we output a histogram of how many lattices you have above
# given thresholds (1, 0.9, 0.8, 0.7, etc.)

if (@ARGV != 4){
  die "Usage: perl LatticeEval.pl Lattice RefTranslations(inSGML) HighestNGram ModifiedLattice\n";
}



##-------------- data items ----------------##
$FinalSumScore = 0;
$NumRefTranslations = 0;
$Sent = 0;
##-------------- data items ----------------##

##-------------- subroutines ---------------##
#sub GetAddScoreForSentence(); 
##-------------- subroutines ---------------##


##-------------- files ---------------------##
open(LatticeFILE,$ARGV[0]) or die "Couldn't open lattice file.\n";
open(RefTransFILE,$ARGV[1]) or die "Couldn't open reference translations file.\n";
$HNGram = $ARGV[2]; # the n of the highest n-gram to be matched
open(ModLatticeFILE,">".$ARGV[3]) or die "Couldn't open modified lattice file for writing.\n";
##-------------- files ---------------------##


##----- read in reference translations -----##
while(<RefTransFILE>){

  #print "from ref translation file:$_\n";
   
  #must split it up by 1) sysid and 2) sentid
  #put it in hashes, one for each reference translation
  if ($_ =~ /sysid/){
    $NumRefTranslations++;
    #create Hash
    $CurHashName = "RefXs" . $NumRefTranslations;
    #print "CurHashName:" . $CurHashName . "\n";
    $Sent = 1;
  }
  elsif ($_ =~ /seg id/){
    #then keep this reference translation in the appropriate hash
    while($_ =~ /\s$/){
      chop;
    }
    $_ =~ s/^\<seg id=[0-9]+\>//;
    $_ =~ s/\<\/seg\>$//;
    $_ = " " . $_ . " ";
    $_ =~ tr/A-Z/a-z/;
    #print "inserting:|" . $_ . "|\n";
    $$CurHashName{$Sent} = $_;
    #print $$CurHashName{$Sent} . "\n";
    $Sent++;
  }
}
##----- read in reference translations -----##


##----- set lambda based on number of reference translations ----##
#print "NumRefTranslations:$NumRefTranslations\n";
$lambda = 1 / $HNGram;
#print "lambda:$lambda\n";
##----- set lambda based on number of reference translations ----##

######### for debugging #############################
#then print them all
#for ($i = 1; $i <= $NumRefTranslations; $i++){
#  $CurHashName = "RefXs" . $i;
#  print "----- now printing ref translations for ref|" . $CurHashName . "|-------\n";
#  $n =  keys(%$CurHashName);
#  print "num keys:" . $n . "\n"; 
#  foreach $key (sort keys %$CurHashName){
#    print "key:|" . $key . "|, value:|" . $$CurHashName{$key} . "|\n";
#  }
#}
######### for debugging #############################


##------ then read in the lattice ------##
# split on the )\n
$Sent = 0;
while(<LatticeFILE>){
  
  while($_ =~ /\s$/){
    chop;
  }

  #print "|$_|\n";
  ##------ begin reading arcs for a new sentence -------##
  if ($_ =~ /^\(\s*$/){

    print ModLatticeFILE $_ . "\n";

    if ($Sent != 0){
      #for debugging, print the arc representation
      #foreach $key (sort keys %ThisSentArcs){
      # print $key . "\n";
      #}
      #now what?
      #now call a subroutine that will allow you to find all
      # the scores for the arcs in this sentence (not arc scores,
      # but the scores that will go into the final sum).
      GetAddScoreForSentence();
    }
        
    $Sent++;
    #print "------New Sentence, now # $Sent----\n";
  }
  ##------ begin reading arcs for a new sentence -------##

  ##------ within sentence processing of arcs    -------##
  elsif ($_ =~ /^\)\s*$/){
    print ModLatticeFILE $_ . "\n";
  }
  else{
    
    #print ".";

    #then get the part of the arc that is the actual translation

    $CurMaxArcScore = 0;

    # a sample arc looks as follows:
    #  (8 10 "IN FRONT OF" 1 "L PN $LI" "@PREP")
    # we will split by the ", then our guy is the second entry
    @Parts = split(/\"/,$_);
    if (@Parts > 1){
      $PartTransl = $Parts[1];
      $PartTransl =~ tr/A-Z/a-z/;
      #print "Partial translation:$PartTransl\n";
      #now we split this and get n-gram matches against the
      # reference translation(s)
      @ArcWords = split(/ /,$PartTransl);
      #$HNGram
      $ArcLength = @ArcWords;
      #print "ArcLength:$ArcLength\n";
      #now, match this against the reference translation(s)
      for ($m = 1; $m <= $NumRefTranslations; $m++){
	#get the appropriate hash name
	$CurHashName = "RefXs" . $m;
	#print "CurHashName:$CurHashName\n";
	$CurRefTr = $$CurHashName{$Sent};
	#print "CurRefTr:|$CurRefTr|\n";
	@RefWords = split(/\s+/,$CurRefTr);
	$RefLength = @RefWords - 1;
	#print "RefLength:$RefLength\n";
	
	for ($i = 1; $i <= $HNGram; $i++){
	  #first clear the score for all the $i-grams from
	  #the previous arc
	  $CurIGramMatch = $i . "grammatch";
	  $$CurIGramMatch = 0;
	  if ($i <= $ArcLength){
	    #print "Now looking at $i-grams\n";
	    #the sliding window....
	    $CurIGramMax = $RefLength - $i + 1;
	    for ($j = 0; $j <= $ArcLength-$i; $j++){
	      #get an i-gram starting at position j
	      
	      $CurNGram = " ";
	      for ($k = 0; $k < $i; $k++){
		#j is starting position, k is offset
		$CurNGram = $CurNGram . $ArcWords[$j+$k] . " ";
	      }
	      #print "CurNGram:|$CurNGram|\n";
	      if ($CurRefTr =~ $CurNGram){
		#match!!!
		# then what...?
		$Bonus = 1/$CurIGramMax;
		###$Bonus = sprintf("%.10f",$Bonus);
		#print "Bonus for this match:$Bonus\n";
		$$CurIGramMatch += $Bonus;
		#print "CurIGramMatch is now:$$CurIGramMatch\n";

		#also find out what indices in the sentence
		# are matched (beware: there might be duplicates, e.g.
		# "the" might match in different positions
		# we know what i is, so ...
		# build a sliding window over the reference translation
		for ($refindbeg = 1; $refindbeg < @RefWords-$i+1; $refindbeg++){
		  #build an igram starting from this position
		  $MatchIGramRef = " ";
		  for ($build = 0; $build < $i; $build++){
		    $MatchIGramRef = $MatchIGramRef . $RefWords[$refindbeg+$build] . " "; 
		  }
		  if ($CurNGram eq $MatchIGramRef){
		    #print "MatchIGramRef:|$MatchIGramRef|\n";
		    #print "matched\n";
		    #then we know the indices
		    $SpanBeg = $refindbeg;
		    #print "SpanBeg is:|$SpanBeg|\n";
		    $SpanEnd = $refindbeg + $i;
		    #print "SpanEnd (exclusive) is:|$SpanEnd|\n";
		    for ($acc = $SpanBeg; $acc < $SpanEnd; $acc++){
		      $IndsMatchedInSent{$acc} = 1;
		    }
		  }
		}
	      }
	    }
	  }
	}

	# now that we have all the i-gram scores, do the weighted
	#  average
	$ArcScore = 0;
	for ($i = 1; $i <= $HNGram; $i++){
	  $IGramMatch = $i . "grammatch";
	  #print "Bonus for all $i-gram matches:$$IGramMatch\n";
	  $$IGramMatch = $$IGramMatch * $lambda;
	  #print "Adjusted bonus for all $i-gram matches:$$IGramMatch\n";
	  $ArcScore += $$IGramMatch;
	}
	#print "ArcScore is:$ArcScore\n";
	
	# then replace the current winner if this arc matches
	# this reference translation better
	if ($ArcScore > $CurMaxArcScore){
	  $CurMaxArcScore = $ArcScore;
	}
      }

      #print "MaxArcScore (over all ref translations):$CurMaxArcScore\n";
      #ok, now we've got a score for this arc!!!!

      print ModLatticeFILE $_ . " : " . $CurMaxArcScore . "\n";

      #store this in a hash just for this sentence: 
      # score -> spanned inds (as string: 1,2,3)
      # we have the matched indices in IndsMatchedInSent -- print them
      $MatchedIndsString = "";
      foreach $key (sort keys %IndsMatchedInSent){
	#print "matched index:$key\n";
	$MatchedIndsString = $MatchedIndsString . $key . ",";
      }
      #print "-------------------------------\n";
      #now insert those in a hash JUST FOR THIS SENTENCE
      $EntryKey = $CurMaxArcScore . "___" . $PartTransl . "___" . $MatchedIndsString;
      $ThisSentArcs{$EntryKey} = 1;

      %IndsMatchedInSent = ();

      if (exists $ScoresHash{$CurMaxArcScore}){
	#print "Updating score in ScoresHash\n";
	$ScoresHash{$CurMaxArcScore}++;
      }
      else{
	#print "Inserting new entry in ScoresHash\n";
	$ScoresHash{$CurMaxArcScore} = 1;
      }
    }
  }
  ##------ within sentence processing of arcs    -------##
}

## ----- process final sentence ----- ##
#for debugging, print the arc representation
#  print "Number of arcs for this sentence:|" . keys(%ThisSentArcs) . "|\n";
#  #foreach $key (sort  { $b <=> $a } keys %ThisSentArcs){
#  foreach $key (sort  SortAlphabetically keys %ThisSentArcs){
#    print $key . "\n";
#  }

#then call a subroutine that will add whatever score sum you need to
# add for this sentence. 
GetAddScoreForSentence();




#finally, we can output the histogram of all the scores
#print "Now printing entire histogram:\n";
foreach $key (sort keys %ScoresHash){
  #print  "key:".$key . "\t" . "Num times occurred:" . $ScoresHash{$key} . "\n";
  $LatticeHistogramScore = $LatticeHistogramScore + ($key*$ScoresHash{$key});
}


#also, we output a count for each of 1, 0.9, 0.8, etc.
$LatticeCount = 0;
print "Now printing counts for cutoff values:\n";
for ($i = sprintf("%2f",0); $i <= sprintf("%2f",1); $i = sprintf("%2f",$i+0.01)){

  $Count = 0;
  foreach $key (keys %ScoresHash){
    if ($key >= $i){
      $Count += $ScoresHash{$key};
    }
    #print "key is:$key, i is:$i\n";
    if (($key >= $i)&&($key < sprintf("%2f",$i-0.01))){
      #print "----falls in the range.\n";
      if($BinCounts{$i}){
	#print "---existed\n";
	$BinCounts{$i} += $ScoresHash{$key};
      }
      else{
	#print "----didn't exist\n";
	$BinCounts{$i} = $ScoresHash{$key};
      }
      #print "----BinCounts{key} is now: $BinCounts{$i}\n";
    }

  }
  $LatticeScore = $LatticeScore + ($i * $Count);
  $LatticeCount = $LatticeCount + $Count;
  
}


#now, finally, print out the score of this lattice
print "Final sum score for lattice:" . $FinalSumScore . "\n";


## ------------------------------------------------- ##
##  -----------  begin subroutines  --------------   ##
## ------------------------------------------------- ##

sub GetAddScoreForSentence(){
  
  ## ---- for debugging ---- ##
  $ScoreForSent = 0;
  ## ---- for debugging ---- ##
  
  #here, we should have access to %ThisSentArcs
  #foreach $key (sort  { $b <=> $a } keys %ThisSentArcs){
  foreach $key (sort  SortAlphabetically keys %ThisSentArcs){
  
    #print "Now considering arc:$key\n";
    
    #first split it up into: a) score, b) translation, c) indices covered
    # We only need the translation to split the score into n parts,
    # where n is the number of words in the translation
    @Parts = split(/___/,$key);
    $Score = $Parts[0];
    #print "Score is:|$Score|\n";
    
    #partial translation
    @Words = split(/ /,$Parts[1]);
    $n = @Words;
    #print "n:|$n|\n";
    
    @Indices = split(/\,/,$Parts[2]);
    $NumMatched = @Indices;

    if ($NumMatched > 0){

      $PartScore = $Score / $NumMatched;
      #print "PartScore is:$PartScore\n";

      #print "AllInds is: $Parts[2]\n";
      foreach $Index (@Indices){
	#print "\tOne covered index:$Index\n";
	if ($Index ne ""){
	  #then find out how many times this index has already been
	  #  accounted for by other arcs
	  if (exists ($ThisSentCovIndCounts{$Index})){
	    $ThisSentCovIndCounts{$Index}++;
	  }
	  else{
	    $ThisSentCovIndCounts{$Index} = 1;
	  }
	  $AccNum = $ThisSentCovIndCounts{$Index};
	  #print "Index $Index has already been accounted for $AccNum times.\n";
	  
	  #$SumScoreArc = $SumScoreArc + e^($AccNum-1)
	  #####$AddScore =  $PartScore * (1/(exp(1)**($AccNum-1)));
	  $AddScore =  $PartScore * (1/(2**($AccNum-1)));
	  #print "AddScore is:$AddScore\n";
	  $FinalSumScore = $FinalSumScore + $AddScore;
	  $ScoreForSent += $AddScore;
	  #print "ScoreForSent is:$ScoreForSent\n";
	}
      }
    }
  }
  #print HistOutFILE "----- Sent $Sent -----\n";
  #foreach $key (sort keys (%ThisSentCovIndCounts)){
  #  print HistOutFILE $key . "\t" . $ThisSentCovIndCounts{$key} . "\n";
  #}

  print "Final score for sentence $Sent: $ScoreForSent\n";
  %ThisSentCovIndCounts = ();
  %ThisSentArcs = ();
}


sub SortAlphabetically{
  @aParts = split(/___/,$a);
  $aScore = $aParts[0];
  $aTr = $aParts[1];
  $aInds = $aParts[2];
  @bParts = split(/___/,$b);
  $bScore = $bParts[0];
  $bTr = $bParts[1];
  $bInds = $bParts[2];
  if ($aScore > $bScore){
    return -1;
  }
  elsif ($aScore == $bScore){
    if ($aTr gt $bTr){
      return -1;
    }
    else{
      return 1;
    }
  }
  else{
    return 1;
  }
}
