#!/usr/local/bin/perl58

# Oct. 18, 2001 - added smoothing

# ARGS

# Manual
# Name of training lattice
$lattice = "/usr3/eepeter/heb20050325/h1_struct-nolimits-manual-fulltrace-newscored.ltc";
# Name of source grammar file
$grammar = "/afs/cs.cmu.edu/project/avenue-1/Avenue/Hebrew-MT/ManGrammars/heb-eng.man.25may04.gra";

$scoredgrammar = "/afs/cs.cmu.edu/project/avenue-1/Avenue/Transfer/Hebrew/scoredgrammars/heb-eng.man.25may04-relscore-smooth.gra";

# Learned 
$lattice = "/temuco/usr0/eepeter/training/h1_struct-length8-learned-fulltrace-newscored.ltc.gz";
# Name of source grammar file
 # Same as one on Avenue except with rule IDs
$grammar = "/afs/cs.cmu.edu/project/avenue-1/Avenue/Transfer/Hebrew/AlonStruct.030405.gra";

$scoredgrammar = "/afs/cs.cmu.edu/project/avenue-1/Avenue/Transfer/Hebrew/scoredgrammars/AlonStruct-BasicCompos.030405-relscore.gra";


# Read in Grammar

open(GRA, "$grammar") or die $!;
while ($line = <GRA>) {
    next if $line =~ m/^\s*;/; # Skip single line comments
    if ($line =~ m/\#\|/) { # Skip multiple line comments
	while ($line !~ m/\|\#/ and !eof(GRA)) {
	    $line = <GRA>;
	}
	next;
    }

    # Get list of rule IDs and associated source non-terminals
    if ($line =~ m/\{([^\}]+)\}/) {
	$ruleid = uc($1);
	$rulescores{$ruleid} = 0;
	while ($line !~ m/::/) {
	    $line = <GRA>;
	}
	$line =~ m/^\s*([^:]+)::/;
	$srcnt = uc($1);
	#print "SRCNT $srcnt\n";
	$ruleids{$ruleid} = $srcnt;
    }
}
close(GRA);

=comment
# Find pseudo-parse groups (ignore constraints)
open(GRA, "$grammar") or die $!;
while ($line = <GRA>) {
    next if $line =~ m/^\s*;/; # Skip single line comments
    if ($line =~ m/\#\|/) { # Skip multiple line comments
	while ($line !~ m/\|\#/ and !eof(GRA)) {
	    $line = <GRA>;
	}
	next;
    }

    # Get list of rule IDs and associated source non-terminals
    if ($line =~ m/\{([^\}]+)\}/) {
	$ruleid = uc($1);
    } elsif ($line =~ m/\-\>/) {
	$line =~ m/^\s*([^:]+)/;
	$srcnt = uc($1);
	$line =~ m/^[^\[]+\[([^\]]+)\]/;
	$srcrhs = uc($1);
	$srcrule = $srcnt . "\t" . $srcrhs;
	$srcrules{$srcrule} .= "$ruleid ";
    }
}
close(GRA);
=cut

# Read in Lattice

if ($lattice =~ m/\.gz/) {
    open(LTC, "gunzip -c $lattice |") or die $!;
} else {
    open(LTC, "$lattice") or die $!;
}

$arccount = 0;
$sentarcs = 0;
$sentindex = 0;
$scoringarcs = 0;

while ($line = <LTC>) {
    $line =~ s/[\r\n]*$//;
    if ($line =~ m/^\((\d+)\s+(\d+)\s+\"([^\"]*)\"\s+(\S+)\s+\"([^\"]*)\"\s+\"(.*?)\"\)\s+:\s+(\S+)$/) {
	($start, $end, $tgt, $oldscore, $src, $trace, $score) = 
	    ($line =~ m/^\((\d+)\s+(\d+)\s+\"([^\"]*)\"\s+(\S+)\s+\"([^\"]*)\"\s+\"\@?(.*?)\"\)\s+:\s+(\S+)$/);
	$arccount++;
	$sentarcs++;
	next if $score < 0.90;  # Have a high threshold for using arc in scoring

	$trace =~ m/^\(?\((\S+)/;
	$toprule = uc($1);

	next if !defined($ruleids{$toprule});

	$totallength += 1+($end-$start);
	$scoringarcs++;

	$toprules{$toprule}++;
	$srcnts{$ruleids{$toprule}}++;

    } elsif ($line =~ m/^\(/) {
	$sentarcs = 0;
	print "Sent: $sentindex\n";
	$sentindex++;
    } elsif ($line =~ m/^\)/) {
	print "  $sentarcs, $scoringarcs, $arccount\n";

    }
}

close(LTC);

# Add-one smoothing
print "With smoothing\n";
foreach $toprule (sort keys %ruleids) {
    $toprules{$toprule}++;
    $srcnts{$ruleids{$toprule}}++;
}


foreach $toprule (sort keys %ruleids) {
    #print "$ruleids{$toprule}\t$toprule\t$toprules{$toprule}\t$srcnts{$ruleids{$toprule}}\n";
    if ($srcnts{$ruleids{$toprule}} != 0) {
	$rulescores{$toprule} = ($toprules{$toprule}/$srcnts{$ruleids{$toprule}});
    } else {
	$rulescores{$toprule} = 0;
	$srcnts{$ruleids{$toprule}} = 0;
    }
    if ($toprules{$toprule} == 0) { $toprules{$toprule} = 0; }
    print "$toprule\t$toprules{$toprule}\t$srcnts{$ruleids{$toprule}}\t$rulescores{$toprule}\n";
}
print "Arc count $arccount, scoring arcs $scoringarcs\n"; #, avg length " . ($totallength/$scoringarcs) . "\n";


print "Writing scored rules to $scoredgrammar\n";
open(GRA, "$grammar") or die $!;
open(SCR, "> $scoredgrammar") or die $!;
while ($line = <GRA>) {
    next if $line =~ m/^\s*;/;
    if ($line =~ m/^\s*\{([^,]+,\d+)\}/) {
	$ruleid= $1;
	if (!defined($rulescores{$ruleid})) { $rulescores{$ruleid} = 0; }
	print SCR $line;
	next;
    }
    if ($line =~ m/\*score\*/) { next; } # Skip previous scores
    if ($line =~ m/\(\s*$/) {
	print SCR $line;
	print SCR "  (*score* " . $rulescores{$ruleid} . ")\n";
	next;
    }

    print SCR $line;
}
close(GRA);
close(SCR);





