#!/usr/local/bin/perl

# The translation file to analyse must be named on the command line.
@ARGV or die "Need name of file to use\n";

# SGML reference set the translation is compared against.  The commented-out
# assignments are alternative reference sets kept for quick switching.
#$refset = "/afs/cs.cmu.edu/user/joy/Eval/Chinese/gale07dev/asrbc-ref.sgm";
#$refset = "/afs/cs.cmu.edu/user/joy/Eval/Chinese/gale07dev/asrbn-ref.sgm";
$refset = "/afs/cs.cmu.edu/project/avenue-1/Avenue/Transfer/Chinese/mt03_chinese_evlset_v0-ref.sgm";


# readSentRefs()
#
# Reads the SGML reference file named by the global $refset and tallies the
# words of one reference translation.
#
# Takes no arguments and returns nothing; results are left in package globals:
#   %refsets  - sysid => array ref of segment texts (seg tags stripped), one
#               entry per sysid found in the file
#   @sysids   - sorted list of the sysids seen
#   $sysid    - the sysid whose segments were tallied (first in sorted order)
#   %refwords - lowercased word => number of occurrences in that reference
#   %refdiff  - receives the same increments as %refwords
#
# Dies if the reference file cannot be opened.
sub readSentRefs {
    open(my $ref_fh, '<', $refset)
        or die "Cannot open reference file '$refset': $!";

    while (my $line = <$ref_fh>) {
        $line =~ s/[\r\n]*$//;

        # A sysid="..." attribute on a line selects the reference set that
        # the following segments are filed under.
        if ($line =~ m/sysid=\"(\w+)\"/) {
            $sysid = $1;
        }

        # Segment lines: strip the opening and closing seg tags, keep the text.
        if ($line =~ m/<seg/i) {
            $line =~ s/<seg(\s+id=\"?\d+\"?)?>\s*//i;
            $line =~ s/\s*<\/seg>//i;
            push @{$refsets{$sysid}}, $line;
        }
    }
    close($ref_fh);

    # Only the first reference (in sorted sysid order) is tallied.
    @sysids = sort keys %refsets;
    $sysid  = $sysids[0];
    return unless defined $sysid;    # no segments were found at all

    # Walk that reference's own segment list, so the number of segments
    # processed always matches the reference actually being counted, even
    # when the references have different lengths.
    foreach my $refseg (@{$refsets{$sysid}}) {
        # split ' ' ignores leading whitespace, so an indented <seg> line
        # does not contribute an empty-string "word".
        foreach my $word (split ' ', $refseg) {
            my $key = lc($word);
            $refwords{$key}++;
            $refdiff{$key}++;
        }
    }

    return;
}

# Load the reference word counts; this fills %refwords and %refdiff.
readSentRefs();

$arg = $ARGV[0];

# Three-argument open: the file name comes from the command line, so it must
# never be interpreted as an open mode or a pipe command.
open(my $trans_fh, '<', $arg)
    or die "Cannot open translation file '$arg': $!";
while (my $line = <$trans_fh>) {
    # Only lines shaped like "<digits> 0<TAB><text>" are used.
    # NOTE(review): the literal 0 is presumably a rank/index column that
    # selects one hypothesis per sentence -- confirm against the file format.
    if ($line =~ m/^\d+\s+0\t(.+)$/) {
        my $trans = $1;
        # split ' ' skips leading whitespace, so no empty-string word is
        # counted when the text starts with blanks.
        foreach my $word (split ' ', $trans) {
            my $key = lc($word);
            $transwords{$key}++;
            $refdiff{$key}--;    # reference count minus translation count
        }
    }
}
close($trans_fh);

# One row per reference word, largest surplus in the reference first.
# Ties are broken alphabetically so the output is stable between runs.
print "Word\tDiff\tRefCount\n";
foreach my $refword (sort { $refdiff{$b} <=> $refdiff{$a} or $a cmp $b } keys %refwords) {
    print "$refword\t$refdiff{$refword}\t$refwords{$refword}\n";
}
