#!/usr/local/bin/perl -w

# ./postprocess-xfer.out.debug.pl < /usr0/aria/eng2spa/corpus/input-xfer.out.debug > input-tct

# !!!! could not get rid of spaces after "tl: " lines, need to either figure it out or do it manually, so that the TCTool won't create empty boxes at the end!!!

# I deleted the old, original version of this script by accident :(
# the one I used to create input-tct for the English-Spanish user studies

# input looks like this:
=comment
0:
sl: john and mary fell
tl: JUAN Y MARíA CAYERON
tree: <((S,1 (NP,6 (NP,2 (N,4:1 "JUAN") ) (CONJ,0:2 "Y") (NP,2 (N,5:3 "MARíA") )
) (VP,1 (V,17:4 "CAYERON") ) ) )>
=cut
# output should look like this:
#sl: john kicked the ball
#tl: juan chutó la pelota
#al: ((1,1),(2,2),(3,3),(4,4))


# Paragraph mode (one record per blank-line-separated block) was tried here
# but did not seem to work, so the input is processed one line at a time.
#$/="";

# make sure this captures sentences with special characters
use strict;

# Main filter: reads the xfer debug output line by line (STDIN or the files
# named on the command line) and writes TCTool input to STDOUT.
#
#   "sl: ..."    lines are copied through unchanged.
#   "tl: ..."    lines are lowercased (ASCII letters only), stripped of
#                trailing whitespace, and followed by the "al: " prefix.
#   "tree: ..."  lines are reduced to the alignment list, which completes
#                the "al: " line started by the preceding "tl: " line.
#
# Every other line (record numbers, blank lines, ...) is dropped.
# NOTE(review): a tree that is physically wrapped onto a second line is only
# read up to the first line break -- confirm that the real input keeps each
# tree on a single line.
while (my $line = <>) {
    if ($line =~ /^sl: /) {
        chomp $line;
        print $line, "\n";
    }
    elsif ($line =~ /^tl: /) {
        # No newline after "al: ": the alignment from the tree line follows.
        print format_tl_line($line), "\nal: ";
    }
    elsif ($line =~ /^tree: /) {
        print alignment_from_tree($line), "\n";
    }
}

# Normalizes one "tl: ..." input line.
#   $line   - raw input line, including its line terminator
#   returns - the line with A-Z lowercased and all trailing whitespace
#             (spaces, tabs, CR, LF) removed; the "tl: " prefix itself is
#             kept even when the translation is empty
sub format_tl_line {
    my ($line) = @_;

    # tr only maps ASCII letters; accented characters are left untouched.
    $line =~ tr/A-Z/a-z/;

    # chomp alone would leave trailing blanks behind, so strip every
    # trailing whitespace character, but never eat into the "tl: " prefix.
    $line =~ s/^(tl: .*?)\s*$/$1/;

    return $line;
}

# Builds the alignment list from one "tree: ..." line (based on Erik's
# getAlign.pl, which takes multi-unit words into account).
#   $tree   - the tree line; every leaf looks like  :<src index> "<lexeme>"
#   returns - a string such as "((1,1),(2,2),(3,3),(4,4))"
#
# Target indices are assigned left to right starting at 1.  A lexeme made of
# several whitespace-separated words takes one target index per word and is
# written as "(src,t1 t2 ...)".
sub alignment_from_tree {
    my ($tree) = @_;

    my $tgt_index = 1;
    my $alignment = '(';

    while ($tree =~ /:(\d+)\s+"([^"]*)"/g) {
        my ($src_index, $lex) = ($1, $2);

        # split ' ' ignores leading whitespace, so a lexeme such as " a b"
        # is counted as two words, not three.
        my @words = split ' ', $lex;

        if (@words > 1) {
            my $last = $tgt_index + $#words;

            # The ", " separator after a multi-word entry is kept as it was.
            $alignment .= "($src_index," . join(' ', $tgt_index .. $last) . '), ';
            $tgt_index = $last + 1;
        }
        else {
            $alignment .= "($src_index,$tgt_index),";
            $tgt_index++;
        }
    }

    # Drop the separator after the last entry, whichever form it took, and
    # always close the list (an empty tree yields "()").
    $alignment =~ s/,\s*$//;

    return $alignment . ')';
}