#!/usr/local/bin/perl

# !!! Check the input file first: if it contains ":" characters, they need to
# be added to the grammar. Right now the script assumes a ":" ends a comment
# from EC and deletes the text preceding it.

# ./input-xfer-preprocessing.pl < ../corpus/elicitation-EN.txt.rest >! ../corpus/elicitation-EN.txt.rest.clean

# A single pass is enough: every rule is applied to every line, so a sentence
# with several punctuation marks (or PART + punct) is fully split in one run.

# Double-check the output file before inputting it into the xfer engine.

use strict;
use warnings;

# Separate PART (possessive) and punctuation marks from words, line by line.
while ( my $line = <> ) {
    chomp $line;
    my ( $clean, $colon_found ) = separate_marks($line);
    print "$clean\n";

    # Deleting text is lossy, so always tell the user when it happened.
    print STDERR "!\":\" found! deleted text preceeding \":\"\n"
        if $colon_found;
}

# separate_marks($line)
#
# Splits possessive markers and punctuation off the words of one line.
#
#   $line - one input line, without its trailing newline.
#
# Returns a two-element list: the rewritten line, and a flag that is true
# when text up to and including a ":" was deleted.
#
# All rules are applied to the same line, and each rule handles every
# occurrence.  A line containing exactly one mark comes out the same as it
# would from a single rule firing alone, except that a comma between two
# digits is now left attached.
sub separate_marks {
    my ($text) = @_;

    # In old EC files comments often precede the sentence and end in ":".
    # Drop everything up to and including the last colon on the line.
    my $colon_found = ( $text =~ s/\A.*:// ) ? 1 : 0;

    # Possessive mark (singular): "John's" -> "John 's ".
    $text =~ s/(?<=\w)'s/ 's /g;

    # Possessive mark (plural) and any other apostrophe glued to a word:
    # "dogs'" -> "dogs ' ".  The 's markers split off above are preceded by
    # a space now, so they are not touched again.
    $text =~ s/(?<=\w)'/ ' /g;

    # Comma glued to the preceding word.  A comma between two digits
    # ("1,000") is left alone so numbers stay in one piece; "In 1999, we"
    # is still split because no digit follows the comma.
    $text =~ s/(?<=\w),(?!(?<=\d,)\d)/ ,/g;

    # Hyphens become separate tokens.  Assumption (from the original
    # author): the xfer engine has no problem with multiple spaces.
    $text =~ s/-/ - /g;

    # Sentence-final punctuation.
    $text =~ s/([.?!])\z/ $1/;

    return ( $text, $colon_found );
}