#!/usr/bin/perl -w

# ./ExtractTestSet.pl <  /usr0/aria/quechua2spa/corpora/TestSets/FC.testset-Q-segQ-Sp.txt  > /usr0/aria/quechua2spa/corpora/FC-TestSet-seg.txt

# InFile: Unsegmented Quechua \t Segmented Quechua \t Spanish translation

# Can extract either segmented or unsegmented Quechua, separating punctuation from
# the words and converting everything to lowercase.

# Tokenize one line of the test set and return a single column of it.
#
# The input line is expected to hold tab-separated columns:
#   0 = unsegmented Quechua, 1 = segmented Quechua, 2 = Spanish translation.
#
# Parameters:
#   $line        - one raw input line (a trailing newline is removed).
#   $field_index - optional column to return; defaults to 1 (segmented Quechua).
#
# Returns the requested column with punctuation separated from the words by
# spaces and A-Z lower-cased.  Returns the empty string when the line does not
# contain that column, so the caller can still emit one output line per input
# line.
sub _extract_field {
    my ($line, $field_index) = @_;
    $field_index = 1 unless defined $field_index;

    # Drop the line terminator up front; otherwise it stays attached to
    # whichever column happens to be last on the line.
    chomp $line;

    # Every rule below only inserts spaces around a fixed set of characters,
    # so grouping the characters by spacing style is order-independent.

    # Space BEFORE: sentence punctuation, closing brackets, percent sign.
    $line =~ s/([.,?!\])%])/ $1/g;

    # Space AFTER: opening brackets and the dollar sign.
    $line =~ s/([\[(\$])/$1 /g;

    # Space on BOTH sides: quotes, operators and other symbols.
    # Remove the trailing "-" from the class if there are words with a hyphen!
    $line =~ s/(["`><*#\@:;+=\\\/^_-])/ $1 /g;

    $line =~ tr/A-Z/a-z/;
    # NOTE(review): no `use utf8` is visible in this script, so this works on
    # the raw bytes of the literal, and other accented capitals are not
    # lower-cased -- confirm against the corpus encoding.
    $line =~ tr/Ñ/ñ/;

    my @fields = split /\t/, $line;

    return defined $fields[$field_index] ? $fields[$field_index] : '';
}

# Main loop: one output line per input line (read from the files named on the
# command line, or from STDIN).
while (my $input_line = <>) {
    print STDOUT _extract_field($input_line), "\n";
}


