#!/usr/bin/perl -w

# ./extract-sl-field.pl <  ../corpus/StructuralCorpus3-corrected.nodups.et > ../corpus/SEC.txt

# for Quechua, the tgtsent is actually the SL...

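# Fields this script was designed to extract:
#            1. Quechua (w/o segmentation marks)
#            2. segmented Quechua (by native speaker)
#            3. Spanish Translation
# NOTE: at present only field 2 (segmented Quechua) is actually printed;
# the print statements for the other fields are commented out in the loop.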
use strict;
use warnings;

# Read the corpus line by line from STDIN (or from the files named on the
# command line) and print one line of segmented Quechua to STDOUT for every
# "tgtsent:" line encountered.  All other lines are ignored.

# Payload of the most recently seen "srcsent:" line (the Spanish translation).
# It is not part of the current output; it is remembered only so that the
# alternative print statements shown below can be re-enabled.
my $spanish;

while ( my $line = <> ) {
    if ( $line =~ /^srcsent: (.*)$/ ) {
        # No chomp required: without /s, "." never matches a newline, so the
        # capture already excludes the line terminator.
        $spanish = $1;
    } elsif ( $line =~ /^tgtsent: (.*)$/ ) {
        # For Quechua the "tgtsent" field holds the source-language sentence.
        my $sl = $1;

        # Replace every "+" segmentation mark with a space, so the morphemes
        # stay separated (they are NOT glued back together).
        ( my $segmented = $sl ) =~ tr/+/ /;

        # Right now the SL Quechua is left with the suffixes segmented, since
        # Christian's segmentation module for Quechua is not ready yet.
        # To also emit the unsegmented form and/or the translation, use e.g.:
        #   ( my $plain = $sl ) =~ tr/+//d;
        #   print STDOUT "$plain\t$segmented\t$spanish\n";
        #   print STDOUT "$segmented\t$spanish\n";
        print STDOUT "$segmented\n";
    }
}


