#!/usr/bin/perl

# Still need to extend to handle change of POS (nominalizations, etc.)

# ./MakePOSList.pl < ../quechua2spa/lexicons/FreqWordList_00001-00100-irene-corr2.csv >! ../quechua2spa/lexicons/POS-list.txt


# Initialise the package globals shared by the rest of the script:
#   $POSAllowed - whitespace-separated list of the POS tags regarded as legal
#   %AllowedPOS - lookup table keyed by POS tag; emptied here and filled in
#                 by POSinitialization()
#   @fields     - the comma-separated fields of the lexicon entry being read
# Takes no arguments and returns nothing.
sub setGlobals {

    $POSAllowed = "V N Pron Adj Adv VR Num Vsuff NSuff Suff Abbr Conj Interj";

    # A bare `%AllowedPOS;` / `@fields;` statement is evaluated in void
    # context and initialises nothing, so assign the empty list explicitly.
    %AllowedPOS = ();
#    @StemLex; # array of arrays
    @fields = (); # for each LexEntry

    return;
}
setGlobals();

# Build the %AllowedPOS lookup table from $POSAllowed.
# $POSAllowed is cut on whitespace into the global @POSs, and every tag
# found there becomes a key of %AllowedPOS with the value 1.
# Takes no arguments; reads $POSAllowed, writes @POSs and %AllowedPOS.
sub POSinitialization {
    @POSs = split " ", $POSAllowed;

    # One hash-slice assignment marks all tags as allowed at once.
    @AllowedPOS{@POSs} = (1) x scalar(@POSs);
}
POSinitialization();

# Fields in the CSV file are:
# 0.word, 1.stem+suffixes, 2.stem translation, 3.stem POS, 4.word transl, 5.word POS, (6.final stem translation, if there has been change of POS)...
# Reduce a "stem+suffix+suffix..." analysis to its bare stem.
# An undefined argument is treated as the empty string.
sub stripSuffixes {
    my ($stem) = @_;
    $stem = '' unless defined $stem;

    # Each pass removes the last "+suffix"; loop since there might be more
    # than one suffix attached to the stem.
    while ($stem =~ /.\+./) {
        $stem =~ s/(.+)\+.+/$1/;
    }
    return $stem;
}

# Turn the stem-translation field into the list of translations to print.
# A parenthesised remark is removed together with its contents.  When the
# field contains "|" or "||" separators it is tokenised on whitespace and
# the separator tokens are dropped; otherwise the whole field is the single
# translation.
# NOTE(review): whitespace tokenising also splits multi-word translations
# in the multi-translation case (as the "|" branch always did) -- confirm
# this is what the lexicon format expects.
sub splitTranslations {
    my ($tseq) = @_;
    $tseq = '' unless defined $tseq;

    $tseq =~ s/(.+)\(.*\)/$1/;    # take parentheses out with their contents

    my @tokens;
    if ($tseq =~ /.+\|\|.+/) {
        # split() takes a PATTERN: the string "||" is the regex /||/, an
        # empty alternation that matches between every character.  Split
        # on the literal "||" instead, then tokenise each group.
        @tokens = map { split " ", $_ } split /\|\|/, $tseq;
    }
    elsif ($tseq =~ /.+\|.+/) {
        @tokens = split " ", $tseq;
    }
    else {    # only one translation
        @tokens = ($tseq);
    }

    return grep { $_ ne "|" && $_ ne "||" } @tokens;
}

# Return the first POS tag of the stem-POS field.  Dots are replaced by
# spaces first; a field with "|" alternatives is tokenised on whitespace,
# any other field is used whole.
sub firstPOS {
    my ($posseq) = @_;
    $posseq = '' unless defined $posseq;

    $posseq =~ s/\./ /g;    # strip off punctuation

    my @pos = ($posseq =~ /.+\|.+/) ? split(" ", $posseq) : ($posseq);
    return $pos[0];
}

# Build the output lines ("POS stem translation\n") for one lexicon line.
# Uses field 1 (stem+suffixes), field 2 (stem translation) and field 3
# (stem POS).  Fills the package global @fields as a side effect.
# First approximation: every translation is paired with the first POS only;
# later there should be one entry for each POS and each translation, and
# the POS is not yet checked against %AllowedPOS.
sub lexEntries {
    my ($line) = @_;

    # Remove the line terminator so it cannot leak into the last field.
    $line =~ s/\r?\n\z//;
#    $line =~ tr/A-Z/a-z/; # POS are in capitals
    @fields = split ",", $line;

    my $stem = stripSuffixes($fields[1]);
    my $pos  = firstPOS($fields[3]);

    return map { "$pos $stem $_\n" } splitTranslations($fields[2]);
}

while (my $line = <>) {
    next unless $line =~ /\S/;    # a blank line carries no lexicon entry
    print STDOUT lexEntries($line);
}






