#!/usr/bin/perl

# Still need to extend to handle change of POS (nominalizations, etc.)

# ./MakePOSList.pl < ../quechua2spa/lexicons/FreqWordList_00001-00100-irene-corr2.csv >! ../quechua2spa/lexicons/POS-list.txt


# Initialise the package globals shared by the rest of the script.
# Takes no arguments and returns nothing.  Every container is explicitly
# emptied (a bare "%AllowedPOS;" statement is a no-op in Perl and only
# produces a "Useless use of a variable in void context" warning).
sub setGlobals {

    # Space-separated list of the POS tags accepted in the lexicon.
    $POSAllowed = "V N Pron Adj Adv VR Num Vsuff NSuff Suff Abbr Conj Interj";

    %AllowedPOS = ();   # POS tag => 1 for every tag listed in $POSAllowed
#    @StemLex = (); # array of arrays
    @fields = ();       # for each LexEntry
    @Tseq = ();         # translation chunks of the entry being processed
    %Ts = ();           # not filled anywhere yet

    return;
}
setGlobals();

# Build the lookup table of legal POS tags.
# Reads the space-separated tag list in $POSAllowed, stores the individual
# tags in @POSs and marks each of them with a true value in %AllowedPOS.
sub POSinitialization {
    @POSs = split " ", $POSAllowed;

    # One hash entry per tag, so that membership is a simple $AllowedPOS{$tag}.
    $AllowedPOS{$_} = 1 for @POSs;

    # Debugging aid: uncomment to list the legal POS tags.
#    print STDOUT "reading from the array of legal POS:\n";
#    print STDOUT "$_ $AllowedPOS{$_}\n" for keys(%AllowedPOS);
}
POSinitialization();

# Fields in the CSV file are:
# 0.word, 1.stem+suffixes, 2.stem translation, 3.stem POS, 4.word transl, 5.word POS, (6.final stem translation, if there has been change of POS)...
# Strip leading and trailing whitespace from one translation chunk.
# Returns the trimmed copy; an undefined argument is returned unchanged.
sub trimBlanks {
    my ($text) = @_;
    return $text unless defined $text;
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    return $text;
}

# Main loop: read the comma-separated lexicon from STDIN (or the files named
# on the command line), one entry per line.  For every entry it extracts the
# bare stem from field 1 and the stem translation(s) from field 2, and prints
# debugging information about how the translations were split.
while (<>) {
    $line = $_;
    $line =~ s/\r?\n\z//;   # keep the line terminator out of the last field
#    $line =~ tr/A-Z/a-z/; # POS are in capitals
    @fields = split ",", $line;


# THE STEM ##############################################################################
    $stem = $fields[1];
    while ($stem =~ /.\+./) {  # while since there might be more than one suffix attached to the stem
        $stem =~ s/(.+)\+.+/$1/;   # drop the right-most "+suffix"
    }

# THE POSSIBLE TRANSLATIONS (possibly for different POSs) ###############################
    $Tseq = $fields[2]; # stem translation(s)
    $diffPOS = 0;
#    $Tseq =~ s/\./ /g;  # strip off punctuation
    if ( $Tseq =~ /.+\(.*\)/ ) {  # if there are parentheses
        # Take them out together with their contents.  The match is greedy,
        # so only one parenthesized chunk (the right-most) is removed.
        $Tseq =~ s/(.+)\(.*\)/$1/;
    }

    # first storing chunks between ||, might contain | inside...
    if ($Tseq =~ /.+\|\|.+/) {
        $diffPOS = 1;
        @Tseq = split(/\|{2}/, $Tseq);     #! [split "||", $tseq] doesn't work
        foreach my $Tr (@Tseq) {           # $Tr aliases the array element
            $Tr = trimBlanks($Tr);         # eliminating blank spaces
            print STDOUT "splitting ||: [$Tr]\n";
        }
    }
    # then storing | partitions (if there are no || partitions)
    elsif ($Tseq =~ /.+\|.+/) {
        @Tseq = split(/\|/, $Tseq);
        foreach my $Tr (@Tseq) {
            $Tr = trimBlanks($Tr);         # eliminating blank spaces around translation
            print STDOUT "simple splitting |: [$Tr]\n";
        }
    }
    # if it has only 1 translation
    else {
        $Tseq = trimBlanks($Tseq);         # eliminating blank spaces
        print STDOUT "unique translation: [$Tseq]\n";
        push(@Tseq, $Tseq);
    }

# debugging
    if ($diffPOS == 1) {
        print STDOUT "it has different POS (diffPOS = $diffPOS):\n";
        # %Ts is not filled anywhere yet, so show the || chunks kept in @Tseq.
        foreach my $Tr (@Tseq) {
            print STDOUT "[$Tr]\n";
        }
    }
    else {
        print STDOUT "it has the same POSs for all translations:\n";
        foreach my $Tr (@Tseq) {
            print STDOUT "[$Tr] ";
        }
        print STDOUT "\n";
    }


# hash{$POS} = $list of translations
# when parsing POS, if I run into | 

=comment
    $POSseq = $fields[3];
#    s/[\.\?\!\t\]\"\`\[\(\)\,\>\<\%\&\;\*\$\#\@\:\+\-\=\\\/\^\_\ģ0-9]/ /g; 
    $POSseq =~ s/\./ /g;  # strip off punctuation

    if ($POSseq =~ /.+\|.+/) { 
	@POSseq = split " ", $POSseq;
    } else {                          # if it has only 1 POS
	push(@POSseq,$POSseq);
    }

    foreach $Tr (@Tseq) {

#!!!!!!!!!!!!!
# still need to check if there are |, since chunks between || were left w/o splitting 
#	$lenT = @Tseq;
#	for ($i=0; $i < $lenT; $i++) {     # and inside them, | artitions, if any
#	    $tseq = @Tseq[$i];
#	    if ($tseq =~ /.+\|.+/) { 
#		@tseq = split(/\|/, $tseq);  #! [split "|", $tseq] doesn't work
#		foreach $tr (@tseq) {
#		    $tr =~ s/\s*([a-z\s]+)\s*/$1/; # eliminating blank spaces
#		    print STDOUT "\tsplitting | inside ||: [$tr]\n";
#		}

#	foreach $POS (@POSseq) {
#	    if ( $AllowedPOS{$POS} ) {	
#	if ( ($Tr ne "|") && ($Tr ne "||") ) {
	print STDOUT "@POSseq[0] ";   # first approximation, just 1 POS, later one entry for each POS and each translation 
	print STDOUT $stem;
	print STDOUT " $Tr\n";
	}
    }
#}
=cut
    # Reinitialize the per-entry lists before reading the next line.
    @POSseq = ();
    @Tseq = ();
}