#!/usr/bin/perl

# Still need to extend to handle change of POS (nominalizations, etc.)


# ./CreateStemLexicon.pl < ../quechua2spa/lexicons/FreqWordList_00001-00100-irene-corr2.csv >! ../quechua2spa/lexicons/StemLex4Seg

## Postprocessing needed to sort and eliminate duplicates:
#	from: aria@cusco /cygdrive/c/mt/quechua2spa/lexicons
#	$ sort StemLex4Seg | uniq > StemLex4SegSortNoDup 


# Fields in the CSV file are:
# 0.word, 1.stem+suffixes, 2.stem translation, 3.stem POS, 4.word transl, 5.word POS, (6.final stem translation, if there has been change of POS)...

# Set up the package-level globals used by the rest of the script:
#   $POSAllowed - space-separated list of the POS tags accepted in the output
#   %AllowedPOS - lookup table keyed by POS tag (populated by POSinitialization)
#   @fields     - the comma-separated fields of the CSV line being processed
# Takes no arguments and returns nothing useful.
sub setGlobals {

    $POSAllowed = "V N Pron Adj Adv VR Num Vsuff NSuff Suff Abbr Conj Interj";

    # A bare "%AllowedPOS;" or "@fields;" statement is a void-context no-op,
    # not a declaration, so give the globals explicit empty values instead.
    %AllowedPOS = ();
#    @StemLex; # array of arrays
    @fields = (); # for each LexEntry

    return;
}
setGlobals();

# Build the %AllowedPOS lookup table from the space-separated tag list in
# $POSAllowed: every tag becomes a key whose value is 1.  The individual
# tags are also left in the global array @POSs.
sub POSinitialization {
    @POSs = split " ", $POSAllowed;

    # Hash slice: mark every tag as allowed in a single assignment.
    @AllowedPOS{@POSs} = (1) x scalar(@POSs);
}
POSinitialization();


# Reduce a "stem+suffix+suffix..." segmentation to its bare stem by dropping
# the last "+suffix" until no "+" with a character on each side remains.
# Returns the argument unchanged when it is undefined or has no suffixes.
sub _stemOf {
    my ($segmented) = @_;
    return $segmented unless defined $segmented;
    while ($segmented =~ /.\+./) {  # while since there might be more than one suffix attached to the stem
        $segmented =~ s/(.+)\+.+/$1/;
    }
    return $segmented;
}

# Split a stem-POS field into its individual POS tags.  Dots are treated as
# punctuation, and tags may be separated by "|" and/or whitespace, so
# "V|N", "V. | N." and "Adj." all yield clean tags.  Returns an empty list
# for an undefined or empty field.
sub _posTokens {
    my ($pos_field) = @_;
    return () unless defined $pos_field;
    $pos_field =~ s/\./ /g;  # strip off punctuation
    # grep drops the empty leading field split produces when the string
    # starts with a separator.
    return grep { length } split /[\s|]+/, $pos_field;
}

# Main loop: read CSV lines (files named on the command line, or STDIN) and,
# for every allowed POS of the stem, print one lexicon entry of the form
#   stem, POS, ((lex stem) (pos POS))
# NOTE(review): fields are split on every comma, so quoted fields that
# contain commas are not supported -- confirm the input never has them.
while (my $line = <>) {
    # Remove the line ending (LF or CRLF); otherwise a POS that is the last
    # field on the line keeps the newline and never matches %AllowedPOS.
    $line =~ s/\r?\n\z//;
#    $line =~ tr/A-Z/a-z/; # POS are in capitals
    @fields = split ",", $line;

    my $stem = _stemOf($fields[1]);

    foreach my $pos (_posTokens($fields[3])) {
        next unless $AllowedPOS{$pos};
        print STDOUT "$stem, $pos, ((lex $stem) (pos $pos))\n";
    }
}