# need to use perl v5.8.8

# Mapudungun morphology wrapper by Erik Peterson, eepeter@cs.cmu.edu
# May, 2005

use strict;

############
# Change these variables to match local situation
############
#my($javaexec) = "/usr/bin/java";

my($javaexec) = "/afs/cs.cmu.edu/user/cmonson/java"; # or just "java" if in your path

my($dirdelim) = ":";                                     # or ";" for Windows

# dir with jar/parse subdirs
my($morphdir) = "/afs/cs.cmu.edu/project/avenue-1/Avenue/Mapu-MT/Morph/"; 

############

use IO::Socket::INET;
use Net::hostent;
use FileHandle;
use File::Temp "tempfile";
use Encode;

use POSIX ":sys_wait_h";

use IPC::Open2;
use IPC::Open3;

use POSIX qw(locale_h);
setlocale(LC_CTYPE, "es_ES");
setlocale(LC_COLLATE, "es_ES");

my(@classpaths);
@classpaths = ("");
$ENV{'CLASSPATH'} = join $dirdelim, @classpaths;


# Auto-flush
$| = 1;

my($TRUE)  = 1;
my($FALSE) = 0;

my($HUMAN_READABLE) = $FALSE;
my($SERVERPORT) = 5781;

parseCommandLine(@ARGV);


# Open server port
my($server);
$server = IO::Socket::INET->new(LocalPort => $SERVERPORT,
				Type => SOCK_STREAM,
				Reuse => 1,
				Listen => 12)
    or die "Couldn't be a tcp server on port $SERVERPORT: $!\n";

print STDOUT "\nStarting PERL Mapudungun morphology analysis server on port $SERVERPORT\n"; #; $cachecount words cached\n";

#$SIG{CHLD} = sub { wait };
$SIG{CHLD} = 'IGNORE';

# start up the Mapudungun morphology analyzer
my($jarfile)        = $morphdir . "activelyWorking/MapudungunMorphologyAnalysis/mapudungunMorphologyAnalysis/MapudungunMorphologyAnalysis.jar";
#my($jarfile)        = $morphdir . "stable/MapudungunMorphologyAnalysis.jar";

my($stemDataFile)   = $morphdir . "MapudungunStemLexicon.txt";
my($suffixDataFile) = $morphdir . "MapudungunSuffixLexicon.txt";
#my($stemDataFile)   = $morphdir . "activelyWorking/MapudungunStemLexicon-Azumchefi-Reconciled.txt";
#my($suffixDataFile) = $morphdir . "activelyWorking/MapudungunSuffixLexicon-Azumchefi-withSpanishGlosses.txt";

my(@args);
@args = ("-jar", $jarfile, $stemDataFile, $suffixDataFile);

# The pipe way that doesn't work in Red Hat 9 -- This is perl's fault
#my($childpid, $childsOut, $childsIn);
#my($childsErr);
#$childpid = open2($childsOut, $childsIn, $javaexec, @args)
#    or die "can't open pipe to $javaexec: $!";
#$childpid = open3($childsIn, $childsOut, $childsErr, $javaexec, @args)
#    or die "can't open pipe to $javaexec: $!";


my($host) = `echo \$HOST`;
chomp $host;
print STDERR "Host: $host\n";

my($portToJavaMapuMorphology) = $SERVERPORT + 1;
print STDERR "In Perl: Port between Perl and Java: $portToJavaMapuMorphology\n";

# See "Programming Perl" Ch. 16 Interprocess Communication, Pipes (p. 426) for using
#   open to start a child process that the parent process doesn't block for the return of.
#   I never actually write to STARTTHEJAVA (because there is no way for the child Java
#   morphology analyzer to write back the analyzed Mapudunugn word.)  But this is the
#   best way I could find to start a child process in java that won't block.  The ``
#   operator, for example, blocks until the child process (and all the child process's 
#   descendent processes return.)
open STARTTHEJAVA, "| $javaexec -jar $jarfile $stemDataFile $suffixDataFile $portToJavaMapuMorphology";

my($sleepCounter) = 0;
my($socketToJavaMapuMorphology) = $FALSE;
while ( ! $socketToJavaMapuMorphology) {
    $sleepCounter++;
    if ($sleepCounter <= 20) {
        # Go to sleep in the perl because it takes a moment for the Java morphology analyzer server to get
        #   set up.
	print STDERR "\nTry #$sleepCounter (of a max of 20) to connect from Perl to Java . . .\n";
	sleep(1);
    } else {
	die "\n\nCounldn't connect from Perl to Java: $host:$portToJavaMapuMorphology : $!\n\n\n";
    }
    $socketToJavaMapuMorphology = IO::Socket::INET->new(PeerAddr => $host,
							PeerPort => $portToJavaMapuMorphology,
							Proto    => "tcp",
							Type     => SOCK_STREAM);
}

print STDERR "\n\nConnected from Perl to Java\n\n";

binmode $socketToJavaMapuMorphology, ":utf8";
#binmode $socketToJavaMapuMorphology, ":encoding(latin1)";


# The f-structures that are passed to the transfer engine
# need to keep track of the span indexes of each morpheme or unit
# that is associated with a particular f-structure.  And since
# each word in a sentence starts where the previous word ended,
# $spanTagBase keeps track of where the span index is in the sentence.
my($spanTagBase);
my($allUniqueCompleteFStructures);


my($client, $kidpid);
my($sentence);
my(@words);
my($wordPlusNewLine);
my($resultFromMorphologyAnalyzer);
my($toWriteToClient);
# Wait for connections
while ($client = $server->accept()) {
    if ($kidpid = fork) {
	close $client;
	next;
    }

    defined($kidpid) or die "cannot fork: $!";

    close $server;

    print "[Connection established to the #### PERL #### Mapudungun Morphology Server]\n";

    # Read Mapudungun sentence
    while ($sentence = <$client>) {

	$sentence = decode("utf-8", $sentence);

	if ($sentence eq "**EXIT**") {
	    last;
	}

	$toWriteToClient = "";

	# Process input, read analyzed form, convert to feature structure
	# ...

	print "Processing incoming sentence: |$sentence|\n";

	$spanTagBase = 0;
	$allUniqueCompleteFStructures = "";

	# remove final newlines etc.
	chomp $sentence;
	
	# Split on whitespace
	@words = split /\s+/, $sentence;
	
	foreach my $word (@words) {
	    
	    # The java Mapudungun morphology analyzer uses readLine() which requires an end-of-line
	    #   character after each input
	    $wordPlusNewLine = $word . "\n";
	    
	    print "The input that is being sent to the java morphology analyzer is : |$wordPlusNewLine|\n";
	    
	    # Send the input to the java morphology analyzer
	    print $socketToJavaMapuMorphology $wordPlusNewLine;
	    flush $socketToJavaMapuMorphology;

	    print "got past the input being sent to the java morphology analyzer\n";
	    
	    # Read back the result of the analysis
	    $resultFromMorphologyAnalyzer = <$socketToJavaMapuMorphology>;
	    #$resultFromMorphologyAnalyzer = <$socketToJavaMapuMorphology>;
	    
	    print "The result from the analyzer is : |";
	    print $resultFromMorphologyAnalyzer;
	    print "|\n";
	    
	    if ($HUMAN_READABLE) {

		$toWriteToClient .= doHumanReadable($resultFromMorphologyAnalyzer);

	    } else {  # produce machine interpretable output

		my($uniqueMorphemeFStructuresWithTheirSpanTags);
		($uniqueMorphemeFStructuresWithTheirSpanTags, $spanTagBase) = 
		    doMachineReadable($resultFromMorphologyAnalyzer, $word, $spanTagBase);
		$toWriteToClient .= $uniqueMorphemeFStructuresWithTheirSpanTags;

	    }
	}

	
	if ( ! $HUMAN_READABLE) {
	    # post-processing to get output into the exact form that the transfer engine expects
	    print $toWriteToClient;
	    $toWriteToClient =~ s/\s+/ /g;
	    $toWriteToClient =~ s/\" \)/\"\)/;
	    $toWriteToClient =~ s/\( LEX\s+\)/\( LEX \"\"\)/;
	    
	    $toWriteToClient =~ s/[\r\n]*$/ /;
	    $toWriteToClient =~ s/\( LEX /\( lex /;
	    $toWriteToClient =~ s/\( POS /\( pos /;
	}

	if ( ! $HUMAN_READABLE) {
	    print "\nThe Final f-structure of this sentence is:\n\n\t";
	} else {
	    print "\nThe Human readable morphological analysis is:\n\n";
	}
	print "$toWriteToClient\n\n";
	

	$toWriteToClient = encode("utf-8", $toWriteToClient);

	# Send feature structures back to xfer engine OR human readable text back to client
	# that needs human readable text.
	print $client "$toWriteToClient\n";
	
    }

    print "[Connection closed]\n";
    close($client);
    exit;
}

print "I should never get here in the perl morphology analyzer wrapper because I loop forever\n";

sub doMachineReadable {
    my($resultFromMorphologyAnalyzer, $word, $spanTagBase) = @_;

    # If the morphology analyzer couldn't analyze this word return a simple f-structure
    my($uniqueMorphemeFStructuresWithTheirSpanTags);
    if ($resultFromMorphologyAnalyzer =~ /^\s*$/) {
	$uniqueMorphemeFStructuresWithTheirSpanTags = "( ( SPANSTART $spanTagBase ) ( SPANEND " 
	    . ($spanTagBase + 1) 
	    . " ) ( lex " . uc($word) . " ) ( pos lex ) ) ";
	$spanTagBase++;
    } else {
	($uniqueMorphemeFStructuresWithTheirSpanTags, $spanTagBase) = 
	    getUniqueMorphemeFStructuresAndAssignTheirSpanTags($resultFromMorphologyAnalyzer, 
							       $word,
							       $spanTagBase);
    }
    print "After assigning Span Tags the f-structures are : |";
    print $uniqueMorphemeFStructuresWithTheirSpanTags;
    print "|\n";
    
    return ($uniqueMorphemeFStructuresWithTheirSpanTags, $spanTagBase);
}

sub doHumanReadable {
    my($resultFromMorphologyAnalyzer) = @_;

    my($humanReadable) = "";

    my($lex, $pos, $Sp);

    if ($resultFromMorphologyAnalyzer =~ /^\s*$/) {
	return "<small>lo siento, pero aún no puedo analizar esta palabra</small>\n";
    }

    # The analyses returned from the Mapudungun java-implemented analyzer
    # are semi-colon separated
    my(@analyses) = split /\s*;\s*/, $resultFromMorphologyAnalyzer;

    foreach my $analysis (@analyses) {

	print STDOUT "\nDoing the analysis: $analysis\n\n";

	my(@morphemes) = split /\)\s*,\s*\(/, $analysis;  # can't split just on , because the spanish-gloss may have commas in it

	my($firstMorpheme) = $TRUE;
	foreach my $morpheme (@morphemes) {
	    print STDOUT "   morpheme: $morpheme\n";
	    $morpheme =~ /\(\s*lex \s*(\S+)\s*\)/;
	    $lex = $1;
	    if ($firstMorpheme) {
		$firstMorpheme = $FALSE;
	    } else {
		$humanReadable .= "-";
	    }
	    $humanReadable .= $lex;
	}
	$humanReadable .= " \n";    # an extra return between analyses but \n\n means end of morphology to the server
	                            # so the space in " \n" is very important

	$firstMorpheme = $TRUE;
	foreach my $morpheme (@morphemes) {
	    $morpheme =~ /\(\s*lex \s*(\S+)\s*\)/;
	    $lex = $1;
	    $humanReadable .= "  $lex";
	    if ($firstMorpheme) {
		$firstMorpheme = $FALSE;
		$morpheme =~ /\(\s*pos \s*(\S+)\s*\)/;
		$pos = $1;
		$humanReadable .= ", $pos";

		$morpheme =~ /\(\s*spanish \s*(.+?)\s*\)/;  # +? is the non-greedy '+' quantifier 
		$Sp = $1;
		$humanReadable .= ", $Sp\n";
	    } else {
		$morpheme =~ /\(\s*spanish-gloss \s*(.+?)\s*\)/;  # +? is the non-greedy '+' quantifier
		$Sp = $1;
		$humanReadable .= ", $Sp\n";
	    }
	}
	$humanReadable .= " \n";  # an extra return between analyses but \n\n means end of morphology to the server
	                          # so the space in " \n" is very important
    }
    $humanReadable .= "\n\n";

    return $humanReadable;
}

sub getUniqueMorphemeFStructuresAndAssignTheirSpanTags {
    my($rawFStructures, 
       $word,
       $spanBase) = @_;

    my($DEBUG) = 1;

    if ($DEBUG > 0 ) {
	print STDOUT "In getUniqueMorphemeFStructuresAndAssignTheirSpanTags()...\n";
	print STDOUT "The word whose analyses we are (hopefully) examining is : |$word|\n";
	print STDOUT "\nthe raw f-structures passed to this function all as one string are: \n";
	print STDOUT "\t|$rawFStructures|\n\n";
    }

    my($wordLength) = length($word);

    # The analyses returned from the Mapudungun java-implemented analyzer
    # are semi-colon separated
    my(@analyses) = split /\s*;\s*/, $rawFStructures;

    if ($DEBUG > 0) {
	print STDOUT "Separated, the analyses of the word $word are : \n";
	foreach my $analysis (@analyses) {
	    print STDOUT "\t|$analysis|\n\n";
	}
    }

    # an array of arrays
    # The first index corrosponds to the index in @analyses
    # The second index identifies (an F-structure of) a morpheme within that analysis 
    my(@morphemeFStructuresByAnalysis) = ();
    my(@morphemeGraphsByAnalysis);

    # An array of arrays
    # The first index corrosponds to indexes in @analyses
    # The second index initially corrosponds to a character index into the orginal $word
    #   but the MAIN function of this subroutine is to splice out indexes in @spanStartPositions
    #   which do not have any morpheme in any analysis that begins from that index
    my(@spanStartPositions) = ();

    my($FStructures) = "";
    my(%uniqueFStructures) = ();
    
    my($morphemeFStructure, $morphemeGraph);
    my($analysis);
    for (my $row = 0; $row < scalar(@analyses); $row++) {
	$analysis = $analyses[$row];

	if ($DEBUG > 0) {
	    print STDOUT "Separating out the morphemes in the analysis:\n\t$analysis\n\n";
	}

	@{$morphemeFStructuresByAnalysis[$row]} = split /\s*,\s*/, $analysis;
	for (my $morphemeIndex = 0; 
	     $morphemeIndex < scalar(@{$morphemeFStructuresByAnalysis[$row]}); 
	     $morphemeIndex++) {
	    
	    $morphemeFStructure = $morphemeFStructuresByAnalysis[$row][$morphemeIndex];

	    if ($DEBUG > 0) {
		print STDOUT "The f-structure of a single morpheme in this analysis:\n\t$morphemeFStructure\n\n";
	    }

	    $morphemeFStructure =~ /\(\s*lex\s+(\S+)\s*\)/; 
	    $morphemeGraph = $1;
	    $morphemeGraphsByAnalysis[$row][$morphemeIndex] = $morphemeGraph;
	}
    }

    if ($DEBUG > 0) {
	for (my $row = 0; $row < scalar(@analyses); $row++) {
	$analysis = $analyses[$row];

	print STDOUT "The morphemes in the analysis:\n\t$analysis\n\t";
	
	for (my $morphemeIndex=0; 
	     $morphemeIndex<scalar(@{$morphemeFStructuresByAnalysis[$row]}); 
	     $morphemeIndex++) {

	    print STDOUT "|", $morphemeGraphsByAnalysis[$row][$morphemeIndex], "| ";
	}
	print STDOUT "\n\n";
    }
	
    }
    
    # Initialize every character position in $spanStartPositions to hold an empty string
    for (my $col = 0; $col < $wordLength; $col++) {
	for (my $row = 0; $row < scalar(@analyses); $row++) {
	    $spanStartPositions[$row][$col] = "";
	}
    }
    
    # Set the array values at the character index where each morpheme starts
    my($charIndex);
    for (my $row = 0; $row < scalar(@analyses); $row++) {
	$charIndex = 0;
	for (my $col = 0; $col < @{$morphemeGraphsByAnalysis[$row]}; $col++) {
	    $spanStartPositions[$row][$charIndex] = $morphemeGraphsByAnalysis[$row][$col];
	    $charIndex += length($morphemeGraphsByAnalysis[$row][$col]);
	}
    } 

    # Collapse empty indices
    my($allempty);
    my($beyondEndOfSpanStartPositions) = $FALSE;
    for (my $col = 0; $col < @{$spanStartPositions[0]}; $col++) {
	$allempty = $TRUE;

	# Loop here in case several empty rows in a row:
	#   When we splice out a $col we don't want to increment
	#   $col since the entire array actually moved to the left for us
	do {
	    #print "Looking for empty space $col\n";

	    # Special case exit: If all analyses end in morphemes
	    #  that are several characters long, then when we remove
	    #  those indexes from the rows of $spanStartPositions
	    #  we may find ourselves off the end of the 
	    #  $spanStartPositions array
	    if ($col >= scalar(@{$spanStartPositions[0]})) {
		$beyondEndOfSpanStartPositions = $TRUE;
	    }
	    last if $beyondEndOfSpanStartPositions;
	    
	    # if NO analysis contains an morpheme that begins at this $col
	    for (my $row = 0; $row < @spanStartPositions; $row++) {
		if ($spanStartPositions[$row][$col] ne "") {
		    $allempty = 0;
		}
	    }
	    # then splice out this $col
	    if ($allempty) {
		for (my $row = 0; $row < @spanStartPositions; $row++) {
		    splice @{$spanStartPositions[$row]}, $col, 1;
		}
	    }
	} while ($allempty);
    }
    
    # From @spanStartPositions calculate the SPANSTART and SPANEND tags 
    # And insert these tags into the f-structure for that morpheme
    my($morphemeIndex);
    my($spanStart);
    my($FStructure);
    my($adjustedSpanStart, $adjustedSpanEnd);
    for (my $row = 0; $row < @spanStartPositions; $row++) {
	$morphemeIndex = 0;
	for (my $colInSpanStartPositions = 0; $colInSpanStartPositions < @{$spanStartPositions[$row]}; ) {
	    $spanStart = $colInSpanStartPositions;
	    $colInSpanStartPositions++;
	    while ($colInSpanStartPositions < @{$spanStartPositions[$row]} 
		   and $spanStartPositions[$row][$colInSpanStartPositions] eq "") {
		$colInSpanStartPositions++;
	    }
	    
	    $FStructure = $morphemeFStructuresByAnalysis[$row][$morphemeIndex];
	    $adjustedSpanStart = $spanBase + $spanStart;
	    $adjustedSpanEnd   = $spanBase + $colInSpanStartPositions;
	    $FStructure =~ s/^\s*\(/\( \( SPANSTART $adjustedSpanStart \) \( SPANEND $adjustedSpanEnd \)/;
	    $uniqueFStructures{$FStructure} = 1;
	    
	    $morphemeIndex++;
	}	    
    }

    foreach my $FStructure (sort byindex keys %uniqueFStructures) {
	$FStructures .= $FStructure . " ";
    }

    $spanBase += scalar(@{$spanStartPositions[0]});

    return ($FStructures, $spanBase);
}

# Send $fss back to xfer engine
# ...

sub byindex {
    my($astart, $aend) = ($a =~ m/SPANSTART\s+(\d+).*?SPANEND\s+(\d+)/i);
    my($bstart, $bend) = ($b =~ m/SPANSTART\s+(\d+).*?SPANEND\s+(\d+)/i);

    $astart <=> $bstart or
	$aend <=> $bend;
}

sub parseCommandLine {
    my(@argv) = @_;

    my($index) = 0;
    while ($index < scalar(@argv)) {
	if ($argv[$index] =~ /^((-h)|(-help))$/) {
	    print STDERR "argv[$index] = " . $argv[$index] . "\n";
	    printUsageAndExit();
	} elsif ($argv[$index] =~ /^-port$/) {
	    $index++;
	    if ($index < scalar(@argv)) {
		if ($argv[$index] =~ /^\d+/) {
		    $SERVERPORT = $argv[$index];
		} else {
		    print STDERR "\nThe -port flag requires a numeric port to be specified.\n";
		    print STDERR "Instead " . $argv[$index] . " was specified\n\n";
		    printUsageAndExit();
		}
	    } else {
		print STDERR "\nThe -port flag requires a numeric port to be specified.\n";
		print STDERR "Instead nothing followed the -port flag\n\n";
		printUsageAndExit();
	    }
	} elsif ($argv[$index] =~ /^-humanReadable$/) {
	    $HUMAN_READABLE = $TRUE;
	}

	$index++;
    }
}

sub printUsageAndExit {
    print STDERR "\nUSAGE: perl mapu_morph.pl [-port <port-number>] [-humanReadable]\n\n";
    print STDERR "  The default port number is: 5781\n";
    print STDERR "  Use the -humanReadable flag if the output is to be interpreted by humans.\n";
    print STDERR "    The default is machine readable which sports SPAN tags and *lots* of parenthses\n\n";
    print STDERR "  Exiting...\n\n";
    exit(0);
}