#!/usr/local/bin/perl

# Mapudungun morphology wrapper by Erik Peterson, eepeter@cs.cmu.edu
# May, 2005

############
# Change these variables to match local situation
############
$javaexec = "/afs/cs.cmu.edu/user/cmonson/java"; # or just "java" if in your path

$dirdelim = ":";                                     # or ";" for Windows

# dir with jar/parse subdirs
$morphdir = "/afs/cs.cmu.edu/project/avenue-1/Avenue/Mapu-MT/Morph/"; 

############

use IO::Socket::INET;
use Net::hostent;
use FileHandle;
use File::Temp "tempfile";
#use Encode;

use POSIX ":sys_wait_h";

use IPC::Open2;

use POSIX qw(locale_h);
setlocale(LC_CTYPE, "es_ES");
setlocale(LC_COLLATE, "es_ES");


@classpaths = ("");
$ENV{'CLASSPATH'} = join $dirdelim, @classpaths;

# Default server port number is 5781; 
# Include port number as argument if want different port
if (scalar(@ARGV) == 1 and $ARGV[0] =~ m/^\d+$/) {
    $serverport = $ARGV[0];
} else {
    $serverport = 5781;
}

# Auto-flush
$| = 1;

my($TRUE)  = 1;
my($FALSE) = 0;


# Open server port
$server = IO::Socket::INET->new(LocalPort => $serverport,
				Type => SOCK_STREAM,
				Reuse => 1,
				Listen => 12)
    or die "Couldn't be a tcp server on port $serverport: $!\n";

print "Starting Mapudungun morphology server on port $serverport\n"; #; $cachecount words cached\n";

#$SIG{CHLD} = sub { wait };
$SIG{CHLD} = 'IGNORE';

# start up the Mapudungun morphology analyzer
$jarfile        = $morphdir . "stable/MapudungunMorphologyAnalysis.jar";


$stemDataFile   = $morphdir . "MapudungunStemLexicon.txt";
$suffixDataFile = $morphdir . "MapudungunSuffixLexicon.txt";

print STDOUT "\nStarting up Mapudungun morphology analyzer\n\n";

@args = ("-jar", $jarfile, $stemDataFile, $suffixDataFile);

$childpid = open2($childsOut, $childsIn, $javaexec, @args)
    or die "can't open pipe to $javaexec: $!";

# The f-structures that are passed to the transfer engine
# need to keep track of the span indexes of each morpheme or unit
# that is associated with a particular f-structure.  And since
# each word in a sentence starts where the previous word ended,
# $spanTagBase keeps track of where the span index is in the sentence.
my($spanTagBase);
my($allUniqueCompleteFStructures);


# Wait for connections
while ($client = $server->accept()) {
    if ($kidpid = fork) {
	close $client;
	next;
    }

    defined($kidpid) or die "cannot fork: $!";

    close $server;

    print "[Connection established]\n";

    # Read Mapudungun sentence
    while ($sentence = <$client>) {
	if ($sentence eq "**EXIT**") {

	    print "\nExplicitly exiting...\n\n";

	    last;
	}

	# Process input, read analyzed form, convert to feature structure
	# ...

	print "Processing incoming sentence: |$sentence|\n";

	$spanTagBase = 0;
	$allUniqueCompleteFStructures = "";

	# remove final newlines etc.
	chomp $sentence;
	
	# Split on whitespace
	@words = split /\s+/, $sentence;
	
	foreach $word (@words) {
	    
	    # The java Mapudungun morphology analyzer uses readLine() which requires an end-of-line
	    #   character after each input
	    $wordPlusNewLine = $word . "\n";
	    
	    print "The input that is being sent to the java morphology analyzer is : |$wordPlusNewLine|\n";
	    
	    # Send the input to the child java morphology analyzer
	    print $childsIn $wordPlusNewLine;

	    print "got past sending the the word to the java morphological analyzer\n";
	    
	    # Read back the result of the analysis
	    $resultFromMorphologyAnalyzer = <$childsOut>;
	    
	    print "The result from the analyzer is : |";
	    print $resultFromMorphologyAnalyzer;
	    print "|\n";
	    
	    # If the morphology analyzer couldn't analyze this word return a simple f-structure
	    if ($resultFromMorphologyAnalyzer =~ /^\s*$/) {
		$uniqueMorphemeFStructuresWithTheirSpanTags = "( ( SPANSTART $spanTagBase ) ( SPANEND " 
		    . ($spanTagBase + 1) 
		    . " ) ( lex " . uc($word) . " ) ( pos lex ) ) ";
		$spanTagBase++;
	    } else {
		($uniqueMorphemeFStructuresWithTheirSpanTags, $spanTagBase) = 
		    getUniqueMorphemeFStructuresAndAssignTheirSpanTags($resultFromMorphologyAnalyzer, 
								       $word,
								       $spanTagBase);
	    }
	    print "After assigning Span Tags the f-structures are : |";
	    print $uniqueMorphemeFStructuresWithTheirSpanTags;
	    print "|\n";

	    $allUniqueCompleteFStructures .= $uniqueMorphemeFStructuresWithTheirSpanTags;
	}

	

	# post-processing to get output into the exact form that the transfer engine expects
	print $allUniqueCompleteFStructures;
	$allUniqueCompleteFStructures =~ s/\s+/ /g;
	$allUniqueCompleteFStructures =~ s/\" \)/\"\)/;
	$allUniqueCompleteFStructures =~ s/\( LEX\s+\)/\( LEX \"\"\)/;
	
	$allUniqueCompleteFStructures =~ s/[\r\n]*$/ /;
	$allUniqueCompleteFStructures =~ s/\( LEX /\( lex /;
	$allUniqueCompleteFStructures =~ s/\( POS /\( pos /;

	print "\nThe Final f-structure of this sentence is:\n\n\t";
	print "$allUniqueCompleteFStructures\n\n";

        # Send feature structures back to xfer engine
        print $client "$allUniqueCompleteFStructures\n";
    }

    print "[Connection closed]\n";
    close($client);
    exit;
}


print "I should never get here in the perl morphology analyzer wrapper because I loop forever\n";

use strict;

sub getUniqueMorphemeFStructuresAndAssignTheirSpanTags {
    my($rawFStructures, 
       $word,
       $spanBase) = @_;

    my($DEBUG) = 1;

    if ($DEBUG > 0 ) {
	print STDOUT "In getUniqueMorphemeFStructuresAndAssignTheirSpanTags()...\n";
	print STDOUT "The word whose analyses we are (hopefully) examining is : |$word|\n";
	print STDOUT "\nthe raw f-structures passed to this function all as one string are: \n";
	print STDOUT "\t|$rawFStructures|\n\n";
    }

    my($wordLength) = length($word);

    # The analyses returned from the Mapudungun java-implemented analyzer
    # are comma separated
    my(@analyses) = split /\s*;\s*/, $rawFStructures;

    if ($DEBUG > 0) {
	print STDOUT "Separated, the analyses of the word $word are : \n";
	foreach my $analysis (@analyses) {
	    print STDOUT "\t|$analysis|\n\n";
	}
    }

    # an array of arrays
    # The first index corrosponds to the index in @analyses
    # The second index identifies (an F-structure of) a morpheme within that analysis 
    my(@morphemeFStructuresByAnalysis) = ();
    my(@morphemeGraphsByAnalysis);

    # An array of arrays
    # The first index corrosponds to indexes in @analyses
    # The second index initially corrosponds to a character index into the orginal $word
    #   but the MAIN function of this subroutine is to splice out indexes in @spanStartPositions
    #   which do not have any morpheme in any analysis that begins from that index
    my(@spanStartPositions) = ();

    my($FStructures) = "";
    my(%uniqueFStructures) = ();
    
    my($morphemeFStructure, $morphemeGraph);
    my($analysis);
    for (my $row = 0; $row < scalar(@analyses); $row++) {
	$analysis = $analyses[$row];

	if ($DEBUG > 0) {
	    print STDOUT "Separating out the morphemes in the analysis:\n\t$analysis\n\n";
	}

	@{$morphemeFStructuresByAnalysis[$row]} = split /\s*,\s*/, $analysis;
	for (my $morphemeIndex = 0; 
	     $morphemeIndex < scalar(@{$morphemeFStructuresByAnalysis[$row]}); 
	     $morphemeIndex++) {
	    
	    $morphemeFStructure = $morphemeFStructuresByAnalysis[$row][$morphemeIndex];

	    if ($DEBUG > 0) {
		print STDOUT "The f-structure of a single morpheme in this analysis:\n\t$morphemeFStructure\n\n";
	    }

	    $morphemeFStructure =~ /\(\s*lex\s+(\S+)\s*\)/; 
	    $morphemeGraph = $1;
	    $morphemeGraphsByAnalysis[$row][$morphemeIndex] = $morphemeGraph;
	}
    }

    if ($DEBUG > 0) {
	for (my $row = 0; $row < scalar(@analyses); $row++) {
	$analysis = $analyses[$row];

	print STDOUT "The morphemes in the analysis:\n\t$analysis\n\t";
	
	for (my $morphemeIndex=0; 
	     $morphemeIndex<scalar(@{$morphemeFStructuresByAnalysis[$row]}); 
	     $morphemeIndex++) {

	    print STDOUT "|", $morphemeGraphsByAnalysis[$row][$morphemeIndex], "| ";
	}
	print STDOUT "\n\n";
    }
	
    }
    
    # Initialize every character position in $spanStartPositions to hold an empty string
    for (my $col = 0; $col < $wordLength; $col++) {
	for (my $row = 0; $row < scalar(@analyses); $row++) {
	    $spanStartPositions[$row][$col] = "";
	}
    }
    
    # Set the array values at the character index where each morpheme starts
    my($charIndex);
    for (my $row = 0; $row < scalar(@analyses); $row++) {
	$charIndex = 0;
	for (my $col = 0; $col < @{$morphemeGraphsByAnalysis[$row]}; $col++) {
	    $spanStartPositions[$row][$charIndex] = $morphemeGraphsByAnalysis[$row][$col];
	    $charIndex += length($morphemeGraphsByAnalysis[$row][$col]);
	}
    } 

    # Collapse empty indices
    my($allempty);
    my($beyondEndOfSpanStartPositions) = $FALSE;
    for (my $col = 0; $col < @{$spanStartPositions[0]}; $col++) {
	$allempty = $TRUE;

	# Loop here in case several empty rows in a row:
	#   When we splice out a $col we don't want to increment
	#   $col since the entire array actually moved to the left for us
	do {
	    #print "Looking for empty space $col\n";

	    # Special case exit: If all analyses end in morphemes
	    #  that are several characters long, then when we remove
	    #  those indexes from the rows of $spanStartPositions
	    #  we may find ourselves off the end of the 
	    #  $spanStartPositions array
	    if ($col >= scalar(@{$spanStartPositions[0]})) {
		$beyondEndOfSpanStartPositions = $TRUE;
	    }
	    last if $beyondEndOfSpanStartPositions;
	    
	    # if NO analysis contains an morpheme that begins at this $col
	    for (my $row = 0; $row < @spanStartPositions; $row++) {
		if ($spanStartPositions[$row][$col] ne "") {
		    $allempty = 0;
		}
	    }
	    # then splice out this $col
	    if ($allempty) {
		for (my $row = 0; $row < @spanStartPositions; $row++) {
		    splice @{$spanStartPositions[$row]}, $col, 1;
		}
	    }
	} while ($allempty);
    }
    
    # From @spanStartPositions calculate the SPANSTART and SPANEND tags 
    # And insert these tags into the f-structure for that morpheme
    my($morphemeIndex);
    my($spanStart);
    my($FStructure);
    my($adjustedSpanStart, $adjustedSpanEnd);
    for (my $row = 0; $row < @spanStartPositions; $row++) {
	$morphemeIndex = 0;
	for (my $colInSpanStartPositions = 0; $colInSpanStartPositions < @{$spanStartPositions[$row]}; ) {
	    $spanStart = $colInSpanStartPositions;
	    $colInSpanStartPositions++;
	    while ($colInSpanStartPositions < @{$spanStartPositions[$row]} 
		   and $spanStartPositions[$row][$colInSpanStartPositions] eq "") {
		$colInSpanStartPositions++;
	    }
	    
	    $FStructure = $morphemeFStructuresByAnalysis[$row][$morphemeIndex];
	    $adjustedSpanStart = $spanBase + $spanStart;
	    $adjustedSpanEnd   = $spanBase + $colInSpanStartPositions;
	    $FStructure =~ s/^\s*\(/\( \( SPANSTART $adjustedSpanStart \) \( SPANEND $adjustedSpanEnd \)/;
	    $uniqueFStructures{$FStructure} = 1;
	    
	    $morphemeIndex++;
	}	    
    }

    foreach my $FStructure (sort byindex keys %uniqueFStructures) {
	$FStructures .= $FStructure . " ";
    }

    $spanBase += scalar(@{$spanStartPositions[0]});

    return ($FStructures, $spanBase);
}

# Send $fss back to xfer engine
# ...

sub byindex {
    my($astart, $aend) = ($a =~ m/SPANSTART\s+(\d+).*?SPANEND\s+(\d+)/i);
    my($bstart, $bend) = ($b =~ m/SPANSTART\s+(\d+).*?SPANEND\s+(\d+)/i);

    $astart <=> $bstart or
	$aend <=> $bend;
}

no strict;

