#!/usr/local/bin/perl -w

use IO::Socket::INET;
use Net::hostent;
use FileHandle;
use Expect;

use POSIX ":sys_wait_h";

$morphdir = "/shared/code/Hindi/hindi_chunker/morph";
$converterdir = "/shared/code/Hindi/Converters";

if (scalar(@ARGV) == 1 and $ARGV[0] =~ m/^\d+$/) {
    $serverport = $ARGV[0];
} else {
    $serverport = 3125;
}


%tagsetmap = ("v",	"V",
	      "n",	"N",
	      "P",	"Pron",
	      "p",	"Postp",
	      "adj",	"Adj",
	      "Avy",	"avy",
	      "D",	"Adv",
	      "sh_n",	"Poss",
	      "sh_P",	"sh_P" # ??
	      );



sub ucs2utf8 {
    my($ucsstring) = @_;
    my($utf8str, $ucschar, $i, $bin1, $bin2, $bin3, $retval);
    my($byteorder) = "le";
    $utf8str = "";
    # Check for byte order mark
    $possbom = substr($ucsstring, 0, 2);
    $binchar = vec($possbom, 0, 16);
    my($start) = 0;
    if ($binchar == 0xfeff) {
	$byteorder = "le";
	$start = 2;
    } elsif ($binchar == 0xfffe) {
	$byteorder = "be";
	$start = 2;
    }

    #print "Byte order is $byteorder\n";
    for ($i = $start; $i < length($ucsstring); $i+=2) {
	$ucschar = substr($ucsstring, $i, 2);  # might need to check big/little endian
	if ($byteorder eq "be") {
	    $ucschar = substr($ucschar, 1, 1) . substr($ucschar, 0, 1);
	} 
	$binchar = vec($ucschar, 0, 16);
	if ($binchar <= 127) {
	    $retval = pack("C", $binchar);
	} elsif ($binchar <= 2047) {
	    $bin1 = ($binchar >> 6) | 0xC0;
	    $bin2 = ($binchar & 0x3F) | 0x80;
	    $retval = pack("C2", $bin1, $bin2);
	} else {
	    $bin1 = ($binchar >> 12) | 0xE0;
	    $bin2 = (($binchar & 0x0FFF) >> 6) | 0x80;
	    $bin3 = ($binchar & 0x003F) | 0x80;
	    $retval = pack("C*", $bin1, $bin2, $bin3);
#	#print "in 3 char version with $hexchar and $retval bin1 $bin1 bin2 $bin2 bin3 $bin3\n";
	}
	$utf8str .= $retval;
    }
    return $utf8str;
}

sub utf82ucs {
    my($utfstring) = @_;
    my($unichar, $unival, $unistring, $i, $int1, $int2, $int3, $byte1, $byte2, $byte3);

    $i = 0;
    while ($i < length($utfstring)) {
	$byte1 = substr($utfstring, $i, 1);
	if (unpack("C", $byte1) <= 0x7F) { # 1 byte long (ASCII)
	    $unichar = pack("C", 0x00) . $byte1;
	    $i++;
	} elsif ((unpack("C", $byte1) & 0xE0) == 0xC0) { # 2 bytes long
	    $byte2 = substr($utfstring, $i+1, 1);
	    $int1 = unpack("C", $byte1) & 0x1F;
	    $int1 <<= 0x06;
	    $int2 = unpack("C", $byte2) & 0x3F;
	    $unival = $int1 | $int2;
	    $unichar = pack("CC", (0xFF00 & $unival) >> 8, (0x00FF & $unival));
	    $i += 2;
	} else {  # 3 bytes long
	    $byte2 = substr($utfstring, $i+1, 1);
	    $byte3 = substr($utfstring, $i+2, 1);

	    $int1 = 0x0F & unpack("C", $byte1);
	    $int1 <<= 12;
	    $int2 = 0x3F & unpack("C", $byte2);
	    $int2 <<= 6;
	    $int3 = 0x3F & unpack("C", $byte3);
	    $unival = $int1 | $int2 | $int3;
	    $unichar = pack("CC", (0xFF00 & $unival) >> 8, (0x00FF & $unival));
	    $i += 3;
	}
	$unistring .= $unichar;
    }
    $unistring;
}

%morphcache = ();

$Expect::Log_Stdout = 0;
$Expect::Multiline_Matching = 0;


$| = 1;


$server = IO::Socket::INET->new(LocalPort => $serverport,
				Type => SOCK_STREAM,
				Reuse => 1,
				Listen => 10)
    or die "Couldn't be a tcp server on port $serverport: $!\n";

print "Starting Hindi morphology server on port $serverport\n\n";

#$SIG{CHLD} = 'IGNORE';
$SIG{CHLD} = sub { wait };


while ($client = $server->accept()) {
    if ($kidpid = fork) {
	close $client;
	next;
    }

    defined($kidpid) or die "cannot fork: $!";

    close $server;
    

    # Start all the encoding converters
    ($morph = Expect->spawn("$morphdir/anusAraka/hindi/morph/test/morph.out $morphdir/anusAraka/hindi/morph/test $morphdir/anusAraka/hindi/dict/dict_final $morphdir/anusAraka/hindi/test ULDWHF")) || die "Couldn't spawn morph, $!";
    $morph->exp_stty('raw -echo');

    ($uc2i8 = Expect->spawn("./uni2iscii.pl")) || die "Couldn't spawn ucs to iscii8, $!";
    $uc2i8->exp_stty('raw -echo');

    ($i82wx = Expect->spawn("$converterdir/i8_wx/d8_ra_wp_r.out")) || die "Couldn't spawn i8 to wx, $!";
    $i82wx->exp_stty('raw -echo');

    ($wx2i8 = Expect->spawn("$converterdir/wx_i8/ra_d8_wp_r.out")) || die "Couldn't spawn wx to i8, $!";
    $wx2i8->exp_stty('raw -echo');

    ($i82uc = Expect->spawn("$converterdir/i8_uc/iscii-to-unicode-ver2.out big_endian")) || die "Couldn't spawn i8 to utf16, $!";
    $i82uc->exp_stty('raw -echo');
  
    

    $hostinfo = gethostbyaddr($client->peeraddr);
    printf "[Connect from %s]\n", $hostinfo->name || $client->peerhost;    

    while ($word = <$client>) {
	$word =~ s/[ \r\n]*$//;
	if ($word eq "**EXIT**") {
	    last;
	}
	print "\n* Processing \"$word\", length " . length($word) . "\n";
	# Keep a cache of results of previously seen words to speed things up
	if (defined($morphcache{$word})) {
	    print "Found in cache: " . $morphcache{$word} . "\n";
	    print $client $morphcache{$word} . "\n";
	    next;
	}
	# Don't bother with empty lines
	if ($word =~ m/^\s*$/) {
	    print $client "()\n";
	    next;
	}


	$wxinput = 0;
	# If all alphabetic, assume it's in WX Roman, otherwise, assume UTF8 and convert to WX
	if ($word =~ m/^[a-zA-Z]+$/) {
	    $wxword = $word . "\n";
	    $wxinput = 1;
	} else {
	    # Check if in UTF-8 (or could be in UTF-8)
	    $isutf8 = 1;
	    for ($byteindex = 0; $byteindex < length($word); $byteindex++) {
		if (vec($word, $byteindex, 8) < 128) {
		    $isutf8 = 0;
		}
	    }
	    if ($isutf8 == 0) {
		print $client "()\n";
		next;
	    }

	    $wxinput = 0;
	    $ucsword = utf82ucs("$word\n");
	    #print "UTF16 output $ucsword";
	    $uc2i8->send("$ucsword");
	    $uc2i8->expect(1, '-re', '^[^\n]*\r?\n');
	    $i8word = $uc2i8->exp_match();
	    #print "ISCII output $i8word";

	    $i82wx->send($i8word);

	    $i82wx->expect(1, '-re', '^[^\n]+\r?\n');
	    if (defined($i82wx->exp_match())) {
		$wxword = $i82wx->exp_match();
		#print "WX is ", $i82wx->exp_match();
	    } else {
		print $client "()\n";
		next;
	    }
	}
	
	print "WX-Roman $wxword\n";

	$morph->send("$wxword");

	$morph->expect(1, '-re', '^[^\n]*\r?\n'); #, '-re', 'ROOT.*/', '^\r?$');
	if (defined($morph->exp_match())) {
	    print "Morph output is ", $morph->exp_match();
	    $features = $morph->exp_match();
	    if ($features =~ m/^\s*$/) {
		print $client "()\n";
		next;
	    }

	    # Process to f-struct
	    if ($features =~ m/^\s*$/) {
		print $client "()\n";
	    } else {
		(@fstrucs) = ($features =~ m/([^\/]+\/)/g);
		#print "Fstruc size is ", scalar(@fstrucs);
		$output = "";
		foreach $fstruc (@fstrucs) {
		    $fstruc =~ m/_([^\{]+)\{/;
		    $root = $1;
		    $fstruc =~ m/\{([^\}]+)\}/;
		    (@fvpairs) = split(/\s+/, $1);
		    $output .= "( ";
		    if ($fstruc =~ m/CAT:v/) {
			$root .= "nA";
		    }
		    #$wxinput = 0; # used for testing
		    if ($wxinput) { # If in it came in in WX, send it back out in WX
			$output .= "(lex $root) ";
		    } else {
			# Convert to ISCII8
			#print "Converting WX $root to UTF8\n";
			$wx2i8->send($root . "\n");
			$wx2i8->expect(1, '-re', '^[^\n]+\r?\n');
			$i8root = $wx2i8->exp_match();
			#print "ISCII8 output $i8root";

			# Convert to UTF16
			$i82uc->send($i8root);
			$i82uc->expect(1, '-re', '^[^\n]+\r?\n');
			$ucroot = $i82uc->exp_match();
			#print "UTF16 output $ucroot";

			# Convert root to UTF-8
			$u8root = ucs2utf8($ucroot);
			#print "UTF8 output $u8root";
			chomp($u8root);
			$output .= "(lex $u8root) ";
			
		    }
		    foreach $fvpair (@fvpairs) {
			($feat, $val) = split(/:/, $fvpair);
			if ($feat eq "CAT") {
			    $feat = "pos";
			    $val = $tagsetmap{$val};
			}
			$output .= "(\L$feat\E $val) ";
		    }
		    $output .= ") ";
		}
		print "Returning $output\n";
		print $client "$output\n";
		$morphcache{$word} = $output;
	    }
	} else {
	    print "Nothing from morphology\n";
	    print $client "()\n";
	    $morphcache{$word} = "()";
	}
    }

    $morph->hard_close();
    $uc2i8->hard_close();
    $i82wx->hard_close();
    $wx2i8->hard_close();
    $i82uc->hard_close();

    print "[Closing out connection]\n\n";
    close($client);
    exit;
}

close($server);
