#!/usr/local/bin/perl -w -- -*- coding: utf-8 -*-

use IO::Socket::INET;
use Net::hostent;
use FileHandle;
use Expect;

use POSIX ":sys_wait_h";

$morphdir = "/shared/code/Hindi/hindi_chunker/morph";
$converterdir = "/shared/code/Hindi/Converters";

if (scalar(@ARGV) == 1 and $ARGV[0] =~ m/^\d+$/) {
    $serverport = $ARGV[0];
} else {
    $serverport = 3125;
    #$serverport = 3126;
}


%morphcache = ();

$cachecount = 0;

$usecache = 1;

# Load the morph cache
if ($usecache == 1) {
    open(MC, "morphcache.txt") or warn $!;
    $cachecount = 0;
    while (!eof(MC)) {
	$cachecount++;
	$word = <MC>;
	$word =~ s/[\n\r]*$//;
	$morphfs = <MC>;
	$morphfs =~ s/[\n\r]*$//;
	$morphcache{$word} = $morphfs;
    }
    close(MC);
}


%tagsetmap = ("v",	"V",
	      "v_a",	"Aux",
	      "Aux",	"Aux",
	      "RAHA",   "RAHA",
	      "v_n",	"V_n", # Verbal noun
	      "n",	"N",
	      "P",	"Pron",
	      "p",	"Postp",
	      "adj",	"Adj",
	      "Avy",	"Poss",
	      "D",	"Adv",
	      "sh_n",	"Poss", # Shasti Noun
	      "sh_P",	"PossPron" # Shasti Pronoun
	      );

%personmap = ("u", "1",
	      "m", "2",
	      "m_h", "2",
	      "a", "3",
	      "any", "(*OR* 1 2 3)"
	      );

%casemap = ("0", "dir",
	    "1", "obl",
	    "any", "(*OR* dir obl)"
	    );

%gendermap = ("m", "m",
	      "f", "f",
	      "any", "(*OR* m f)"
	      );

%numbermap = ("s", "s",
	      "p", "p",
	      "any", "(*OR* s p)"
	      );

%tammap = ("*yA*", "(aspect perf) (form part) ",
	   "*yA1*", "(aspect perf) (form part) ",
	   "*wA*", "(aspect imperf) (form part) ",
	   "*nA*", "(form inf) ",
	   "*future*", "(tense fut) (form fin) ",
	   "*subj*", "(tense subj) (form fin) ",
	   "*kara*", "(form prog) ",
	   "*0*", "(form root) ",
	   "*WA*", "(tense past) (form fin) ",
	   "*hE*", "(tense pres) (form fin) ",

	   "*Shashthi*", "(tam *Shashthi*) ",
	   "*ne*", "(tam *ne*) ",
	   "*se*", "(tam *se*) ",
	   "*meM*", "(tam *meM*) ",
	   "*ko*", "(tam *ko*) "
	   );

%spellcorrect = ("hEMH\n", "hEM\n");

#  %numbers = ("१", "1",
#  	    "२", "2",
#  	    "३", "3",
#  	    "४", "4",
#  	    "५", "5",
#  	    "६", "6",
#  	    "७", "7",
#  	    "८", "8",
#  	    "९", "9",
#  	    "०", "0"
#  	    );

sub getSpelling {
    my($orig) = shift;
    my($in, $variations);

    $orig =~ s/\n$//;

    $in = $orig;
    $in =~ s/([EeoOiI])M/$1z/g;
    if ($in ne $orig) { $variations .= "\t$in"; }

    $in = $orig;
    $in =~ s/([aeiouAEIOU])([eiI])$/$1y$2/g;	# y insertion as in gae/gaye
    if ($in ne $orig) { $variations .= "\t$in"; }

    $in = $orig;
    $in =~ s/M([kKgG])/f$1/g;
    $in =~ s/M([cCjJ])/F$1/g;
    $in =~ s/M([tTdD])/N$1/g;
    $in =~ s/M([wWxX])/n$1/g;
    $in =~ s/M([pPbB])/m$1/g;
    if ($in ne $orig) { $variations .= "\t$in"; }

    $in = $orig;
    $in =~ s/f([kKgG])/M$1/g;
    $in =~ s/F([cCjJ])/M$1/g;
    $in =~ s/N([tTdD])/M$1/g;
    $in =~ s/n([wWxX])/M$1/g;
    $in =~ s/m([pPbB])/M$1/g;
    if ($in ne $orig) { $variations .= "\t$in"; }

    $in = $orig;
    $in =~ s/M/z/g;
    if ($in ne $orig) { $variations .= "\t$in"; }

    $in = $orig;
    $in =~ s/$/H/g;
    if ($in ne $orig) { $variations .= "\t$in"; }

    $in = $orig;
    $in =~ s/([kctwpKCTWPgjdxbGJDXBfFNnmyrlvsSRh])a$/$1/g;
    if ($in ne $orig) { $variations .= "\t$in"; }

    $in = $orig;
    $in =~ s/iyAM$/iyAz/;
    if ($in ne $orig) { $variations .= "\t$in"; }

    $in = $orig;
    $in =~ s/Q/q/g;
    if ($in ne $orig) { $variations .= "\t$in"; }

    $variations =~ s/^\t+//;
    return $variations;
}





sub ucs2utf8 {
    my($ucsstring) = @_;
    my($utf8str, $ucschar, $i, $bin1, $bin2, $bin3, $retval);
    my($byteorder) = "le";
    $utf8str = "";
    # Check for byte order mark
    $possbom = substr($ucsstring, 0, 2);
    $binchar = vec($possbom, 0, 16);
    my($start) = 0;
    if ($binchar == 0xfeff) {
	$byteorder = "le";
	$start = 2;
    } elsif ($binchar == 0xfffe) {
	$byteorder = "be";
	$start = 2;
    }

    #print "Byte order is $byteorder\n";
    for ($i = $start; $i < length($ucsstring); $i+=2) {
	$ucschar = substr($ucsstring, $i, 2);  # might need to check big/little endian
	if ($byteorder eq "be") {
	    $ucschar = substr($ucschar, 1, 1) . substr($ucschar, 0, 1);
	} 
	$binchar = vec($ucschar, 0, 16);
	if ($binchar <= 127) {
	    $retval = pack("C", $binchar);
	} elsif ($binchar <= 2047) {
	    $bin1 = ($binchar >> 6) | 0xC0;
	    $bin2 = ($binchar & 0x3F) | 0x80;
	    $retval = pack("C2", $bin1, $bin2);
	} else {
	    $bin1 = ($binchar >> 12) | 0xE0;
	    $bin2 = (($binchar & 0x0FFF) >> 6) | 0x80;
	    $bin3 = ($binchar & 0x003F) | 0x80;
	    $retval = pack("C*", $bin1, $bin2, $bin3);
#	#print "in 3 char version with $hexchar and $retval bin1 $bin1 bin2 $bin2 bin3 $bin3\n";
	}
	$utf8str .= $retval;
    }
    return $utf8str;
}

sub utf82ucs {
    my($utfstring) = @_;
    my($unichar, $unival, $unistring, $i, $int1, $int2, $int3, $byte1, $byte2, $byte3);

    $i = 0;
    while ($i < length($utfstring)) {
	$byte1 = substr($utfstring, $i, 1);
	if (unpack("C", $byte1) <= 0x7F) { # 1 byte long (ASCII)
	    $unichar = pack("C", 0x00) . $byte1;
	    $i++;
	} elsif ((unpack("C", $byte1) & 0xE0) == 0xC0) { # 2 bytes long
	    $byte2 = substr($utfstring, $i+1, 1);
	    $int1 = unpack("C", $byte1) & 0x1F;
	    $int1 <<= 0x06;
	    $int2 = unpack("C", $byte2) & 0x3F;
	    $unival = $int1 | $int2;
	    $unichar = pack("CC", (0xFF00 & $unival) >> 8, (0x00FF & $unival));
	    $i += 2;
	} else {  # 3 bytes long
	    $byte2 = substr($utfstring, $i+1, 1);
	    $byte3 = substr($utfstring, $i+2, 1);

	    $int1 = 0x0F & unpack("C", $byte1);
	    $int1 <<= 12;
	    $int2 = 0x3F & unpack("C", $byte2);
	    $int2 <<= 6;
	    $int3 = 0x3F & unpack("C", $byte3);
	    $unival = $int1 | $int2 | $int3;
	    $unichar = pack("CC", (0xFF00 & $unival) >> 8, (0x00FF & $unival));
	    $i += 3;
	}
	$unistring .= $unichar;
    }
    $unistring;
}


sub initMorph {
    # Start all the encoding converters
    ($morph = Expect->spawn("$morphdir/anusAraka/hindi/morph/test/morph.out $morphdir/anusAraka/hindi/morph/test $morphdir/anusAraka/hindi/dict/dict_final $morphdir/anusAraka/hindi/test ULDWHF")) || die "Couldn't spawn morph, $!";
    $morph->exp_stty('raw -echo');

    ($uc2i8 = Expect->spawn("./uni2iscii.pl")) || die "Couldn't spawn ucs to iscii8, $!";
    $uc2i8->exp_stty('raw -echo');

    ($i82wx = Expect->spawn("$converterdir/i8_wx/d8_ra_wp_r.out")) || die "Couldn't spawn i8 to wx, $!";
    $i82wx->exp_stty('raw -echo');

    ($wx2i8 = Expect->spawn("$converterdir/wx_i8/ra_d8_wp_r.out")) || die "Couldn't spawn wx to i8, $!";
    $wx2i8->exp_stty('raw -echo');

    ($i82uc = Expect->spawn("$converterdir/i8_uc/iscii-to-unicode-ver2.out big_endian")) || die "Couldn't spawn i8 to utf16, $!";
    $i82uc->exp_stty('raw -echo');

}

$Expect::Log_Stdout = 0;
$Expect::Multiline_Matching = 0;


$| = 1;


$server = IO::Socket::INET->new(LocalPort => $serverport,
				Type => SOCK_STREAM,
				Reuse => 1,
				Listen => 12)
    or die "Couldn't be a tcp server on port $serverport: $!\n";

print "Starting Hindi morphology server on port $serverport; $cachecount words cached\n\n";

#$SIG{CHLD} = 'IGNORE';
$SIG{CHLD} = sub { wait };

$morphinited = 0;

while ($client = $server->accept()) {
    if ($kidpid = fork) {
	close $client;
	next;
    }

    defined($kidpid) or die "cannot fork: $!";

    close $server;
    
    $hostinfo = gethostbyaddr($client->peeraddr);
    printf "[Connect from %s]\n", $hostinfo->name || $client->peerhost;
    if ($hostinfo->name !~ m/cmu.edu$/i && $hostinfo->name !~ m/^localhost$/i) {
	print $client "[Not from CMU.  Closing out connection]\n\n";
	print "[Not from CMU.  Closing out connection]\n\n";
	close($client);
	exit;
    }


  
    $morphcache{"।"} = "()";


    while ($word = <$client>) {
	$word =~ s/[ \-\)\?\|\.\'\,\r\n]*$//;
	$word =~ s/^[\-\(\`\']*//;
	$word =~ s/^‘//;
	$word =~ s/’$//;
	if ($word eq "**EXIT**") {
	    last;
	}

	# Don't bother with empty lines
	if ($word =~ m/^\s*$/) {
	    $morphcache{$word} = "()";
	    print $client "()\n";
	    next;
	}

	print "\n* Processing \"$word\", length " . length($word) . "\n";
	# Keep a cache of results of previously seen words to speed things up
	if (defined($morphcache{$word})) {
	    print "Found in cache: " . $morphcache{$word} . "\n";
	    print $client $morphcache{$word} . "\n";
	    next;
	}
	
	# Don't bother with numbers
	if ($word =~ m/१|२|३|४|५|६|७|८|९|०/) {
	    $morphcache{$word} = "((pos lex) (lex \"$word\"))";
	    print $client "((pos lex) (lex \"$word\"))\n";
	    next;
	}

	if ($morphinited == 0) {
	    &initMorph();
	    $morphinited = 1;
	}


	$wxinput = 0;
	# If all alphabetic, assume it's in WX Roman, otherwise, assume UTF8 and convert to WX
	if ($word =~ m/^[a-zA-Z]+$/) {
	    $wxword = $word . "\n";
	    $wxinput = 1;
	} else {
	    # Check if in UTF-8 (or could be in UTF-8)
	    $isutf8 = 1;
	    for ($byteindex = 0; $byteindex < length($word); $byteindex++) {
		if (vec($word, $byteindex, 8) < 128) {
		    $isutf8 = 0;
		}
	    }
	    if ($isutf8 == 0) {
		$morphcache{$word} = "((pos lex) (lex \"$word\"))";
		print $client "((pos lex) (lex \"$word\"))\n";
		next;
	    }

	    $wxinput = 0;
	    $ucsword = utf82ucs("$word\n");
	    #print "UTF16 output $ucsword";
	    $uc2i8->clear_accum();
	    $uc2i8->send("$ucsword");
	    $uc2i8->expect(1, '-re', '^[^\n]*\r?\n');
	    $i8word = $uc2i8->exp_match();
	    #print "ISCII output $i8word";

	    $i82wx->clear_accum();
	    $i82wx->send($i8word);

	    $i82wx->expect(1, '-re', '^[^\n]+\r?\n');
	    if (defined($i82wx->exp_match())) {
		$wxword = $i82wx->exp_match();
		#print "WX is ", $i82wx->exp_match();
	    } else {
		$morphcache{$word} = "((pos lex) (lex \"$word\"))";
		print $client "((pos lex) (lex \"$word\"))\n";
		next;
	    }
	}
	
	# Fix some common spelling concerns
	$wxword =~ s/Z//g;
	$wxword =~ s/\?//g;
	if ($wxword =~ m/^\s*$/) {
	    $morphcache{$word} = "()";
	    print $client "()\n";
	    next;
	}
	
	
	if (defined($spellcorrect{$wxword})) {
	    $wxword = $spellcorrect{$wxword};
	}

	print "WX-Roman $wxword\n";

	$morph->clear_accum();
	$morph->send("$wxword");

	$morph->expect(1, '-re', '^[^\n]*\r?\n'); #, '-re', 'ROOT.*/', '^\r?$');
	`rm -f uword`;

	if (defined($morph->exp_match())) {
	    $features = $morph->exp_match();
	    if ($features =~ m/^\s*$/) {
		# Erik's spelling correction port
		$variantlist = &getSpelling($wxword);		
		@variants = split(/\t/, $variantlist);
		foreach $variant (@variants) {
		    print "Trying spelling $variant\n";
		    $morph->clear_accum();
		    $features = "";
		    $morph->send("$variant\n");
		    $morph->expect(1, '-re', '^[^\n\r]*\r?\n'); 
		    `rm -f uword`;
		    $features = $morph->exp_match();
		    if ($features !~ m/^\s*$/) {
			last;
		    }
		}     
		if ($features =~ m/^\s*$/) {
		    print "No spelling match\n";
		    $morphcache{$word} = "((pos lex) (lex \"$word\"))";
		    print $client "((pos lex) (lex \"$word\"))\n";
		    next;
		}
	    }
	    print "Morph output is $features";


	    # Process to f-struct
	    if ($features =~ m/^\s*$/) {
		$morphcache{$word} = "((pos lex) (lex \"$word\"))";
		print $client "((pos lex) (lex \"$word\"))\n";
		next;
	    } else {
		(@fstrucs) = ($features =~ m/([^\/]+\/)/g);
		#print "Fstruc size is ", scalar(@fstrucs);
		$output = "";
		foreach $fstruc (@fstrucs) {
		    $fstruc =~ m/_([^\{]+)\{/;
		    $root = $1;
		    if ($fstruc =~ m/CAT:v/ && $fstruc =~ m/TAM:\*yA\*/ && $root eq "raha") {
			#$root = "nA";
			$fstruc =~ s/CAT:v/CAT:RAHA/;
		    } elsif ($fstruc =~ m/CAT:v/ && $root eq "hE") {
			$root = "honA";
			$fstruc =~ s/CAT:v/CAT:Aux/;
		    } elsif ($fstruc =~ m/CAT:v/) {
			$root .= "nA";
		    } 
		    $fstruc =~ m/\{([^\}]+)\}/;
		    (@fvpairs) = split(/\s+/, $1);
		    $output .= "( ";
		    #$wxinput = 0; # used for testing
#		    if ($wxinput) { # If in it came in in WX, send it back out in WX
#			$output .= "(lex $root) ";
#		    } else {
		    # Convert to ISCII8
		    $wx2i8->clear_accum();
		    $i8root = "";
		    #print "Converting WX $root to UTF8\n";
		    $wx2i8->send($root . "\n");
		    $wx2i8->expect(1, '-re', '^[^\n]*\r?\n');
		    $i8root = $wx2i8->exp_match();
		    #print "ISCII8 output $i8root\n";
		    
		    # Convert to UTF16
		    $i82uc->clear_accum();
		    $ucroot = "";
		    $i82uc->send($i8root);
		    $i82uc->expect(1, '-re', '^[^\n]*\r?\n');
		    $ucroot = $i82uc->exp_match();
		    #print "UTF16 output $ucroot";
		    
		    # Convert root to UTF-8
		    $u8root = ucs2utf8($ucroot);
		    #print "UTF8 output $u8root";
		    chomp($u8root);
		    $output .= "(lex $u8root) (lexwx $root) ";
			
#		    }
		    foreach $fvpair (@fvpairs) {
			($feat, $val) = split(/:/, $fvpair);
			if ($feat eq "CAT") {
			    $feat = "pos";
			    $val = $tagsetmap{$val};
			} elsif ($feat eq "number") {
			    $val = $numbermap{$val};
			    $output .= "(agr ((num $val))) ";
			    next;
			} elsif ($feat eq "gender") {
			    $val = $gendermap{$val};
			    $output .= "(agr ((gen $val))) ";
			    next;
			} elsif ($feat eq "person") {
			    $val = $personmap{$val};
			    $output .= "(agr ((pers $val))) ";
			    next;
			} elsif ($feat eq "case") {
			    $val = $casemap{$val};
			} elsif ($feat eq "TAM") {
			    $output .= $tammap{$val};
			    next;
			} elsif ($feat eq "PDGM") {
			    next;
			}
			$output .= "(\L$feat\E $val) ";
		    }
		    $output .= ") ";
		}
		print "Returning $output\n";
		print $client "$output\n";
		$morphcache{$word} = $output;
	    }
	} else {
	    print "Nothing from morphology\n";
	    $morphcache{$word} = "((pos lex) (lex \"$word\"))";
	    print $client "((pos lex) (lex \"$word\"))\n";
	}
    }

    if ($morphinited == 1) {
	$morph->hard_close();
	$uc2i8->hard_close();
	$i82wx->hard_close();
	$wx2i8->hard_close();
	$i82uc->hard_close();
    }

    print "\n[Closing out connection]\n\n";
    close($client);
    exit;
}

close($server);
