#!/usr/local/bin/perl -w -- -*- coding: utf-8 -*-

use strict;
use warnings;

use IO::Socket::INET;
use Net::hostent;
use FileHandle;
use File::Temp "tempfile";
use File::Path;

use POSIX ":sys_wait_h";

# $morphdir = "/shared/code/Hindi/hindi_chunker/morph";
# $converterdir = "/shared/code/Hindi/Converters";

my $serverport = 3125;

$serverport = $ARGV[0] if ($ARGV[0] =~ m/^\d+$/);



my %morphcache = ();
my $cachecount = 0;
my $usecache = 1;

# Load the morph cache
if ($usecache == 1) {
    open(MC, "morphcache.txt") or warn $!;
    $cachecount = 0;
    while (!eof(MC)) {
	$cachecount++;
	my $word = <MC>;
	$word =~ s/[\n\r]*$//;
	my $morphfs = <MC>;
	$morphfs =~ s/[\n\r]*$//;
	$morphcache{$word} = $morphfs;
    }
    close(MC);
}


$| = 1;


my $server = IO::Socket::INET->new(LocalPort => $serverport,
				Type => SOCK_STREAM,
				Reuse => 1,
				Listen => 12)
    or die "Couldn't be a tcp server on port $serverport: $!\n";

print "Starting Hindi morphology server on port $serverport; $cachecount words cached\n\n";

#$SIG{CHLD} = 'IGNORE';
$SIG{CHLD} = sub { wait };

while (my $client = $server->accept()) {
	my $kidpid = fork;
	if ($kidpid) {
		close $client;
		next;
	}
	
	defined($kidpid) or die "cannot fork: $!";
	
	close $server;

	my $hostinfo = gethostbyaddr($client->peeraddr);
	printf "[Connect from %s]\n", $hostinfo->name || $client->peerhost;
	if ($hostinfo->name !~ m/cmu.edu$/i && $hostinfo->name !~ m/^localhost$/i) {
		print $client "[Not from CMU.  Closing out connection]\n\n";
		print "[Not from CMU.  Closing out connection]\n\n";
		close($client);
		exit;
	}
	
	$morphcache{"।"} = "()";


	while (my $sent = <$client>) {
		print $sent,"\n";
		if ($sent eq "**EXIT**") {
			last;
		}
		$sent =~ s/"/'/g;
		$sent = normalizeText($sent);
		my $neSent = getNETags($sent);
		print "Shouting that ne sent is empty |$neSent| |$sent|\n" if($neSent eq '');
		my @words = split /\s+/,$sent;
		my @neWords = split /\s+/, $neSent;
		my $fss = "";
		print "Morphcache has ",keys %morphcache," keys\n";
		for(my $i = 0; $i <= $#words; $i++){
			my $word = $words[$i];
			my $neword = $neWords[$i];
			my $spanstart = $i;
			my $spanend = $i+1;
		
			# Don't bother with empty lines
			if ($word =~ m/^\s*$/) {
				print STDERR "Something wrong ! |$word $i|\n";
				$morphcache{$word} = "()";
				$fss .= "( (spanstart $spanstart) (spanend $spanend) ) ";
				next;
			}
			
			print "\n* Processing \"$word\", length " . length($word) . "\n";
			# Keep a cache of results of previously seen words to speed things up
			if (defined($morphcache{$word})) {
				print "Found in cache: " . $morphcache{$word} . "\n";
				$fss .= "( (spanstart $spanstart) (spanend $spanend) $morphcache{$word} ) ";
				next;
			}
			
			# Handle punctuations
			if ($word =~ m/^[,\.;\?\!\(\)\-:"']+$/) {
# 				$word = '\"' if($word eq '"');
				$morphcache{$word} = "(pos PUNCT) (lex \"$word\")";
				$fss .= "( (spanstart $spanstart) (spanend $spanend) (pos PUNCT) (lex \"$word\") (trans \"$word\") ) ";
				print $fss,"\n";
				next;
			}
			# Convert numerals
			if ($word =~ m/^[١٢٣٤٥٦٧٨٩٠0123456789۰۱۲۳۴۵۶۷۸۹\.,]+$/) {
				my $cword = convertNumeral($word);
				$morphcache{$word} = "(pos CD) (lex \"$word\") (trans \"$cword\")";
				$fss .= "( (spanstart $spanstart) (spanend $spanend) (pos CD) (lex \"$word\") (trans \"$cword\") ) ";
				next;
			}
			if($neword =~ /__[BI]-(PER|ORG)$/){
				$neword =~ s/__[BI]-(PER|ORG)$//;
# 				$morphcache{$word} = "(pos NNP) (lex \"$neword\") (trans \"$neword\")";
				print "( (spanstart $spanstart) (spanend $spanend) (pos NNP) (lex \"$word\") (trans \"$neword\") )";
				$fss .= "( (spanstart $spanstart) (spanend $spanend) (pos NNP) (lex \"$neword\") ) ";
				next; # (trans \"$neword\")
			}
			else{
				my $posRef = getPOS($word);
# 				$morphcache{$word} = "(pos LEX) (lex \"$word\")";
# 				$fss .= "( ";
				print $word,"\n";
				foreach my $pos (@{$posRef}){
					if($pos eq 'V'){
						my $root = getVerbRoot($word);
						print "( (spanstart $spanstart) (spanend $spanend) (lex \"$root\") (pos $pos) )";
						$fss .= "( (spanstart $spanstart) (spanend $spanend) (lex \"$root\") (pos $pos) )";
						next;
					}
					print "( (spanstart $spanstart) (spanend $spanend) (lex \"$word\") (pos $pos) )";
					$fss .= "( (spanstart $spanstart) (spanend $spanend) (lex \"$word\") (pos $pos) )";
				}
# 				$fss .= ' ) ';
			}
		}
		$fss = lc($fss);
# 		print $fss,"\n";
		print $client $fss,"\n";
	}
	print "\n[Closing out connection]\n\n";
	close($client);
	exit;
}

close($server);

BEGIN{
sub getVerbRoot{
	my $word = shift;
	my $root = $word;
# 	$root = $1.'نا' if($word =~ /^(.+)نے$/);
# 	$root = $1.'نا' if($word =~ /^(.+)یا$/);
	return $root;
}
}


BEGIN{
my %dict = ();
my %posmap = (NOUN => 'N',OTHER => 'LEX',ADV => 'RB',PRON => 'PRP',VERB => 'V',FW => 'FW',ADJ => 'JJ',CONJ => 'CC',NUM => 'CD',WHQ => 'WH',POST => 'IN', DET => 'DET');

# open(PFILE,"</afs/cs/project/avenue-1/Avenue/Urdu-MT/data/mono-pos-lexicon") or die("Didn't find the pos lexicon\n");
open(PFILE,"</afs/cs/project/avenue-1/Avenue/Urdu-MT/data/pos.lex") or die("Didn't find the pos lexicon\n");
my %tags = ();
while(<PFILE>){
	chomp;
	$_ =~ s/[,\.]//g;
	my @tokens = split / \| /;
	my @ptags = split /\s+/,$tokens[1];
# 	print $tokens[0],"\n",@posmap{@ptags},"\n";
	push @{$dict{$tokens[0]}}, @posmap{@ptags};
	foreach my $tag (@ptags){
		$tags{$tag} = 1;
	}
}
close PFILE;

open(PFILE,"</afs/cs/project/avenue-1/Avenue/Urdu-MT/data/post") or die("Didn't find the post lexicon\n");
while (<PFILE>){
	chomp;
	my @tokens = split /\s+/;
	@{$dict{$tokens[0]}} = ('IN');
}
close PFILE;

open(PFILE,"</afs/cs/project/avenue-1/Avenue/Urdu-MT/data/cc") or die("Didn't find the cc lexicon\n");
while (<PFILE>){
	chomp;
	my @tokens = split /\s+/;
	@{$dict{$tokens[0]}} = ('CC');
}
close PFILE;

open(PFILE,"</afs/cs/project/avenue-1/Avenue/Urdu-MT/data/det") or die("Didn't find the det lexicon\n");
while (<PFILE>){
	chomp;
	my @tokens = split /\s+/;
	@{$dict{$tokens[0]}} = ('DET');
}
close PFILE;

# foreach (keys %tags){
# 	print $_,"\n";
# }

sub getPOS{
	my $word = shift;
# 	print $word,"\n";
	unless(defined $dict{$word}){
		my @arr = ('LEX');
		return \@arr;
	}
# 	print $word,"\n";
	return \@{$dict{$word}};
}
}

sub normalizeText{
	my $sent = shift;
	
# 	print $sent,"---\n";
	chomp($sent);
	$sent =~ s/[\r\n]*//;

	# Convert the punctuation
	$sent =~ s/\x{060C}/,/g;
	$sent =~ s/\x{061F}/?/g;
	$sent =~ s/\x{061B}/;/g;
	$sent =~ s/\x{06D4}/./g;
	$sent =~ s/\x{066A}/%/g;
	$sent =~ s/\x{066B}/./g;
	$sent =~ s/\x{066C}/,/g;
	
# 	$sent =~ s/([\-\)\(\?\|\.\'\,\"])/ $1 /g;

	$sent =~ s/([\)\(\?\|\"\!;])/ $1 /g;
	
	$sent =~ s/(\d)(\-)/$1 $2 /g;

	$sent =~ s/(\w)('s\s)/$1 $2/g;
	$sent =~ s/(\w)(')\s/$1 $2 /g;
	$sent =~ s/(^|\s)(')(\w)/$1 $2 $3/g;
	
# 	s/(\d+) , (000)/$1$2/g; # Undo earlier mistake in normalization
	$sent =~ s/([^0-9])([,\.])/$1 $2 /g;
	$sent =~ s/([,\.])([^0-9])/ $1 $2/g;

	$sent =~ s/^\s+//;
	$sent =~ s/\s+$//;
# 	print "|$sent|\n";
	return $sent;
}

BEGIN {
my $set = 'tune';
my %necache = ();
my $neFile = "necache.$set";
if(-e $neFile){
	print "Reading ne cache\n";
	open(CFILE,$neFile) or die("Couldn't read the cache file\n");
	while(<CFILE>){
		chomp;
		my $file = <CFILE>;
		chomp($file);
		$necache{$_} = $file if(-e $file);
	}
	close CFILE;
}
mkpath("/tmp/$set") unless(-d "/tmp/$set");

sub getNETags{
	my $sent = shift;
	unless (defined($necache{$sent})){
		(my $tmpfd, my $tmpName) = tempfile("neXXXXXXXXX",DIR => "/tmp/$set",SUFFIX => '.txt', UNLINK => '1');
		print $tmpfd $sent;
		close $tmpfd;
		my $outFile = $tmpName.'.out';
		$necache{$sent} = $outFile;
		open(CFILE,">>$neFile") or die("Couldn't open the necache file\n");
		print CFILE "$sent\n$outFile\n";
		close(CFILE);
		print "Executing /afs/cs/project/avenue-1/Avenue/Urdu-MT/code/ne-tagger/run_ner.sh < $tmpName > $outFile";
		`/afs/cs/project/avenue-1/Avenue/Urdu-MT/code/ne-tagger/run_ner.sh < $tmpName > $outFile 2> /dev/null`;
	}
	open(IN,$necache{$sent}) or die("Unable to open temp file\n");
	my $rsent = "";
	while(<IN>){
		print $_;
		chomp();
		$rsent = $_;#romanizeNames($_);
	}
	close IN;
	return $rsent;
}

END{
	print "Here\n";
	if(%necache){
	open(CFILE,">$neFile") or die("Couldn't open the necache file\n");
	foreach my $sent (keys %necache){
		print CFILE "$sent\n$necache{$sent}\n";
# 		print "$sent\n$necache{$sent}\n";
	}
	close(CFILE);
	}

}
}

BEGIN {
my $path = "/afs/cs/project/avenue-1/Avenue/Urdu-MT/code/romanize";
my $ruleFile = "$path/romanization-mappings.txt";
my %rules = ();
open(RFILE, "<:encoding(utf-8)",$ruleFile) or die("Couldn't open the file $ruleFile\n");

while(<RFILE>){
	chomp;
	(my $urd, my $eng) = split /\-/;
	$urd =~ s/_0x/}\\x{/g;
	$urd =~ s/^0x/\\x{/;
	$urd .= '}';
	$rules{$urd} = $eng;
}
close RFILE;
# print keys %rules,"\n";
my %dict = ();
sub romanizeNames{
	my $line = shift;
	
	my $rtext = "";
	$line =~ s/(\x{06D4})/ $1 /g; # seperate out urdu sentence marker
	my @words = split /\s+/,$line;
	foreach my $word (@words){
		my $ew = $word;
		unless($ew =~ /-PER$/){
			# For now throw away all the tags other than person names
			$ew =~ s/__[BI]-(.+)$//;
			$rtext .= "$ew ";
			next;
		}
		print "Going to romanize $ew\n";
# 		`/afs/cs/project/avenue-1/Avenue/Urdu-MT/code/romanize/romanize.pl -i `
# 		$ew =~ s/__[BI]-PER$//;
		
		unless (defined $dict{$ew} or $ew !~ /[\x{0600}-\x{06FF}]/){
			$ew =~ s/(.)\x{0651}/$1$1/g; #gemination
			
			$ew =~ s/[\x{06CC}\x{06D2}]\x{0670}$/a/; #word end yeh + superscript alef
			$ew =~ s/^[\x{06CC}\x{06D2}]/y/; #word initial big and small yeh form a consonant
			
			$ew =~ s/\x{0627}\x{064B}/an/g; # alef+do zabar changes to -an
			$ew =~ s/^\x{0648}/v/; # word initial vao forms a consonant
			
			$ew =~ s/[\x{064A}\x{0649}]/\x{06CC}/g; # all the yeh are same as farsi yeh except badi yeh
# Apply rules in decreasing order of context length
			foreach my $rule (sort {length($b) <=> length($a)} keys %rules){
				$ew =~ s/$rule/$rules{$rule}/g;
#				last if($ew !~ /[\x{0600}-\x{0647}\x{0649}-\x{06FF}]/);
			}
			
			$ew =~ s/([bptjhdkg])[\x{0647}\x{06C1}]/$1_h/g; # Aspirated characters are formed only by dochashmi heh and not the other heh
			$ew =~ s/[\x{0647}\x{06c1}]/h/g; # Rest of the cases
			
			$ew =~ s/\x{0648}\x{0648}/vo/g; # special case of two vao together
			$ew =~ s/\x{0648}N$/oN/g; # At the end of the word, followed by nasalization, vao has to be a vowel.
			$ew =~ s/\x{06CC}\x{0648}/yo/g; # following yeh, it is a vowel
			$ew =~ s/([aeiou])\x{0648}/$1v/g; # otherwise following a vowel (starting a syllable), voa is a consonant
			$ew =~ s/\x{0648}/o/g; # else it is a vowel
			
			$ew =~ s/([aeiou])\x{06CC}/$1y/g; # farsi yeh is used for y and I, so after a vowel, it will be y.
			$ew =~ s/\x{06CC}o/yo/g; # yeh followed by vao that is realized as a vowel is a consonant
			$ew =~ s/\x{06CC}/i/g; # other places, let us hope it is I.
							
			$ew =~ s/[\x{0654}\x{0621}]//g; # hamza is unproductive in remaining cases
			$ew =~ s/^(\w)/uc($1)/e;
			$dict{$word} = $ew;
		}
		if(defined $dict{$word}){
			$rtext .= "$dict{$word} ";
			print "$rtext\n";
		}
		else{
			$rtext .= "$ew ";
		}
	}
	return $rtext;
}
}

sub convertNumeral{
	my $cword = shift;
	
	$cword =~ s/[\x{06F0}\x{0660}]/0/g;
	$cword =~ s/[\x{06F1}\x{0661}]/1/g;
	$cword =~ s/[\x{06F2}\x{0662}]/2/g;
	$cword =~ s/[\x{06F3}\x{0663}]/3/g;
	$cword =~ s/[\x{06F4}\x{0664}]/4/g;
	$cword =~ s/[\x{06F5}\x{0665}]/5/g;
	$cword =~ s/[\x{06F6}\x{0666}]/6/g;
	$cword =~ s/[\x{06F7}\x{0667}]/7/g;
	$cword =~ s/[\x{06F8}\x{0668}]/8/g;
	$cword =~ s/[\x{06F9}\x{0669}]/9/g;
	
	return $cword;
}
