#!/usr/local/bin/perl -w -- -*- coding: utf-8 -*- use strict; use warnings; use IO::Socket::INET; use Net::hostent; use FileHandle; use File::Temp "tempfile"; use File::Path; use POSIX ":sys_wait_h"; # $morphdir = "/shared/code/Hindi/hindi_chunker/morph"; # $converterdir = "/shared/code/Hindi/Converters"; my $serverport = 3125; $serverport = $ARGV[0] if ($ARGV[0] =~ m/^\d+$/); my %morphcache = (); my $cachecount = 0; my $usecache = 1; # Load the morph cache if ($usecache == 1) { open(MC, "morphcache.txt") or warn $!; $cachecount = 0; while (!eof(MC)) { $cachecount++; my $word = ; $word =~ s/[\n\r]*$//; my $morphfs = ; $morphfs =~ s/[\n\r]*$//; $morphcache{$word} = $morphfs; } close(MC); } $| = 1; my $server = IO::Socket::INET->new(LocalPort => $serverport, Type => SOCK_STREAM, Reuse => 1, Listen => 12) or die "Couldn't be a tcp server on port $serverport: $!\n"; print "Starting Hindi morphology server on port $serverport; $cachecount words cached\n\n"; #$SIG{CHLD} = 'IGNORE'; $SIG{CHLD} = sub { wait }; while (my $client = $server->accept()) { my $kidpid = fork; if ($kidpid) { close $client; next; } defined($kidpid) or die "cannot fork: $!"; close $server; my $hostinfo = gethostbyaddr($client->peeraddr); printf "[Connect from %s]\n", $hostinfo->name || $client->peerhost; if ($hostinfo->name !~ m/cmu.edu$/i && $hostinfo->name !~ m/^localhost$/i) { print $client "[Not from CMU. Closing out connection]\n\n"; print "[Not from CMU. Closing out connection]\n\n"; close($client); exit; } $morphcache{"।"} = "()"; while (my $sent = <$client>) { print $sent,"\n"; if ($sent eq "**EXIT**") { last; } $sent =~ s/"/'/g; $sent = normalizeText($sent); my $neSent = getNETags($sent); print "Shouting that ne sent is empty |$neSent| |$sent|\n" if($neSent eq ''); my @words = split /\s+/,$sent; my @neWords = split /\s+/, $neSent; my $fss = ""; print "Morphcache has ",keys %morphcache," keys\n"; for(my $i = 0; $i <= $#words; $i++){ my $word = $words[$i]; my $neword = $neWords[$i]; my $spanstart = $i; my $spanend = $i+1; # Don't bother with empty lines if ($word =~ m/^\s*$/) { print STDERR "Something wrong ! |$word $i|\n"; $morphcache{$word} = "()"; $fss .= "( (spanstart $spanstart) (spanend $spanend) ) "; next; } print "\n* Processing \"$word\", length " . length($word) . "\n"; # Keep a cache of results of previously seen words to speed things up if (defined($morphcache{$word})) { print "Found in cache: " . $morphcache{$word} . "\n"; $fss .= "( (spanstart $spanstart) (spanend $spanend) $morphcache{$word} ) "; next; } # Handle punctuations if ($word =~ m/^[,\.;\?\!\-:"']+$/) { # $word = '\"' if($word eq '"'); $morphcache{$word} = "(pos PUNCT) (lex \"$word\")"; $fss .= "( (spanstart $spanstart) (spanend $spanend) (pos PUNCT) (lex \"$word\") (trans \"$word\") ) "; print $fss,"\n"; next; } # Convert numerals if ($word =~ m/^[١٢٣٤٥٦٧٨٩٠0123456789۰۱۲۳۴۵۶۷۸۹\.,]+$/) { my $cword = convertNumeral($word); $morphcache{$word} = "(pos CD) (lex \"$word\") (trans \"$cword\")"; $fss .= "( (spanstart $spanstart) (spanend $spanend) (pos CD) (lex \"$word\") (trans \"$cword\") ) "; next; } if($neword =~ /__[BI]-(PER|ORG)$/){ $neword =~ s/__[BI]-(PER|ORG)$//; # $morphcache{$word} = "(pos NNP) (lex \"$neword\") (trans \"$neword\")"; print "( (spanstart $spanstart) (spanend $spanend) (pos NNP) (lex \"$word\") (trans \"$neword\") )"; $fss .= "( (spanstart $spanstart) (spanend $spanend) (pos NNP) (lex \"$neword\") ) "; next; # (trans \"$neword\") } else{ my $posRef = getPOS($word); # $morphcache{$word} = "(pos LEX) (lex \"$word\")"; # $fss .= "( "; print $word,"\n"; foreach my $pos (@{$posRef}){ if($pos eq 'V'){ my $root = getVerbRoot($word); print "( (spanstart $spanstart) (spanend $spanend) (lex \"$root\") (pos $pos) )"; $fss .= "( (spanstart $spanstart) (spanend $spanend) (lex \"$root\") (pos $pos) )"; next; } print "( (spanstart $spanstart) (spanend $spanend) (lex \"$word\") (pos $pos) )"; $fss .= "( (spanstart $spanstart) (spanend $spanend) (lex \"$word\") (pos $pos) )"; } # $fss .= ' ) '; } } $fss = lc($fss); # print $fss,"\n"; print $client $fss,"\n"; } print "\n[Closing out connection]\n\n"; close($client); exit; } close($server); BEGIN{ sub getVerbRoot{ my $word = shift; my $root = $word; # $root = $1.'نا' if($word =~ /^(.+)نے$/); # $root = $1.'نا' if($word =~ /^(.+)یا$/); return $root; } } BEGIN{ my %dict = (); my %posmap = (NOUN => 'N',OTHER => 'LEX',ADV => 'RB',PRON => 'PRP',VERB => 'V',FW => 'FW',ADJ => 'JJ',CONJ => 'CC',NUM => 'CD',WHQ => 'WH',POST => 'IN', DET => 'DET'); # open(PFILE,"){ chomp; $_ =~ s/[,\.]//g; my @tokens = split / \| /; my @ptags = split /\s+/,$tokens[1]; # print $tokens[0],"\n",@posmap{@ptags},"\n"; push @{$dict{$tokens[0]}}, @posmap{@ptags}; foreach my $tag (@ptags){ $tags{$tag} = 1; } } close PFILE; open(PFILE,"){ chomp; my @tokens = split /\s+/; @{$dict{$tokens[0]}} = ('IN'); } close PFILE; open(PFILE,"){ chomp; my @tokens = split /\s+/; @{$dict{$tokens[0]}} = ('CC'); } close PFILE; open(PFILE,"){ chomp; my @tokens = split /\s+/; @{$dict{$tokens[0]}} = ('DET'); } close PFILE; # foreach (keys %tags){ # print $_,"\n"; # } sub getPOS{ my $word = shift; # print $word,"\n"; unless(defined $dict{$word}){ my @arr = ('LEX'); return \@arr; } # print $word,"\n"; return \@{$dict{$word}}; } } sub normalizeText{ my $sent = shift; # print $sent,"---\n"; chomp($sent); $sent =~ s/[\r\n]*//; # Convert the punctuation $sent =~ s/\x{060C}/,/g; $sent =~ s/\x{061F}/?/g; $sent =~ s/\x{061B}/;/g; $sent =~ s/\x{06D4}/./g; $sent =~ s/\x{066A}/%/g; $sent =~ s/\x{066B}/./g; $sent =~ s/\x{066C}/,/g; # $sent =~ s/([\-\)$\?\|\.\'\,\"])/ $1 /g; $sent =~ s/([$\(\?\|\"\!;])/ $1 /g; $sent =~ s/(\d)(\-)/$1 $2 /g; $sent =~ s/(\w)('s\s)/$1 $2/g; $sent =~ s/(\w)(')\s/$1 $2 /g; $sent =~ s/(^|\s)(')(\w)/$1 $2 $3/g; # s/(\d+) , (000)/$1$2/g; # Undo earlier mistake in normalization $sent =~ s/([^0-9])([,\.])/$1 $2 /g; $sent =~ s/([,\.])([^0-9])/ $1 $2/g; $sent =~ s/^\s+//; $sent =~ s/\s+$//; # print "|$sent|\n"; return $sent; } BEGIN { my $set = 'tune'; my %necache = (); my $neFile = "necache.$set"; if(-e $neFile){ print "Reading ne cache\n"; open(CFILE,$neFile) or die("Couldn't read the cache file\n"); while(){ chomp; my $file = ; chomp($file); $necache{$_} = $file if(-e $file); } close CFILE; } mkpath("/tmp/$set") unless(-d "/tmp/$set"); sub getNETags{ my $sent = shift; unless (defined($necache{$sent})){ (my $tmpfd, my $tmpName) = tempfile("neXXXXXXXXX",DIR => "/tmp/$set",SUFFIX => '.txt', UNLINK => '1'); print $tmpfd $sent; close $tmpfd; my $outFile = $tmpName.'.out'; $necache{$sent} = $outFile; open(CFILE,">>$neFile") or die("Couldn't open the necache file\n"); print CFILE "$sent\n$outFile\n"; close(CFILE); print "Executing /afs/cs/project/avenue-1/Avenue/Urdu-MT/code/ne-tagger/run_ner.sh < $tmpName > $outFile"; `/afs/cs/project/avenue-1/Avenue/Urdu-MT/code/ne-tagger/run_ner.sh < $tmpName > $outFile 2> /dev/null`; } open(IN,$necache{$sent}) or die("Unable to open temp file\n"); my $rsent = ""; while(){ print $_; chomp(); $rsent = $_;#romanizeNames($_); } close IN; return $rsent; } END{ print "Here\n"; if(%necache){ open(CFILE,">$neFile") or die("Couldn't open the necache file\n"); foreach my $sent (keys %necache){ print CFILE "$sent\n$necache{$sent}\n"; # print "$sent\n$necache{$sent}\n"; } close(CFILE); } } } BEGIN { my $path = "/afs/cs/project/avenue-1/Avenue/Urdu-MT/code/romanize"; my $ruleFile = "$path/romanization-mappings.txt"; my %rules = (); open(RFILE, "<:encoding(utf-8)",$ruleFile) or die("Couldn't open the file $ruleFile\n"); while(){ chomp; (my $urd, my $eng) = split /\-/; $urd =~ s/_0x/}\\x{/g; $urd =~ s/^0x/\\x{/; $urd .= '}'; $rules{$urd} = $eng; } close RFILE; # print keys %rules,"\n"; my %dict = (); sub romanizeNames{ my $line = shift; my $rtext = ""; $line =~ s/(\x{06D4})/ $1 /g; # seperate out urdu sentence marker my @words = split /\s+/,$line; foreach my $word (@words){ my $ew = $word; unless($ew =~ /-PER$/){ # For now throw away all the tags other than person names $ew =~ s/__[BI]-(.+)$//; $rtext .= "$ew "; next; } print "Going to romanize $ew\n"; # `/afs/cs/project/avenue-1/Avenue/Urdu-MT/code/romanize/romanize.pl -i ` # $ew =~ s/__[BI]-PER$//; unless (defined $dict{$ew} or $ew !~ /[\x{0600}-\x{06FF}]/){ $ew =~ s/(.)\x{0651}/$1$1/g; #gemination $ew =~ s/[\x{06CC}\x{06D2}]\x{0670}$/a/; #word end yeh + superscript alef $ew =~ s/^[\x{06CC}\x{06D2}]/y/; #word initial big and small yeh form a consonant $ew =~ s/\x{0627}\x{064B}/an/g; # alef+do zabar changes to -an $ew =~ s/^\x{0648}/v/; # word initial vao forms a consonant $ew =~ s/[\x{064A}\x{0649}]/\x{06CC}/g; # all the yeh are same as farsi yeh except badi yeh # Apply rules in decreasing order of context length foreach my $rule (sort {length($b) <=> length($a)} keys %rules){ $ew =~ s/$rule/$rules{$rule}/g; # last if($ew !~ /[\x{0600}-\x{0647}\x{0649}-\x{06FF}]/); } $ew =~ s/([bptjhdkg])[\x{0647}\x{06C1}]/$1_h/g; # Aspirated characters are formed only by dochashmi heh and not the other heh $ew =~ s/[\x{0647}\x{06c1}]/h/g; # Rest of the cases $ew =~ s/\x{0648}\x{0648}/vo/g; # special case of two vao together $ew =~ s/\x{0648}N$/oN/g; # At the end of the word, followed by nasalization, vao has to be a vowel. $ew =~ s/\x{06CC}\x{0648}/yo/g; # following yeh, it is a vowel $ew =~ s/([aeiou])\x{0648}/$1v/g; # otherwise following a vowel (starting a syllable), voa is a consonant $ew =~ s/\x{0648}/o/g; # else it is a vowel $ew =~ s/([aeiou])\x{06CC}/$1y/g; # farsi yeh is used for y and I, so after a vowel, it will be y. $ew =~ s/\x{06CC}o/yo/g; # yeh followed by vao that is realized as a vowel is a consonant $ew =~ s/\x{06CC}/i/g; # other places, let us hope it is I. $ew =~ s/[\x{0654}\x{0621}]//g; # hamza is unproductive in remaining cases $ew =~ s/^(\w)/uc($1)/e; $dict{$word} = $ew; } if(defined $dict{$word}){ $rtext .= "$dict{$word} "; print "$rtext\n"; } else{ $rtext .= "$ew "; } } return $rtext; } } sub convertNumeral{ my $cword = shift; $cword =~ s/[\x{06F0}\x{0660}]/0/g; $cword =~ s/[\x{06F1}\x{0661}]/1/g; $cword =~ s/[\x{06F2}\x{0662}]/2/g; $cword =~ s/[\x{06F3}\x{0663}]/3/g; $cword =~ s/[\x{06F4}\x{0664}]/4/g; $cword =~ s/[\x{06F5}\x{0665}]/5/g; $cword =~ s/[\x{06F6}\x{0666}]/6/g; $cword =~ s/[\x{06F7}\x{0667}]/7/g; $cword =~ s/[\x{06F8}\x{0668}]/8/g; $cword =~ s/[\x{06F9}\x{0669}]/9/g; return $cword; }