#!/usr/local/bin/perl -w -- -*- coding: utf-8 -*- use IO::Socket::INET; use Net::hostent; use FileHandle; use Expect; use POSIX ":sys_wait_h"; $morphdir = "/shared/code/Hindi/hindi_chunker/morph"; $converterdir = "/shared/code/Hindi/Converters"; if (scalar(@ARGV) == 1 and $ARGV[0] =~ m/^\d+$/) { $serverport = $ARGV[0]; } else { $serverport = 3125; #$serverport = 3126; } %morphcache = (); $cachecount = 0; $usecache = 1; # Load the morph cache if ($usecache == 1) { open(MC, "morphcache.txt") or warn $!; $cachecount = 0; while (!eof(MC)) { $cachecount++; $word = ; $word =~ s/[\n\r]*$//; $morphfs = ; $morphfs =~ s/[\n\r]*$//; $morphcache{$word} = $morphfs; } close(MC); } %tagsetmap = ("v", "V", "v_a", "Aux", "Aux", "Aux", "RAHA", "RAHA", "v_n", "V_n", # Verbal noun "n", "N", "P", "Pron", "p", "Postp", "adj", "Adj", "Avy", "Poss", "D", "Adv", "sh_n", "Poss", # Shasti Noun "sh_P", "PossPron" # Shasti Pronoun ); %personmap = ("u", "1", "m", "2", "m_h", "2", "a", "3", "any", "(*OR* 1 2 3)" ); %casemap = ("0", "dir", "1", "obl", "any", "(*OR* dir obl)" ); %gendermap = ("m", "m", "f", "f", "any", "(*OR* m f)" ); %numbermap = ("s", "s", "p", "p", "any", "(*OR* s p)" ); %tammap = ("*yA*", "(aspect perf) (form part) ", "*yA1*", "(aspect perf) (form part) ", "*wA*", "(aspect imperf) (form part) ", "*nA*", "(form inf) ", "*future*", "(tense fut) (form fin) ", "*subj*", "(tense subj) (form fin) ", "*kara*", "(form prog) ", "*0*", "(form root) ", "*WA*", "(tense past) (form fin) ", "*hE*", "(tense pres) (form fin) ", "*Shashthi*", "(tam *Shashthi*) ", "*ne*", "(tam *ne*) ", "*se*", "(tam *se*) ", "*meM*", "(tam *meM*) ", "*ko*", "(tam *ko*) " ); %spellcorrect = ("hEMH\n", "hEM\n"); # %numbers = ("१", "1", # "२", "2", # "३", "3", # "४", "4", # "५", "5", # "६", "6", # "७", "7", # "८", "8", # "९", "9", # "०", "0" # ); sub getSpelling { my($orig) = shift; my($in, $variations); $orig =~ s/\n$//; $in = $orig; $in =~ s/([EeoOiI])M/$1z/g; if ($in ne $orig) { $variations .= "\t$in"; } $in = $orig; $in =~ s/([aeiouAEIOU])([eiI])$/$1y$2/g; # y insertion as in gae/gaye if ($in ne $orig) { $variations .= "\t$in"; } $in = $orig; $in =~ s/M([kKgG])/f$1/g; $in =~ s/M([cCjJ])/F$1/g; $in =~ s/M([tTdD])/N$1/g; $in =~ s/M([wWxX])/n$1/g; $in =~ s/M([pPbB])/m$1/g; if ($in ne $orig) { $variations .= "\t$in"; } $in = $orig; $in =~ s/f([kKgG])/M$1/g; $in =~ s/F([cCjJ])/M$1/g; $in =~ s/N([tTdD])/M$1/g; $in =~ s/n([wWxX])/M$1/g; $in =~ s/m([pPbB])/M$1/g; if ($in ne $orig) { $variations .= "\t$in"; } $in = $orig; $in =~ s/M/z/g; if ($in ne $orig) { $variations .= "\t$in"; } $in = $orig; $in =~ s/$/H/g; if ($in ne $orig) { $variations .= "\t$in"; } $in = $orig; $in =~ s/([kctwpKCTWPgjdxbGJDXBfFNnmyrlvsSRh])a$/$1/g; if ($in ne $orig) { $variations .= "\t$in"; } $in = $orig; $in =~ s/iyAM$/iyAz/; if ($in ne $orig) { $variations .= "\t$in"; } $in = $orig; $in =~ s/Q/q/g; if ($in ne $orig) { $variations .= "\t$in"; } $variations =~ s/^\t+//; return $variations; } sub ucs2utf8 { my($ucsstring) = @_; my($utf8str, $ucschar, $i, $bin1, $bin2, $bin3, $retval); my($byteorder) = "le"; $utf8str = ""; # Check for byte order mark $possbom = substr($ucsstring, 0, 2); $binchar = vec($possbom, 0, 16); my($start) = 0; if ($binchar == 0xfeff) { $byteorder = "le"; $start = 2; } elsif ($binchar == 0xfffe) { $byteorder = "be"; $start = 2; } #print "Byte order is $byteorder\n"; for ($i = $start; $i < length($ucsstring); $i+=2) { $ucschar = substr($ucsstring, $i, 2); # might need to check big/little endian if ($byteorder eq "be") { $ucschar = substr($ucschar, 1, 1) . substr($ucschar, 0, 1); } $binchar = vec($ucschar, 0, 16); if ($binchar <= 127) { $retval = pack("C", $binchar); } elsif ($binchar <= 2047) { $bin1 = ($binchar >> 6) | 0xC0; $bin2 = ($binchar & 0x3F) | 0x80; $retval = pack("C2", $bin1, $bin2); } else { $bin1 = ($binchar >> 12) | 0xE0; $bin2 = (($binchar & 0x0FFF) >> 6) | 0x80; $bin3 = ($binchar & 0x003F) | 0x80; $retval = pack("C*", $bin1, $bin2, $bin3); # #print "in 3 char version with $hexchar and $retval bin1 $bin1 bin2 $bin2 bin3 $bin3\n"; } $utf8str .= $retval; } return $utf8str; } sub utf82ucs { my($utfstring) = @_; my($unichar, $unival, $unistring, $i, $int1, $int2, $int3, $byte1, $byte2, $byte3); $i = 0; while ($i < length($utfstring)) { $byte1 = substr($utfstring, $i, 1); if (unpack("C", $byte1) <= 0x7F) { # 1 byte long (ASCII) $unichar = pack("C", 0x00) . $byte1; $i++; } elsif ((unpack("C", $byte1) & 0xE0) == 0xC0) { # 2 bytes long $byte2 = substr($utfstring, $i+1, 1); $int1 = unpack("C", $byte1) & 0x1F; $int1 <<= 0x06; $int2 = unpack("C", $byte2) & 0x3F; $unival = $int1 | $int2; $unichar = pack("CC", (0xFF00 & $unival) >> 8, (0x00FF & $unival)); $i += 2; } else { # 3 bytes long $byte2 = substr($utfstring, $i+1, 1); $byte3 = substr($utfstring, $i+2, 1); $int1 = 0x0F & unpack("C", $byte1); $int1 <<= 12; $int2 = 0x3F & unpack("C", $byte2); $int2 <<= 6; $int3 = 0x3F & unpack("C", $byte3); $unival = $int1 | $int2 | $int3; $unichar = pack("CC", (0xFF00 & $unival) >> 8, (0x00FF & $unival)); $i += 3; } $unistring .= $unichar; } $unistring; } sub initMorph { # Start all the encoding converters ($morph = Expect->spawn("$morphdir/anusAraka/hindi/morph/test/morph.out $morphdir/anusAraka/hindi/morph/test $morphdir/anusAraka/hindi/dict/dict_final $morphdir/anusAraka/hindi/test ULDWHF")) || die "Couldn't spawn morph, $!"; $morph->exp_stty('raw -echo'); ($uc2i8 = Expect->spawn("./uni2iscii.pl")) || die "Couldn't spawn ucs to iscii8, $!"; $uc2i8->exp_stty('raw -echo'); ($i82wx = Expect->spawn("$converterdir/i8_wx/d8_ra_wp_r.out")) || die "Couldn't spawn i8 to wx, $!"; $i82wx->exp_stty('raw -echo'); ($wx2i8 = Expect->spawn("$converterdir/wx_i8/ra_d8_wp_r.out")) || die "Couldn't spawn wx to i8, $!"; $wx2i8->exp_stty('raw -echo'); ($i82uc = Expect->spawn("$converterdir/i8_uc/iscii-to-unicode-ver2.out big_endian")) || die "Couldn't spawn i8 to utf16, $!"; $i82uc->exp_stty('raw -echo'); } $Expect::Log_Stdout = 0; $Expect::Multiline_Matching = 0; $| = 1; $server = IO::Socket::INET->new(LocalPort => $serverport, Type => SOCK_STREAM, Reuse => 1, Listen => 12) or die "Couldn't be a tcp server on port $serverport: $!\n"; print "Starting Hindi morphology server on port $serverport; $cachecount words cached\n\n"; #$SIG{CHLD} = 'IGNORE'; $SIG{CHLD} = sub { wait }; $morphinited = 0; while ($client = $server->accept()) { if ($kidpid = fork) { close $client; next; } defined($kidpid) or die "cannot fork: $!"; close $server; $hostinfo = gethostbyaddr($client->peeraddr); printf "[Connect from %s]\n", $hostinfo->name || $client->peerhost; if ($hostinfo->name !~ m/cmu.edu$/i && $hostinfo->name !~ m/^localhost$/i) { print $client "[Not from CMU. Closing out connection]\n\n"; print "[Not from CMU. Closing out connection]\n\n"; close($client); exit; } $morphcache{"।"} = "()"; while ($word = <$client>) { $word =~ s/[ \-\)\?\|\.\'\,\r\n]*$//; $word =~ s/^[\-\(\`\']*//; $word =~ s/^‘//; $word =~ s/’$//; if ($word eq "**EXIT**") { last; } # Don't bother with empty lines if ($word =~ m/^\s*$/) { $morphcache{$word} = "()"; print $client "()\n"; next; } print "\n* Processing \"$word\", length " . length($word) . "\n"; # Keep a cache of results of previously seen words to speed things up if (defined($morphcache{$word})) { print "Found in cache: " . $morphcache{$word} . "\n"; print $client $morphcache{$word} . "\n"; next; } # Don't bother with numbers if ($word =~ m/१|२|३|४|५|६|७|८|९|०/) { $morphcache{$word} = "((pos lex) (lex \"$word\"))"; print $client "((pos lex) (lex \"$word\"))\n"; next; } if ($morphinited == 0) { &initMorph(); $morphinited = 1; } $wxinput = 0; # If all alphabetic, assume it's in WX Roman, otherwise, assume UTF8 and convert to WX if ($word =~ m/^[a-zA-Z]+$/) { $wxword = $word . "\n"; $wxinput = 1; } else { # Check if in UTF-8 (or could be in UTF-8) $isutf8 = 1; for ($byteindex = 0; $byteindex < length($word); $byteindex++) { if (vec($word, $byteindex, 8) < 128) { $isutf8 = 0; } } if ($isutf8 == 0) { $morphcache{$word} = "((pos lex) (lex \"$word\"))"; print $client "((pos lex) (lex \"$word\"))\n"; next; } $wxinput = 0; $ucsword = utf82ucs("$word\n"); #print "UTF16 output $ucsword"; $uc2i8->clear_accum(); $uc2i8->send("$ucsword"); $uc2i8->expect(1, '-re', '^[^\n]*\r?\n'); $i8word = $uc2i8->exp_match(); #print "ISCII output $i8word"; $i82wx->clear_accum(); $i82wx->send($i8word); $i82wx->expect(1, '-re', '^[^\n]+\r?\n'); if (defined($i82wx->exp_match())) { $wxword = $i82wx->exp_match(); #print "WX is ", $i82wx->exp_match(); } else { $morphcache{$word} = "((pos lex) (lex \"$word\"))"; print $client "((pos lex) (lex \"$word\"))\n"; next; } } # Fix some common spelling concerns $wxword =~ s/Z//g; $wxword =~ s/\?//g; if ($wxword =~ m/^\s*$/) { $morphcache{$word} = "()"; print $client "()\n"; next; } if (defined($spellcorrect{$wxword})) { $wxword = $spellcorrect{$wxword}; } print "WX-Roman $wxword\n"; $morph->clear_accum(); $morph->send("$wxword"); $morph->expect(1, '-re', '^[^\n]*\r?\n'); #, '-re', 'ROOT.*/', '^\r?$'); `rm -f uword`; if (defined($morph->exp_match())) { $features = $morph->exp_match(); if ($features =~ m/^\s*$/) { # Erik's spelling correction port $variantlist = &getSpelling($wxword); @variants = split(/\t/, $variantlist); foreach $variant (@variants) { print "Trying spelling $variant\n"; $morph->clear_accum(); $features = ""; $morph->send("$variant\n"); $morph->expect(1, '-re', '^[^\n\r]*\r?\n'); `rm -f uword`; $features = $morph->exp_match(); if ($features !~ m/^\s*$/) { last; } } if ($features =~ m/^\s*$/) { print "No spelling match\n"; $morphcache{$word} = "((pos lex) (lex \"$word\"))"; print $client "((pos lex) (lex \"$word\"))\n"; next; } } print "Morph output is $features"; # Process to f-struct if ($features =~ m/^\s*$/) { $morphcache{$word} = "((pos lex) (lex \"$word\"))"; print $client "((pos lex) (lex \"$word\"))\n"; next; } else { (@fstrucs) = ($features =~ m/([^\/]+\/)/g); #print "Fstruc size is ", scalar(@fstrucs); $output = ""; foreach $fstruc (@fstrucs) { $fstruc =~ m/_([^\{]+)\{/; $root = $1; if ($fstruc =~ m/CAT:v/ && $fstruc =~ m/TAM:\*yA\*/ && $root eq "raha") { #$root = "nA"; $fstruc =~ s/CAT:v/CAT:RAHA/; } elsif ($fstruc =~ m/CAT:v/ && $root eq "hE") { $root = "honA"; $fstruc =~ s/CAT:v/CAT:Aux/; } elsif ($fstruc =~ m/CAT:v/) { $root .= "nA"; } $fstruc =~ m/\{([^\}]+)\}/; (@fvpairs) = split(/\s+/, $1); $output .= "( "; #$wxinput = 0; # used for testing # if ($wxinput) { # If in it came in in WX, send it back out in WX # $output .= "(lex $root) "; # } else { # Convert to ISCII8 $wx2i8->clear_accum(); $i8root = ""; #print "Converting WX $root to UTF8\n"; $wx2i8->send($root . "\n"); $wx2i8->expect(1, '-re', '^[^\n]*\r?\n'); $i8root = $wx2i8->exp_match(); #print "ISCII8 output $i8root\n"; # Convert to UTF16 $i82uc->clear_accum(); $ucroot = ""; $i82uc->send($i8root); $i82uc->expect(1, '-re', '^[^\n]*\r?\n'); $ucroot = $i82uc->exp_match(); #print "UTF16 output $ucroot"; # Convert root to UTF-8 $u8root = ucs2utf8($ucroot); #print "UTF8 output $u8root"; chomp($u8root); $output .= "(lex $u8root) (lexwx $root) "; # } foreach $fvpair (@fvpairs) { ($feat, $val) = split(/:/, $fvpair); if ($feat eq "CAT") { $feat = "pos"; $val = $tagsetmap{$val}; } elsif ($feat eq "number") { $val = $numbermap{$val}; $output .= "(agr ((num $val))) "; next; } elsif ($feat eq "gender") { $val = $gendermap{$val}; $output .= "(agr ((gen $val))) "; next; } elsif ($feat eq "person") { $val = $personmap{$val}; $output .= "(agr ((pers $val))) "; next; } elsif ($feat eq "case") { $val = $casemap{$val}; } elsif ($feat eq "TAM") { $output .= $tammap{$val}; next; } elsif ($feat eq "PDGM") { next; } $output .= "(\L$feat\E $val) "; } $output .= ") "; } print "Returning $output\n"; print $client "$output\n"; $morphcache{$word} = $output; } } else { print "Nothing from morphology\n"; $morphcache{$word} = "((pos lex) (lex \"$word\"))"; print $client "((pos lex) (lex \"$word\"))\n"; } } if ($morphinited == 1) { $morph->hard_close(); $uc2i8->hard_close(); $i82wx->hard_close(); $wx2i8->hard_close(); $i82uc->hard_close(); } print "\n[Closing out connection]\n\n"; close($client); exit; } close($server);