#!/usr/local/bin/perl
#
# Turns word-aligned triples read from STDIN into lexical transfer rules.
#
# Usage:  SCRIPT NNP_FILE [MODE] < TRIPLES
#
#   NNP_FILE  One entry per line: two whitespace-separated fields.  The first
#             is looked up against the rule's source word, the second is
#             compared with the rule's target word by computeSim().
#             (Presumably an Urdu word and its romanization, going by the
#             original variable names $urd / $rom -- confirm with the data.)
#   MODE      If exactly 'giza', only "SOURCE TARGET" pairs are printed.
#             Anything else (or nothing) prints full rule blocks.
#   STDIN     Lines with at least three whitespace-separated fields:
#             TARGET SOURCE POS.  Lines matching "__SENTENCE__ <token>" are
#             ignored, as are lines with fewer than three fields.
#
# No I/O layer is set on any handle; the binmode calls below were already
# disabled in the original and are kept only for reference.

use strict;
use warnings;

# Maps the part-of-speech label in the third input field to the category
# symbol used in the emitted rules.  Triples with an unlisted label are
# skipped (with "next" written to STDERR).
my %posmap = (
    NOUN  => 'N',
    OTHER => 'LEX',
    ADV   => 'RB',
    PRON  => 'PRP',
    VERB  => 'V',
    FW    => 'FW',
    ADJ   => 'JJ',
    CONJ  => 'CC',
    NUM   => 'CD',
    WHQ   => 'WH',
    POST  => 'IN',
    DET   => 'DET',
);

# binmode(STDIN,":utf-8");

# Disabled in the original: an optional probability table taken as the first
# command-line argument.  Kept for reference; enabling it would shift the
# positions of NNP_FILE and MODE.
#
# my $probFile = shift;
# my %probs = ();
# open(PFILE,$probFile) or die("Couldn't open the prob file !\n");
#
# while(<PFILE>){
#     chomp;
#     (my $urd, my $eng, my $prob) = split /\s+/;
#     $eng =~ s/[\[\]\",\.;\\:]//g;
#     $probs{"$eng-$urd"} += $prob;
#     $probs{"$eng-$urd"} = 1 if($probs{"$eng-$urd"} > 1 and $probs{"$eng-$urd"} < 1.0001);
# #   print("Prob greater than 1 for $eng $urd ".$probs{"$eng-$urd"}."\n") if($probs{"$eng-$urd"} > 1);
# }
# close PFILE;

# Entry point.
#   $nnp_file  path of the NNP table (required)
#   $mode      output mode; undefined is treated as the empty string
sub main {
    my ($nnp_file, $mode) = @_;
    $mode = '' unless defined $mode;

    my $nnp_for = _read_nnp_table($nnp_file);
    my ($count_of, $src_total) = _count_triples(\*STDIN);
    _emit_rules($count_of, $src_total, $nnp_for, $mode);
    return;
}

# Reads the NNP table into a hash (first field => second field) and returns
# a reference to it.  Dies if the file cannot be opened.
sub _read_nnp_table {
    my ($path) = @_;

    # A three-argument open with an undefined path opens an anonymous
    # temporary file instead of failing, so reject that case explicitly.
    die("Couldn't open the nnp file !\n") unless defined $path;
    open(my $fh, '<', $path) or die("Couldn't open the nnp file !\n");

    my %nnp_for;
    while (my $line = <$fh>) {
        # print STDERR $line;
        chomp $line;
        my ($urd, $rom) = split /\s+/, $line;
        next unless defined $urd;    # blank line: nothing to store
        # A one-field line stores undef, which later fails the defined()
        # test and so disables NNP handling for that word.
        $nnp_for{$urd} = $rom;
    }
    close $fh;

    return \%nnp_for;
}

# Counts the triples on the given input handle.
# Returns two hash references:
#   - "SOURCE____TARGET____POS" => number of occurrences of that triple
#   - SOURCE => denominator used later for the rule score
sub _count_triples {
    my ($in) = @_;

    my %count_of;
    my %src_total;

    # binmode(STDIN,":utf8");
    LINE:
    while (my $line = <$in>) {
        next LINE if $line =~ /__SENTENCE__ \S+/;

        $line =~ s/[\[\]\",\.;\\:+]//g;
        # $line =~ s/[\x{FEFF}\x{202B}\x{202C}\x{202A}]//g;

        # Lines without three fields are skipped silently:
        # die "bad format: $line";
        next LINE unless $line =~ /(\S+)\s+(\S+)\s+(\S+)/;
        my ($tgt, $src, $pos) = ($1, $2, $3);

        next LINE if $tgt eq 'the';

        my $key = "${src}____${tgt}____${pos}";

        # The first sighting of a triple adds one extra count to the source
        # total, so the total is (occurrences of SOURCE) + (distinct triples
        # seen with SOURCE).
        # NOTE(review): this looks like Witten-Bell-style smoothing of the
        # score denominator -- confirm it is intentional.
        $src_total{$src} += 1 unless defined $count_of{$key};
        $count_of{$key}  += 1;
        $src_total{$src} += 1;
    }

    return (\%count_of, \%src_total);
}

# Prints one entry per counted triple, in sorted key order.
#   $count_of   triple key => occurrence count
#   $src_total  SOURCE => score denominator (modified in place, see below)
#   $nnp_for    NNP table from _read_nnp_table()
#   $mode       'giza' prints bare pairs, anything else prints rule blocks
sub _emit_rules {
    my ($count_of, $src_total, $nnp_for, $mode) = @_;

    my $rule_id = 1;

    # while(($key, $count) = each(%cooccur_count)){
    KEY:
    for my $key (sort keys %{$count_of}) {
        my $count = $count_of->{$key};

        $key =~ /^(\S+)____(\S+)____(\S+)$/
            or die "bad format in hash: $key.";
        my ($src, $tgt, $pos) = ($1, $2, $3);

        # Skip source tokens made up only of these punctuation characters.
        next KEY if $src =~ /^[\-'\?\!\/#@ ]*$/;

        if ($mode eq 'giza') {
            print "$src $tgt\n";
            next KEY;
        }

        # my $probStr = "(*sgt* ".$probs{"$1-$2"}.")";
        # next if($probStr eq '(*sgt* )');
        # print "$1 $2 $3 $count\n";

        my $tag = $posmap{$pos};
        unless (defined $tag) {
            print STDERR "next\n";
            next KEY;
        }

        # A noun whose NNP-table entry resembles the target word also gets
        # a proper-noun rule with the target capitalised.
        if (    $pos eq 'NOUN'
            and defined $nnp_for->{$src}
            and computeSim($nnp_for->{$src}, $tgt))
        {
            # print STDERR "accept $nnp{$2},$1\n";
            # print STDERR "NNP\n";
            my $name = ucfirst($tgt);

            # This enlarges the denominator for the NNP rule, for the
            # ordinary rule printed right after it, and for every later
            # triple that shares this source word.
            $src_total->{$src} += $count;
            my $nnp_total = $src_total->{$src};
            my $nnp_score = $count / $nnp_total;

            print "{giza-NNP,$rule_id}\n";
            print "NNP::NNP |: [\"$src\"] -> [\"$name\"]\n";
            print "(\n\t(*score* $nnp_score) ;$count $nnp_total\n";
            print "\t(*fullform*)\n";
            print "\t(X1::Y1)\n\t((x0 lex) = $name)\n)\n\n";
            $rule_id++;
        }

        my $total = $src_total->{$src};
        my $score = $count / $total;

        print "{giza-$tag,$rule_id}\n";
        # Braces keep "$tag::" from being parsed as a package name.
        print "${tag}::${tag} |: [\"$src\"] -> [\"$tgt\"]\n";
        print "(\n\t(*score* $score) ;$count $total\n";
        print "\t(*fullform*)\n\t(X1::Y1)\n\t((x0 lex) = $src)\n)\n\n";
        # \t$probStr\n
        $rule_id++;
    }

    return;
}

# Decides whether $u resembles $e.
#
# The characters a, i, e, o, u and _ are removed from $u; the longest common
# subsequence (LCS) of what remains with $e is then computed.
#
# Returns 1 when the LCS covers all, or all but one, of the remaining
# characters of $u, and 0 otherwise.  Consequently a $u that reduces to zero
# or one character matches every $e.
sub computeSim {
    my ($u, $e) = @_;

    $u =~ s/[aieou_]//g;
    my @uchar = split //, $u;
    my @echar = split //, $e;

    # Rolling rows of the LCS table.  Index 0 is a zero border column, so
    # index $j + 1 holds the cell for $echar[$j].
    my @prev = (0) x (@echar + 1);
    for my $uc (@uchar) {
        my @cur = (0);
        for my $j (0 .. $#echar) {
            my $match = $uc eq $echar[$j] ? 1 : 0;
            push @cur, max3($prev[$j] + $match, $cur[$j], $prev[$j + 1]);
        }
        @prev = @cur;
    }

    my $lcs = $prev[-1];
    my $len = scalar @uchar;

    # print STDERR "$u $e $lcs $#uchar @uchar\n" if($e eq 'yasin');
    return ($lcs == $len || $lcs == $len - 1) ? 1 : 0;
}

# Returns the largest of three numbers.
sub max3 {
    my ($a1, $a2, $a3) = @_;

    my $best = $a1 > $a2 ? $a1 : $a2;
    return $best > $a3 ? $best : $a3;
}

# Run only when executed as a script, so the subs above can be loaded on
# their own (e.g. with require) without touching @ARGV or STDIN.
main(@ARGV) unless caller;

1;