#!/usr/local/bin/perl

# Translation table from the POS label carried in the third input column to
# the category tag written into the emitted rules.  Labels that are missing
# from this table cause the corresponding entry to be skipped at output time.
my %posmap = (
    NOUN  => 'N',
    OTHER => 'LEX',
    ADV   => 'RB',
    PRON  => 'PRP',
    VERB  => 'V',
    FW    => 'FW',
    ADJ   => 'JJ',
    CONJ  => 'CC',
    NUM   => 'CD',
    WHQ   => 'WH',
    POST  => 'IN',
    DET   => 'DET',
);

# binmode(STDIN,":utf-8");
# my $probFile = shift;

# Positional command-line arguments:
#   1. path of the NNP table (two whitespace-separated columns per line)
#   2. optional mode; the value 'giza' switches the output to plain word pairs
my $nnpFile = shift;
my $mode    = shift;
# my %probs = ();
# open(PFILE,$probFile) or die("Couldn't open the prob file !\n");
# 
# while(<PFILE>){
# 	chomp;
# 	(my $urd, my $eng, my $prob) = split /\s+/;
# 	$eng =~ s/[\[\]\",\.;\\\(\):]//g;
# 	$probs{"$eng-$urd"} += $prob;
# 	$probs{"$eng-$urd"} = 1 if($probs{"$eng-$urd"} > 1 and $probs{"$eng-$urd"} < 1.0001);
# # 	print("Prob greater than 1 for $eng $urd ".$probs{"$eng-$urd"}."\n") if($probs{"$eng-$urd"} > 1);
# }
# close PFILE;

# Load the NNP table: each line holds a word and its romanised form,
# separated by whitespace.  %nnp maps word -> romanised form.
my %nnp = ();

# Three-argument open() with an undefined file name opens an anonymous
# temporary file instead of failing, so a missing argument is rejected here.
defined($nnpFile) or die("Couldn't open the nnp file ! (no file name given)\n");

# Explicit read mode keeps characters such as '>' or '|' in the file name from
# being interpreted as an open mode; $! reports why the open failed.
open(my $nnpFh, '<', $nnpFile) or die("Couldn't open the nnp file '$nnpFile': $!\n");
while (my $entry = <$nnpFh>) {
    chomp $entry;
    # split ' ' ignores leading whitespace, so an indented line no longer
    # produces an empty first field.
    my ($urd, $rom) = split ' ', $entry;
    # Lines with fewer than two fields carry no usable mapping.
    next unless defined $rom;
    $nnp{$urd} = $rom;
}
close $nnpFh;

# binmode(STDIN,":utf8");
# Count co-occurrences read from STDIN.
#
# Every line that does not contain a "__SENTENCE__ <token>" marker is expected
# to start with three whitespace-separated fields: target word, source word and
# POS label.  Lines with fewer than three fields are silently ignored.
#
#   %cooccur_count  "source____target____POS" -> number of lines with that triple
#   %srcCount       source word -> number of counted lines for that word, plus
#                   one extra for every distinct triple it appears in (the
#                   first sighting of a triple is counted twice)
my %cooccur_count = ();
my %srcCount = ();
while (my $line = <STDIN>) {
    next if $line =~ /__SENTENCE__ \S+/;

    # NOTE(review): punctuation is stripped from the whole line before it is
    # split into fields, so a field consisting only of these characters
    # vanishes and any later fields shift left.
    $line =~ s/[\[\]\",\.;\\\(\):+]//g;

    # Copy the captures into named variables right away instead of relying on
    # $1..$3 staying intact across the following statements.
    my ($tgt, $src, $pos) = $line =~ /(\S+)\s+(\S+)\s+(\S+)/
        or next;
    next if $tgt eq 'the';

    my $key = "${src}____${tgt}____${pos}";
    # A triple seen for the first time adds an extra unit to the source total.
    $srcCount{$src} += 1 unless defined $cooccur_count{$key};
    $cooccur_count{$key} += 1;
    $srcCount{$src} += 1;
}

# foreach (sort keys %cooccur_count){
# 	print "$_ $cooccur_count{$_}\n";
# }

# Emit the collected entries in sorted key order.
#
# In 'giza' mode every entry is printed as "source target".  Otherwise every
# entry whose POS label is known to %posmap becomes a rule block; NOUN entries
# whose source word is listed in %nnp and passes computeSim() additionally get
# an NNP rule (with a capitalised target) printed ahead of the ordinary rule.
my $keyCount = 1;    # running rule number shared by both rule kinds

# Prints one rule block and advances the rule number.  The score is computed
# at call time from the current value of $srcCount{$src}.
my $emitRule = sub {
    my ($label, $src, $rhs, $lex, $count) = @_;
    my $ruleScore = $count / $srcCount{$src};
    print "{giza-$label,$keyCount}\n"
        . "${label}::${label} |: [\"$src\"] -> [\"$rhs\"]\n"
        . "(\n\t(*score* $ruleScore) ;$count $srcCount{$src}\n"
        . "\t(*fullform*)\n"
        . "\t(X1::Y1)\n\t((x0 lex) = $lex)\n)\n\n";
    $keyCount++;
};

for my $key (sort keys %cooccur_count) {
    my ($src, $tgt, $pos) = $key =~ /^(\S+)____(\S+)____(\S+)$/
        or die "bad format in hash: $key.";
    my $count = $cooccur_count{$key};

    # Drop sources built only from these punctuation characters; the pattern
    # also matches the empty string.
    next if $src =~ /^[\-'\?\!\/#@ ]*$/;

    if ($mode eq 'giza') {
        print "$src $tgt\n";
        next;
    }

    my $tag = $posmap{$pos};
    if (!defined $tag) {
        print STDERR "next\n";
        next;
    }

    if ($pos eq 'NOUN' and defined($nnp{$src}) and computeSim($nnp{$src}, $tgt)) {
        # NOTE(review): this permanently enlarges the denominator for $src, so
        # rules for the same source printed later in sort order are scored
        # against the larger total while earlier ones were not - confirm that
        # this is intended.
        $srcCount{$src} += $count;
        my $name = ucfirst($tgt);
        $emitRule->('NNP', $src, $name, $name, $count);
    }

    # The ordinary rule is printed for every entry, including NNP matches.
    $emitRule->($tag, $src, $tgt, $src, $count);
}

# computeSim($u, $e)
#
# Rough string-similarity test between $u and $e.
#
# The letters a, i, e, o, u and '_' are removed from $u (only from $u); the
# length of the longest common subsequence (LCS) between what is left of $u
# and the unmodified $e is then computed character by character.
#
# Returns 1 when the LCS covers all, or all but one, of the remaining
# characters of $u, and 0 otherwise.  As a consequence, a $u with zero or one
# character left after the removal is accepted for any $e.
#
# The dynamic-programming table is a lexical array local to each call.
sub computeSim{
	my $u = shift;
	my $e = shift;

	$u =~ s/[aieou_]//g;

	my @uchar = split //,$u;
	my @echar = split //,$e;
	my $ulen = scalar @uchar;
	my $elen = scalar @echar;

	# $lcs[$i][$j] is the LCS length of the first $i characters of @uchar and
	# the first $j characters of @echar; row 0 and column 0 stand for the
	# empty prefix and are therefore 0.
	my @lcs = ([ (0) x ($elen + 1) ]);
	for my $i (1 .. $ulen){
		$lcs[$i][0] = 0;
		for my $j (1 .. $elen){
			my $match = ($uchar[$i-1] eq $echar[$j-1]) ? 1 : 0;
			$lcs[$i][$j] = max3($lcs[$i-1][$j-1] + $match, $lcs[$i][$j-1], $lcs[$i-1][$j]);
		}
	}

	# The LCS can never exceed $ulen, so ">= $ulen - 1" means "equal to $ulen
	# or to $ulen - 1", i.e. at most one character of $u is left unmatched.
	return 1 if($lcs[$ulen][$elen] >= $ulen - 1);
	return 0;
}

# max3($a1, $a2, $a3)
#
# Returns the numerically largest of its three arguments.  Comparisons use
# the numeric '>' operator; when values tie, the later argument is returned.
sub max3{
	my ($first, $second, $third) = @_;

	# Winner of the first two (the second one on a tie) ...
	my $leader = ($first > $second) ? $first : $second;

	# ... then measured against the third (the third one on a tie).
	return ($leader > $third) ? $leader : $third;
}