#!/usr/local/bin/perl

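## Cross-checks a lexicon file against a POS dictionary.
## Usage: <script> <lexicon-file> <pos-dictionary-file>
## (a suffix table is also read from "suff.prob" in the current directory).
## For lexicon words found in the POS dictionary, every entry tagged V is
## printed as "word translation root suffix"; words absent from the POS
## dictionary are printed together with their lexicon tags.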

use strict;
use warnings;

# All input files are read as UTF-8, so emit UTF-8 on STDOUT as well.
binmode(STDOUT,":utf8");

# Lexicon tag => POS-dictionary tag.
my %posmap = (
	N   => 'NOUN',
	RB  => 'ADV',
	PRP => 'PRON',
	V   => 'VERB',
	JJ  => 'ADJ',
	CC  => 'CONJ',
	CD  => 'NUM',
	WH  => 'WHQ',
	IN  => 'POST',
	DET => 'DET',
	LEX => 'OTHER',
);

# POS-dictionary tag => lexicon tag (the inverse of %posmap, plus FW => FW).
my %posmapP2L = (
	NOUN  => 'N',
	OTHER => 'LEX',
	ADV   => 'RB',
	PRON  => 'PRP',
	VERB  => 'V',
	FW    => 'FW',
	ADJ   => 'JJ',
	CONJ  => 'CC',
	NUM   => 'CD',
	WHQ   => 'WH',
	POST  => 'IN',
	DET   => 'DET',
);

# Command line: <lexicon-file> <pos-dictionary-file>
my $lexFile1 = shift;
# my $lexFile2 = shift;
my $posFile = shift;

# Fail early with a usage message rather than with uninitialized-value
# warnings and a confusing "couldn't open" error further down.
die("Usage: $0 <lexicon-file> <pos-dictionary-file>\n")
	unless(defined $lexFile1 and defined $posFile);

# %lexicon{source word}{target word} = [ lexicon tags ]; filled by loadLex().
my %lexicon = ();
loadLex($lexFile1,1);
# rewriteLex($lexFile1,1);
# loadLex($lexFile2,1000);

# Load the POS dictionary: one entry per line, a word followed by
# whitespace-separated fields.
#   %posDict  word => [ all fields that followed the word ]
#   %vDict    word => 1 when the fields contain the token VERB followed by
#             at least one more field
open(my $pfh,"<:encoding(utf8)" ,$posFile) or die("Couldn't open pos dictionary $posFile: $!\n");
my %posDict = ();
my %vDict = ();
while(my $line = <$pfh>){
	chomp $line;
	# split ' ' ignores leading whitespace, so an indented line is still
	# indexed under its word instead of under the empty string.
	my ($word, @fields) = split ' ', $line;
	# Blank lines carry no entry; skipping them avoids an undef hash key.
	next unless(defined $word);
	push @{$posDict{$word}}, @fields;
	$vDict{$word} = 1 if(join(' ',@fields) =~ /(^| )VERB /);
}
close $pfh;

# Walk every source word of the lexicon.
#  * Word present in the POS dictionary: for each entry tagged V, ask
#    verifyVerb() for the best suffix and print "word translation root suffix".
#    Entries with any other tag (including LEX) are passed over; an earlier
#    heuristic that re-tagged LEX entries from the POS dictionary is disabled.
#  * Word absent from the POS dictionary: print the word, each translation
#    and the lexicon tags recorded for that pair.
#
# NOTE(review): $mcount is never incremented (the tag-mismatch check that
# used to update it is disabled), so the summary line always reports 0.
my $mcount = 0;
foreach my $urd (keys %lexicon){
	if(defined $posDict{$urd}){
		foreach my $eng (keys %{$lexicon{$urd}}){
			foreach my $ptag (@{ $lexicon{$urd}{$eng} }){
				next unless($ptag eq 'V');
				my $suff = verifyVerb($urd);
				# verifyVerb() only returns a suffix the word ends with, so
				# the root is the word minus that many trailing characters.
				# Using substr avoids interpolating the suffix into a regex
				# and reading a capture from an unchecked match.
				my $root = substr($urd, 0, length($urd) - length($suff));
				print "$urd $eng $root $suff\n";
			}
		}
	}
	else{
		foreach my $eng (keys %{$lexicon{$urd}}){
			print $urd,' ',$eng,' ',join(' ',@{ $lexicon{$urd}{$eng} }),"\n";
		}
	}
}
print STDERR "Missing entries are $mcount\n";

# The suffix table and verifyVerb() live in a BEGIN block so that the table is
# loaded at compile time, before any run-time statement calls verifyVerb().
# %vsuf, @ranked and %cache are private to this block and shared with the sub.
BEGIN {
open(my $sfh,"<:encoding(utf8)","suff.prob") or die("Couldn't load the suffix file: $!\n");
# suffix => score.  The empty suffix is always a candidate, with score 1.
my %vsuf = ('' => 1);
while(my $line = <$sfh>){
	chomp $line;
	my ($suff, $score) = split ' ', $line;
	# Blank or one-field lines have nothing usable; skip them instead of
	# comparing undefined values.
	next unless(defined $suff and defined $score);
	# Entries containing '[' and entries scoring below .001 are ignored.
	next if($suff =~ /\[/ or $score < .001);
	$vsuf{$suff} = $score;
}
close $sfh;
my $ssize = keys %vsuf;
print "Loaded $ssize suffixes\n";

# Candidate suffixes, highest score first, computed once instead of on every
# call.  Equal scores are ordered by string so results do not depend on hash
# ordering.
my @ranked = sort { $vsuf{$b} <=> $vsuf{$a} or $a cmp $b } keys %vsuf;

# word => suffix chosen for it (memoises verifyVerb).
my %cache = ();

# verifyVerb($word)
#
# Returns the known suffix that best explains $word as an inflected verb.
# For every suffix the word ends with (leaving a non-empty root), the root is
# combined with each *other* known suffix and the combinations that are
# flagged in %vDict (filled from the POS dictionary) are counted.  The suffix
# with the highest count wins; on equal counts the earlier (higher scored)
# suffix is kept.  Returns '' when no candidate reaches a count above zero.
sub verifyVerb{
	my $v = shift;
	unless(defined $cache{$v}){
		my $max = 0;
		my $maxs = '';
		foreach my $vs (@ranked){
			# Suffixes are literal strings (they are concatenated as-is
			# below), so quote them when they go into a pattern.
			next unless($v =~ /^(.+)\Q$vs\E$/);
			my $vroot = $1;
			# Count sibling forms root+suffix that are known verbs.
			my $vscore = grep { $_ ne $vs and defined($vDict{$vroot.$_}) } @ranked;
			if($vscore > $max){
				$max = $vscore;
				$maxs = $vs;
			}
		}
		$cache{$v} = $maxs;
	}
	return $cache{$v};
}
}

# loadLex($fileName, $inc)
#
# Reads the UTF-8 lexicon file $fileName and records its entries in the
# file-level %lexicon hash as
#     $lexicon{source}{target} = [ tag, ... ]
# Only lines of the form
#     TAG::NAME |: ["source"] -> ["target"]
# are used.  Other lines are skipped; those that contain "|:" are reported on
# STDOUT as unmatched.  Entries whose source or target still contains a double
# quote (multi-word entries) are skipped silently.  A "Duplicate" notice is
# printed whenever a source/target pair ends up with more than one tag.
#
# $inc is accepted for compatibility with existing callers but is not used.
# Dies if the file cannot be opened.
sub loadLex{

	my $fileName = shift;
	my $inc = shift;
	open(my $ifh,"<:encoding(utf8)" ,$fileName) or die("Couldn't open the file $fileName: $!\n");
	while(my $line = <$ifh>){
		chomp $line;
		# Copy the captures into named variables right away so that the
		# later pattern matches cannot disturb them.
		my ($tag, $name, $src, $tgt) =
			$line =~ /^(.+)::(.+) \|: \[\"(.+)\"\] -> \[\"(.+)\"\]$/;
		unless(defined $tag){
			print "Unmatched verb $line\n" if($line =~ /\|:/);
			next;
		}
		# A remaining quote means several quoted words on one side.
		next if($src =~ /"/ or $tgt =~ /"/);
		push @{ $lexicon{$src}{$tgt} }, $tag;
		print "Duplicate\n $tag $name $src $tgt\n" if(@{ $lexicon{$src}{$tgt} } > 1);
	}
	close $ifh;
}

