#!/usr/local/bin/perl

## Collect a verb morphology lexicon

use strict;
use warnings;

binmode(STDOUT,":utf8");

# Load the suffix score table from suff.prob and print it to STDOUT with the
# highest-scoring suffix first. Each line of the file is read as two
# whitespace-separated fields: the suffix and its score.
open(my $sfile, "<:encoding(utf8)", "suff.prob")
	or die("Couldn't load the suffix file suff.prob: $!\n");
my %vsuf = ();
while (my $line = <$sfile>) {
	chomp $line;
	# split ' ' discards leading whitespace, so an indented line still yields
	# the suffix as its first field instead of an empty string.
	my ($suff, $score) = split ' ', $line;
	# Skip blank or one-field lines: storing an undefined score would only
	# produce warnings in the numeric sort and a junk line in the output.
	next unless defined $score;
	$vsuf{$suff} = $score;
}
close $sfile;

# Descending by score; equal scores fall back to suffix order so that the
# listing is the same from run to run rather than following hash order.
foreach my $vs (sort { $vsuf{$b} <=> $vsuf{$a} or $a cmp $b } keys %vsuf) {
	print "$vs $vsuf{$vs}\n";
}

# NOTE(review): this unconditional exit makes every statement after it
# unreachable, so the script currently only re-sorts suff.prob. Presumably a
# deliberate short-circuit left in place -- confirm before removing.
exit;
# Read the corpus file named by the first command-line argument. The first
# whitespace-separated field of each line is taken as the token; tokens that
# contain an ASCII letter or digit are ignored. Every remaining token is
# recorded in %all, and those on lines matching the VERB pattern are also
# recorded in %vtokens. $count is the number of such VERB lines (not the
# number of distinct verb tokens).
my $fileName = shift;
die("Usage: $0 <tagged-corpus-file>\n") unless defined $fileName;

open(my $ifile, "<:encoding(utf8)", $fileName)
	or die("Couldn't open the file $fileName: $!\n");
my $count = 0;
my %all = ();
my %vtokens = ();
while (my $line = <$ifile>) {
	chomp $line;
	# split ' ' drops leading whitespace, so the first field is always a real
	# token rather than the empty string an indented line used to produce.
	my @tokens = split ' ', $line;
	# Blank lines carry no token; skipping them avoids undefined-value
	# warnings and an empty key being inserted into %all.
	next unless @tokens;
	next if ($tokens[0] =~ /[A-Za-z0-9]/);
	$all{$tokens[0]} = 1;
	next unless ($line =~ /\s+VERB\s*/);
	$count++;
	$vtokens{$tokens[0]} = 1;
}
close $ifile;
print STDERR "$count\n";

# Sizes of the root and suffix inventories, reported on STDERR.
my $numRoot = 0;
my $numSuffix = 0;

# Seed suffix inventory. The key is single-quoted on purpose: it is stored as
# the literal text \x{0646}\x{0627}, and the regex engine decodes those
# escapes into the two code points U+0646 U+0627 when the key is interpolated
# into the pattern below.
my %suffixes = ();
$suffixes{'\x{0646}\x{0627}'} = 1;

# Strip each seed suffix from the end of every verb token; whatever non-empty
# text precedes the suffix is counted as a candidate root.
my %roots = ();
for my $verb (keys %vtokens) {
	for my $seed (keys %suffixes) {
		if ($verb =~ /^(.+)$seed$/) {
			$roots{$1}++;
		}
	}
}
$numRoot = scalar(keys %roots);
print STDERR "Collected $numRoot roots..\n";

# Re-estimate suffix weights from the collected roots, in two passes.
# In each pass every verb token is compared against every root; when the
# token starts with the root and has at least one character left over, that
# remainder is a candidate suffix. A token spreads a total mass of 1 over its
# candidate suffixes in proportion to their current weights (a suffix not yet
# in %suffixes starts at weight 1). The masses accumulated per suffix are
# then normalised to sum to 1 and become the weights for the next pass.
# Finally every suffix is printed to STDOUT with its weight.
%suffixes = ();
foreach my $pass (1 .. 2) {
	my $loopc = 0;
	my %suffCount = ();
	foreach my $v (keys %vtokens) {
		# Progress indicator on STDERR every 1000 tokens.
		$loopc++;
		print STDERR $loopc, "\n" if ($loopc % 1000 == 0);
# 		last if($loopc == 5000);
		my %suff = ();
		my $total = 0;
		foreach my $r (keys %roots) {
			# Literal prefix comparison. Roots are raw corpus text, so they
			# must not be interpolated into a regex: a metacharacter in a
			# root would change the match or abort pattern compilation.
			next unless (length($v) > length($r)
				&& substr($v, 0, length($r)) eq $r);
			my $ending = substr($v, length($r));
			$suffixes{$ending} = 1 unless (defined $suffixes{$ending});
			$suff{$ending} = $suffixes{$ending};
			$total += $suff{$ending};
# 		last;
		}
		# %suff is empty when no root matched, so $total is never used as a
		# divisor while it is still zero.
		foreach my $ending (keys %suff) {
			$suffCount{$ending} += $suff{$ending} / $total;
		}
	}
# $numSuffix = keys %suffixes;
# print STDERR "Collected $numSuffix suffixes..\n";

# foreach my $s (keys %suffixes){
# 	delete $suffixes{$s} if($suffixes{$s} < 5);
# }
# $numSuffix = keys %suffixes;
# print STDERR "Left with $numSuffix suffixes..\n";
	# Normalise the accumulated masses so that they sum to 1.
	my $total = 0;
	foreach my $s (keys %suffCount) {
		$total += $suffCount{$s};
	}
	foreach my $s (keys %suffCount) {
		$suffCount{$s} = $suffCount{$s} / $total;
	}
	%suffixes = %suffCount;
}

# Emit "<suffix> <weight>" lines, in hash order.
foreach my $s (keys %suffixes) {
	print "$s $suffixes{$s}\n";
}
# %roots = ();
# $loopc = 0;
# foreach my $a (keys %vtokens){
# 	$loopc++;print STDERR $loopc,"\n" if($loopc % 2000 == 0);
# 	foreach my $s (keys %suffixes){
# 		next unless($a =~ /.+$s$/);
# 		$a =~ /^(.+)$s$/;
# 		$roots{$1}++;
# # 		last;
# 	}
# }
# $numRoot = keys %roots;
# print STDERR "Collected $numRoot roots..\n";
# 
# foreach my $r (keys %roots){
# 	delete $roots{$r} if($roots{$r} < 3);
# }
# $numRoot = keys %roots;
# print STDERR "Left with $numRoot roots..\n";
# 
# 
# %suffixes = ();
# $loopc = 0;
# foreach my $a (keys %vtokens){
# 	$loopc++;print STDERR $loopc,"\n" if($loopc % 2000 == 0);
# 	foreach my $r (keys %roots){
# 		next unless($a =~ /^$r.+/);
# 		$a =~ /^$r(.+)$/;
# 		push @{$suffixes{$1}}, $r;
# # 		last;
# 	}
# }
# $numSuffix = keys %suffixes;
# print STDERR "Collected $numSuffix suffixes..\n";
# 
# foreach my $s (keys %suffixes){
# 	delete $suffixes{$s} if($#{$suffixes{$s}} < 5);
# }
# $numSuffix = keys %suffixes;
# print STDERR "Left with $numSuffix suffixes..\n";

# %roots = ();
# $loopc = 0;
# foreach my $a (keys %vtokens){
# 	$loopc++;print STDERR $loopc,"\n" if($loopc % 5000 == 0);
# 	foreach my $s (keys %suffixes){
# 		next unless($a =~ /.+$s$/);
# 		$a =~ /^(.+)$s$/;
# 		$roots{$1}++;
# 		last;
# 	}
# }
# $numRoot = keys %roots;
# print STDERR "Collected $numRoot roots..\n";
# 
# foreach my $r (keys %roots){
# 	delete $roots{$r} if($roots{$r} < 2);
# }
# $numRoot = keys %roots;
# print STDERR "Left with $numRoot roots..\n";

# print STDERR "Printing Suffixes and Roots";
# 
# foreach my $s (sort {$#{$suffixes{$b}} <=> $#{$suffixes{$a}}} keys %suffixes){
# 	print "$s ",join(' ',@{$suffixes{$s}}),"\n";
# }
# print "\n\n";
# foreach my $r (sort {$roots{$b} <=> $roots{$a}} keys %roots){
# 	print "$r $roots{$r}\n";
# }
