#!/usr/local/bin/perl

## Collect a lexicon of words with their POS tag frequencies from a directory of POS-tagged (*.tagged) files

use strict;
use warnings;

# The lexicon is printed to STDOUT; mark the handle as UTF-8 so wide
# characters are written without "Wide character" warnings.
binmode(STDOUT,":utf8");

# Positional command-line arguments:
#   $dir    - directory that is scanned for *.tagged files (required)
#   $cat    - referenced only by the commented-out per-category report below
#   $cutoff - referenced only by a commented-out early-exit check in the read loop
my $dir = shift;
my $cat = shift;
my $cutoff = shift;

# Fail early with a usage hint instead of falling through to opendir(undef).
die("Usage: $0 <dir> [cat] [cutoff]\n") unless (defined $dir and length $dir);

my %lexicon = ();	# word => { tag => number of occurrences }
my $count = 0;		# number of .tagged files processed so far

# Read every *.tagged file in $dir and accumulate, for each cleaned word, how
# often it occurs with each tag: $lexicon{$word}{$tag} = count.
# Each non-blank input line is treated as "<word> <tag>" separated by
# whitespace; any further fields on the line are ignored.
opendir(my $dir_handle, $dir) or die("Failed to open the urdu Dir $dir: $!\n");
while(defined(my $file = readdir($dir_handle))){
	next unless($file =~ /\.tagged$/);
	open(my $tagged_fh, "<:encoding(utf-8)", "$dir/$file")
		or die("Couldn't open file $dir/$file: $!\n");
	while(my $line = <$tagged_fh>){
		chomp $line;
		# split ' ' ignores leading whitespace, so an indented line still
		# yields (word, tag) instead of ('', word).
		my ($word, $tag) = split ' ', $line;
		# Skips blank lines as well as lines that carry no tag.
		next unless(defined $word and defined $tag);
		# Clean out digits and punctuation (ASCII and Arabic-script marks)
		$word =~ s/[0123456789\+\-\,*\(\)\/%=<>#"'\?;\x{060C}\x{061B}\x{061F}\x{06D4}]//g;
		$word =~ s/[\x{2026}\.\x{2018}\x{2019}\x{201A}\x{201B}]//g; # ellipsis and directional quotes
		# Normalize a word-final U+064A / U+0649 to U+06CC
		$word =~ s/[\x{064A}\x{0649}]$/\x{06CC}/;
		# A token made up only of stripped characters would otherwise be
		# counted under an empty-string word.
		next unless(length $word);
		$lexicon{$word}{$tag}++;
	}
	close $tagged_fh;
	$count++;
	# Progress indicator: number of files processed, every 1000 files.
	print STDERR $count,"\n" if($count % 1000 == 0);
# 	last if($count % $cutoff == 0);
}
closedir($dir_handle);

# # Find all the words tagged with a closed class tag
# my @words_tagged_as_cat = grep { defined( ${$lexicon{$_}}{$cat} ) } keys %lexicon;
# # my @words_tagged_as_DET = grep { defined( ${$lexicon{$_}}{'DET'} ) } keys %lexicon;
# 
# foreach (sort {${$lexicon{$b}}{$cat} <=> ${$lexicon{$a}}{$cat}} @words_tagged_as_cat){
# 	my $max = 0;
# 	foreach my $tag (keys %{$lexicon{$_}}){
# 		$max = ${$lexicon{$_}}{$tag} if(${$lexicon{$_}}{$tag} > $max);
# 	}
# 	print "$_ ${$lexicon{$_}}{$cat} $cat\n" if (${$lexicon{$_}}{$cat} == $max and $max > 10);
# }


# Print the lexicon, one line per word in sorted order:
#   <word>[ <tag> <relative frequency> <total occurrences of word>]...
# A tag is listed when it accounts for more than 9% of the word's occurrences,
# or for more than 5% when the word occurs more than 20000 times. A word with
# no qualifying tag is still printed, on a line by itself.
foreach my $word (sort keys %lexicon){
	my $tag_counts = $lexicon{$word};

	my $total = 0;
	$total += $_ foreach (values %$tag_counts);

	print $word;
	# Sort the tags: raw hash key order differs from run to run, which would
	# make the output of two runs on the same data impossible to compare.
	foreach my $tag (sort keys %$tag_counts){
		my $ratio = $tag_counts->{$tag}/$total;
		next unless($ratio > .09 or ($ratio > .05 and $total > 20000));
		print " $tag ",$ratio,' ',$total;
	}
	print "\n";
}
