#!/usr/local/bin/perl

## Stem NOUN/VERB tokens of a word_TAG corpus and print the frequent root-TAG pairs

use strict;
use warnings;

# Path of the tagged corpus, taken from the first command-line argument.
# Each input line is expected to hold whitespace-separated tokens of the
# form word_TAG.
my $fileName = shift;
defined $fileName or die("Usage: $0 <tagged-corpus-file>\n");

# Frequency tables filled while reading the corpus.
my %originalCount = ();   # surface word      -> number of occurrences
my %stemmedCount = ();    # stemmed root      -> number of occurrences
my %tagCounts = ();       # "root-TAG" string -> number of occurrences

# Three-argument open with a lexical handle: the file name can never be
# interpreted as an open mode, and $! tells the user why the open failed.
open(my $ifile, '<', $fileName) or die("Couldn't open the file $fileName: $!\n");
while (my $line = <$ifile>) {
	chomp $line;
	# split ' ' skips leading whitespace, so an indented line does not
	# produce an empty first token.
	for my $token (split ' ', $line) {
		my ($word, $tag) = split /_/, $token;
		# Tokens that carry no word at all (e.g. "_" or "_TAG") are ignored.
		next unless defined $word and length $word;
		# An untagged token is counted under the empty tag ("root-").
		$tag = '' unless defined $tag;

		$originalCount{$word}++;

		# NOTE(review): no utf8 pragma or encoding layer is in view, so the
		# patterns below and the input are presumably both matched as raw
		# bytes; that works only while the corpus uses the same encoding as
		# this source file -- confirm.
		my $root = $word;
		if ($tag eq 'NOUN') {
			$root = $1 if ($word =~ /(.+)(و|ے|ی)ں$/);
		}
		elsif ($tag eq 'VERB') {
			# The rules run in order and every matching rule overwrites
			# $root, so the LAST matching rule wins. Do not reorder.
			$root = 'گا' if($word =~ /^گ(ے|ی)$/);
			$root = $1 if($word =~ /(.+)(ت|ن)(ے|ی|ا)$/);  # ta te ti na ne ni
			$root = $1 if($word =~ /(.+)ئ(ے|ی)$/); # ye yi
			$root = $1 if($word =~ /(.+)یے$/); # ye
			$root = $1 if($word =~ /(.+)یا$/); # ya
			$root = $1 if($word =~ /(.+)ئیں$/); # yiN
			$root = $1 if($word =~ /(.+)ئیے$/);#yie
			$root = $1 if($word =~ /(.+)سک(ت)(ے|ی|ا)$/); # skte skti skta
			$root = 'تھا' if($word =~ /^تھی$/);
			$root = 'تھے' if($word =~ /^تھیں$/);
		}

		$stemmedCount{$root}++;
		$tagCounts{"$root-$tag"}++;
# 		print $root.' ';
	}
# 	print "\n";
}
close $ifile;

# my $originalLex = keys %originalCount;
# my $stemmedLex = keys %stemmedCount;
# 
# print STDERR "$originalLex $stemmedLex\n";

# foreach (sort keys %stemmedCount){
# 	print "$_ $stemmedCount{$_} $originalCount{$_}\n" if(defined $originalCount{$_} and ($stemmedCount{$_} != $originalCount{$_}));
# }

# Print every "root-TAG" pair that occurred at least 1000 times, one per
# line, most frequent first. Rare pairs are filtered out before sorting, and
# pairs with equal counts are ordered by their key so the output is the same
# on every run (hash key order alone is not stable between runs).
foreach my $pair (
	sort { $tagCounts{$b} <=> $tagCounts{$a} or $a cmp $b }
	grep { $tagCounts{$_} >= 1000 } keys %tagCounts
) {
	print "$pair\n";
}
