#!/usr/local/bin/perl
#
# Rule-based suffix stripping over a POS-tagged corpus.
#
# Usage: perl <this-script> <tagged-file>
#
# Each input line is a whitespace-separated list of tokens of the form
# word_TAG.  Words tagged NOUN or VERB are reduced to a root by the suffix
# rules in stem_word(); every other word is kept unchanged.  The script then
# prints every "root-TAG" pair seen at least MIN_COUNT times, one per line,
# most frequent first.
#
# NOTE(review): the suffix literals look like Urdu script -- confirm with the
# corpus owner.  The script deliberately has no `use utf8` and no encoding
# layers: source literals, input and output are all handled as raw bytes,
# which keeps the patterns and the data consistent with each other.

use strict;
use warnings;

# Minimum number of occurrences a root-TAG pair needs in order to be printed.
use constant MIN_COUNT => 1000;

# Run only when executed as a script, so that stem_word() can be loaded and
# exercised on its own without reading any file.
main(@ARGV) unless caller;

# stem_word($word, $tag) -> $root
#
# Returns the root of $word according to the tag-specific suffix rules, or
# $word itself when the tag is neither NOUN nor VERB or when no rule matches.
# The rules are applied in order and a later match overrides an earlier one
# (e.g. the final "sk-" rule overrides the plain "ta/te/ti" rule), so the
# order of the statements below matters.
sub stem_word {
    my ($word, $tag) = @_;

    my $root = $word;
    if ($tag eq 'NOUN') {
        $root = $1 if $word =~ /(.+)(و|ے|ی)ں$/;
    }
    elsif ($tag eq 'VERB') {
        $root = 'گا' if $word =~ /^گ(ے|ی)$/;
        $root = $1   if $word =~ /(.+)(ت|ن)(ے|ی|ا)$/;      # ta te ti na ne ni
        $root = $1   if $word =~ /(.+)ئ(ے|ی)$/;            # ye yi
        $root = $1   if $word =~ /(.+)یے$/;                # ye
        $root = $1   if $word =~ /(.+)یا$/;                # ya
        $root = $1   if $word =~ /(.+)ئیں$/;               # yiN
        $root = $1   if $word =~ /(.+)ئیے$/;               # yie
        $root = $1   if $word =~ /(.+)سک(ت)(ے|ی|ا)$/;      # skte skti skta
        $root = 'تھا' if $word =~ /^تھی$/;
        $root = 'تھے' if $word =~ /^تھیں$/;
    }
    return $root;
}

# main($fileName)
#
# Reads the tagged corpus named by $fileName, tallies the stemmed forms and
# prints the frequent root-TAG pairs to STDOUT.  Dies when no file name is
# given or when the file cannot be opened.
sub main {
    my ($fileName) = @_;
    die "Usage: $0 <tagged-file>\n" unless defined $fileName;

    my %originalCount = ();    # surface form -> occurrences
    my %stemmedCount  = ();    # root         -> occurrences
    my %tagCounts     = ();    # "root-TAG"   -> occurrences

    open(my $in, '<', $fileName)
        or die("Couldn't open the file $fileName: $!\n");

    while (my $line = <$in>) {
        chomp $line;

        # split ' ' (unlike /\s+/) drops leading whitespace, so an indented
        # line does not produce an empty first token.
        for my $token (split ' ', $line) {
            my ($word, $tag) = split /_/, $token;

            # Skip tokens that carry no word at all (e.g. "_" or "_TAG").
            next if !defined $word || $word eq '';

            # A token without "_TAG" is counted with an empty tag.
            $tag = '' unless defined $tag;

            $originalCount{$word}++;

            my $root = stem_word($word, $tag);
            $stemmedCount{$root}++;
            $tagCounts{"$root-$tag"}++;
        }
    }
    close $in;

    # Optional diagnostic, kept disabled: compare vocabulary sizes and list
    # the roots whose stemmed count differs from their surface-form count.
    #
    # my $originalLex = keys %originalCount;
    # my $stemmedLex  = keys %stemmedCount;
    # print STDERR "$originalLex $stemmedLex\n";
    # foreach (sort keys %stemmedCount) {
    #     print "$_ $stemmedCount{$_} $originalCount{$_}\n"
    #         if defined $originalCount{$_}
    #         and $stemmedCount{$_} != $originalCount{$_};
    # }

    # Most frequent first; ties are broken by key so that the output order is
    # reproducible from run to run.
    my @byFrequency =
        sort { $tagCounts{$b} <=> $tagCounts{$a} or $a cmp $b } keys %tagCounts;

    foreach my $pair (@byFrequency) {
        print "$pair\n" if $tagCounts{$pair} >= MIN_COUNT;
    }
    return;
}