#!/usr/local/bin/perl

#open necessary files
#usage: <script> INPUT_FILE [OUTPUT_FILE]
#  INPUT_FILE  - text corpus to analyse (required)
#  OUTPUT_FILE - receives the type-token data; when it is omitted that data
#                is not written and only the report on standard output is
#                produced
@ARGV or die "usage: $0 INPUT_FILE [OUTPUT_FILE]\n";

#three-argument open: the file name is taken literally and is never parsed
#for a mode character or a pipe (so "-" names a file, not STDIN)
open (InFile, '<', $ARGV[0]) or
    die "couldn't open input file '$ARGV[0]': $!";

if (defined $ARGV[1])
{
    open (OutFile, '>', $ARGV[1]) or
        die "couldn't open output file '$ARGV[1]': $!";
}
else
{
    warn "no output file given; type-token data will not be written\n";
}

#read in data (the whole corpus, one array element per line)
@Corpus = <InFile>;
close (InFile);

#tally the corpus
#  $NumTokens - number of word occurrences counted so far
#  $NumTypes  - number of distinct words seen so far
#  %Tokens    - word => number of occurrences
#  %TypeToken - token count => type count, recorded whenever a new type
#               appears (data for the type-token graph)
$NumTokens = 0;
$NumTypes = 0;
for my $line (@Corpus)
{
    #whitespace-separated fields of the current line
    #$line = StopList($line);
    @CurWords = split(/\s+/, $line);
    for my $field (@CurWords)
    {
        #only fields containing at least one ASCII letter count as words;
        #this also drops the empty field produced by leading whitespace
        next unless $field =~ /[a-zA-Z]/;

        $NumTokens++;
        if ($Tokens{$field})
        {
            #seen before: one more occurrence of this word
            $Tokens{$field}++;
            next;
        }

        #first occurrence: a new type, and a new point on the graph
        $Tokens{$field} = 1;
        $NumTypes++;
        $TypeToken{$NumTokens} = $NumTypes;
    }
}

#print the results
#
#Writes to standard output the total token count followed by a table of the
#20 most frequent and the 20 least frequent word types. When there are 40 or
#fewer types every type is listed; the "Tail:" heading appears only when
#there are at least 40. Each row holds the occurrence count, the relative
#frequency (count / total tokens) and the word itself.

#head and tail results
print "Total number of words: $NumTokens\n\n\n";

print "   Count        Estimate       WordType\n
=============  =============  ==============\n\n
Head:\n";

$Pos = 0;
foreach $key (sort Descending (keys(%Tokens)))
{
  #heading in front of the first tail row; the $Pos >= 20 guard keeps it out
  #of the head section when head and tail overlap
  if (($Pos >= 20) && ($Pos == $NumTypes - 20))
  {
      print "\n\nTail:\n";
  }

  if (($Pos < 20) || ($Pos >= $NumTypes - 20))
  {
      #compute the estimate for every printed row, so the first tail row no
      #longer reuses the value left over from the last head row
      $Est = $Tokens{$key} / $NumTokens;
      #the word is passed as an argument, not interpolated into the format,
      #so a "%" inside a word cannot be read as a conversion
      printf "%10d \t %4f \t %s\n", $Tokens{$key}, $Est, $key;
  }
  $Pos++;
}

#type-token results
#one line per newly seen word type, in ascending token-count order:
#  <tokens counted so far>;;<types seen so far>
foreach $key (sort By_Number (keys %TypeToken))
{
    print OutFile "$key;;$TypeToken{$key}\n";
}

#buffered write errors (e.g. a full disk) only surface when the handle is
#closed, so close it explicitly and check the result; skip this when the
#handle was never successfully opened
if (defined fileno(OutFile))
{
    close (OutFile) or die "couldn't write output file: $!";
}


#helper procedures

#stoplist
#returns a copy of the given string with every occurrence of the characters
#  < > * - . ; : , ? ! ( ) " ' & $ / = % _   and the digits 0-9
#removed; all other characters (including whitespace) are kept
sub StopList
{
    my $text = $_[0];
    $text =~ s{[<>*\-.;:,?!()"'&\$/=%_0-9]}{}g;
    return $text;
}

#to sort hash table by value rather than key
#sort comparator (uses sort's $a and $b): orders words by descending
#occurrence count in %Tokens. Words with equal counts are ordered
#alphabetically, so that the listing is the same on every run; without the
#tie-break their order follows the arbitrary order of the hash keys.
sub Descending
{
  return $Tokens{$b} <=> $Tokens{$a}
      || $a cmp $b;
}

#for numeric sort of type-token results
#sort comparator (uses sort's $a and $b): returns -1, 0 or 1 as $a is
#numerically less than, equal to or greater than $b, giving ascending order
sub By_Number
{
    return $a <=> $b;
}
