#!/usr/local/bin/perl

# ./extract-vocabulary.pl < ../corpus/elicitation-EN.txt.rest.clean >! ../corpus/elicitation-EN.txt.rest.clean.voc

# ./extract-vocabulary.pl < ../corpus/input-xfer >! ../corpus/input-xfer.voc

# To merge two vocabularies into 1, do:
# cat elicitation-corpus-EN.txt elicitation-rest > EC


use strict;
use warnings;

# Vocabulary extractor.
#
# Reads text from the files named on the command line (or STDIN when none
# are given), lower-cases it (ASCII A-Z only), splits it on whitespace and
# prints one line per distinct word ("type") together with the number of
# times it occurred ("tokens"), most frequent first.  A short statistics
# footer with the corpus size and the vocabulary size follows.

# tally_line($line, \%count_of)
#
# Lower-cases $line, splits it on whitespace and increments the entry of
# %count_of for every word found.  Returns the number of words (tokens)
# seen on this line.
sub tally_line {
    my ($line, $count_of) = @_;

    # tr/// rather than lc() so that only ASCII letters are folded,
    # whatever locale or encoding the input happens to use.
    (my $lowered = $line) =~ tr/A-Z/a-z/;

    # split " " skips leading whitespace and never yields empty fields.
    my @words = split " ", $lowered;
    $count_of->{$_}++ for @words;

    return scalar @words;
}

# types_by_frequency(\%count_of)
#
# Returns the keys of %count_of ordered by descending count; words with the
# same count are ordered alphabetically so the output is reproducible from
# run to run (plain hash order is not).
sub types_by_frequency {
    my ($count_of) = @_;

    my @ordered = sort {
        $count_of->{$b} <=> $count_of->{$a}
            or $a cmp $b
    } keys %{$count_of};

    return @ordered;
}

# main()
#
# Drives the script: counts every token read via <>, then prints the
# frequency-ordered vocabulary followed by the statistics footer.
sub main {
    my %count_of;
    my $num_tokens = 0;

    while (my $line = <>) {
        $num_tokens += tally_line($line, \%count_of);
    }

    my @types = types_by_frequency(\%count_of);
    for my $type (@types) {
        # type    token
        print STDOUT "$type\t\t$count_of{$type}\n";
    }

    # TOTAL number of tokens (corpus size: total number of words)
    # + number of types (vocabulary size)
    my $num_types = scalar @types;
    print STDOUT "\n***Stats***\nnumber of words: $num_tokens\nnumber of types: $num_types\n";

    return;
}

# Run only when executed as a script, so the helpers can be loaded
# (e.g. by a test) without consuming STDIN.
main() unless caller;
