wfreq2vocab.c Source File

00001 
00002 
00003 /*=====================================================================
00004                 =======   COPYRIGHT NOTICE   =======
00005 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00006 Ronald Rosenfeld and Philip Clarkson.
00007 
00008 All rights reserved.
00009 
00010 This software is made available for research purposes only.  It may be
00011 redistributed freely for this purpose, in full or in part, provided
00012 that this entire copyright notice is included on any copies of this
00013 software and applications and derivations thereof.
00014 
00015 This software is provided on an "as is" basis, without warranty of any
00016 kind, either expressed or implied, as to any matter including, but not
00017 limited to warranty of fitness of purpose, or merchantability, or
00018 results obtained from use of this software.
00019 ======================================================================*/
00020 
00021 
00022 #include <stdio.h>
00023 #include <stdlib.h>
00024 #include <string.h>
00025 
00026 #include "toolkit.h"
00027 #include "rr_libs/general.h"
00028 #include "pc_libs/pc_general.h"
00029 
00030 #define DEFAULT_MAX_RECORDS 3000000
00031 
/*
 * One entry of the word-frequency table: a word and the count that
 * accompanies it.
 */
typedef struct {
  char *word;   /* word string (compared with strcmp, printed with %s) */
  int count;    /* count associated with the word */
} word_rec;

/*
 * qsort() comparator that orders word_rec entries by descending count.
 *
 * Returns a negative value when rec1 holds the larger count, a positive
 * value when rec2 holds the larger count, and 0 when the counts are equal.
 *
 * The result is built from two comparisons instead of the subtraction
 * (r2->count - r1->count): the subtraction overflows int (undefined
 * behaviour, and in practice a wrong sign) when the counts have large
 * magnitude and opposite signs.
 */
int sort_by_count(const void *rec1,const void *rec2) {

  const word_rec *r1 = (const word_rec *) rec1;
  const word_rec *r2 = (const word_rec *) rec2;

  return (r2->count > r1->count) - (r2->count < r1->count);

}
00055 
00056 int sort_alpha(const void *rec1,const void *rec2) {
00057  
00058   word_rec *r1;
00059   word_rec *r2;
00060  
00061   char *s1;
00062   char *s2;
00063   
00064   r1 = (word_rec *) rec1;
00065   r2 = (word_rec *) rec2;
00066   
00067   s1 = r1->word;
00068   s2 = r2->word;
00069 
00070   return (strcmp(s1,s2));
00071  
00072 }
00073 
00074 /***************************
00075       MAIN FUNCTION
00076  ***************************/
00077 
00078 void main(int argc, char *argv[]) {
00079 
00080   int verbosity;
00081   int vocab_size;
00082   int cutoff;
00083   int num_recs;
00084   int current_rec;
00085   int num_above_threshold;
00086   int num_to_output;
00087   int i;
00088   word_rec *records;
00089   char temp_word[750];
00090   flag gt_set;
00091   flag top_set;
00092 
00093   /* Process command line */
00094 
00095   report_version(&argc,argv);
00096 
00097   if (pc_flagarg( &argc, argv,"-help")) {
00098     fprintf(stderr,"wfreq2vocab : Generate a vocabulary file from a word frequency file.\n");
00099     fprintf(stderr,"Usage : wfreq2vocab [ -top 20000 | -gt 10]\n");
00100     fprintf(stderr,"                    [ -records %d ]\n",DEFAULT_MAX_RECORDS);
00101     fprintf(stderr,"                    [ -verbosity %d]\n",DEFAULT_VERBOSITY);
00102     fprintf(stderr,"                    < .wfreq > .vocab\n");
00103     exit(1);
00104   }
00105 
00106   cutoff = pc_intarg( &argc, argv, "-gt",-1);
00107   vocab_size = pc_intarg(&argc, argv, "-top",-1);
00108   num_recs = pc_intarg(&argc, argv, "-records",DEFAULT_MAX_RECORDS);
00109   verbosity = pc_intarg(&argc, argv, "-verbosity",DEFAULT_VERBOSITY);
00110   
00111   pc_report_unk_args(&argc,argv,verbosity);
00112 
00113   if (cutoff != -1) {
00114     gt_set = 1;
00115   }
00116   else {
00117     gt_set = 0;
00118     cutoff = 0;
00119   }
00120 
00121   if (vocab_size != -1) {
00122     top_set = 1;
00123   }
00124   else {
00125     top_set = 0;
00126     vocab_size = 0;
00127   }
00128   
00129   if (gt_set && top_set) {
00130     quit(-1,"wfreq2vocab : Error : Can't use both the -top and the -gt options.\n");
00131   }
00132 
00133 
00134 
00135   if (!gt_set && !top_set) {
00136     vocab_size = 20000;
00137   }
00138 
00139   if (gt_set) {
00140     pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing all words which\n              occurred more that %d times. Reading wfreq stream from stdin...\n",cutoff);
00141   }
00142   else {
00143     pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing the most\n              frequent %d words. Reading wfreq stream from stdin...\n",vocab_size);
00144   }
00145 
00146 
00147   records = (word_rec *) rr_malloc(sizeof(word_rec)*num_recs);
00148 
00149   current_rec = 0;
00150   num_above_threshold = 0;
00151   
00152   while (!rr_feof(stdin)) {
00153 
00154     if (scanf("%s %d",temp_word,&(records[current_rec].count)) != 2) {
00155       if (!rr_feof(stdin)) {
00156         quit(-1,"Error reading unigram counts from standard input.\n");
00157       }
00158     }
00159     else {
00160       records[current_rec].word = salloc(temp_word);
00161       if (gt_set && records[current_rec].count > cutoff) {
00162         num_above_threshold++;
00163       }
00164       current_rec++;
00165     }
00166   }
00167 
00168   /* Sort records in descending order of count */
00169 
00170   qsort((void*) records,(size_t) current_rec, sizeof(word_rec),sort_by_count);
00171 
00172   if (gt_set) {
00173     num_to_output = num_above_threshold;
00174   }
00175   else {
00176     num_to_output = vocab_size;
00177   }
00178 
00179   if (current_rec<num_to_output) {
00180     num_to_output = current_rec;
00181   }
00182 
00183   /* Now sort the relevant records alphabetically */
00184 
00185   qsort((void*) records,(size_t) num_to_output, sizeof(word_rec),sort_alpha);
00186 
00187   if (gt_set) {
00188     pc_message(verbosity,2,"Size of vocabulary = %d\n",num_to_output);
00189   }
00190   
00191   if (num_to_output>65535) {
00192     pc_message(verbosity,1,"Warning : Vocab size exceeds 65535. This will cause problems with \nother tools, since word id's are stored in 2 bytes.\n");
00193   }
00194 
00195   if (num_to_output == 0) {
00196     pc_message(verbosity,1,"Warning : Vocab size = 0.\n");
00197   }
00198 
00199   /* Print the vocab to stdout */
00200   
00201   printf("## Vocab generated by v2 of the CMU-Cambridge Statistcal\n");
00202   printf("## Language Modeling toolkit.\n");
00203   printf("##\n");
00204   printf("## Includes %d words ",num_to_output);
00205   printf("##\n");
00206 
00207   for (i=0;i<=num_to_output-1;i++) {
00208     printf("%s\n",records[i].word);
00209   }
00210 
00211   pc_message(verbosity,0,"wfreq2vocab : Done.\n");
00212 
00213   exit(0);
00214 
00215 }  
00216     
00217 
00218 
00219