00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdio.h>
00023 #include <stdlib.h>
00024 #include <string.h>
00025
00026 #include "toolkit.h"
00027 #include "rr_libs/general.h"
00028 #include "pc_libs/pc_general.h"
00029
00030 #define DEFAULT_MAX_RECORDS 3000000
00031
/*
 * One entry of the word-frequency table: a word together with the
 * occurrence count that was read alongside it.
 */
typedef struct {
  char *word;   /* the word itself; main() stores a copy obtained from salloc() */
  int count;    /* count parsed from the input stream for this word */
} word_rec;
00043
00044 int sort_by_count(const void *rec1,const void *rec2) {
00045
00046 word_rec *r1;
00047 word_rec *r2;
00048
00049 r1 = (word_rec *) rec1;
00050 r2 = (word_rec *) rec2;
00051
00052 return(r2->count-r1->count);
00053
00054 }
00055
00056 int sort_alpha(const void *rec1,const void *rec2) {
00057
00058 word_rec *r1;
00059 word_rec *r2;
00060
00061 char *s1;
00062 char *s2;
00063
00064 r1 = (word_rec *) rec1;
00065 r2 = (word_rec *) rec2;
00066
00067 s1 = r1->word;
00068 s2 = r2->word;
00069
00070 return (strcmp(s1,s2));
00071
00072 }
00073
00074
00075
00076
00077
00078 void main(int argc, char *argv[]) {
00079
00080 int verbosity;
00081 int vocab_size;
00082 int cutoff;
00083 int num_recs;
00084 int current_rec;
00085 int num_above_threshold;
00086 int num_to_output;
00087 int i;
00088 word_rec *records;
00089 char temp_word[750];
00090 flag gt_set;
00091 flag top_set;
00092
00093
00094
00095 report_version(&argc,argv);
00096
00097 if (pc_flagarg( &argc, argv,"-help")) {
00098 fprintf(stderr,"wfreq2vocab : Generate a vocabulary file from a word frequency file.\n");
00099 fprintf(stderr,"Usage : wfreq2vocab [ -top 20000 | -gt 10]\n");
00100 fprintf(stderr," [ -records %d ]\n",DEFAULT_MAX_RECORDS);
00101 fprintf(stderr," [ -verbosity %d]\n",DEFAULT_VERBOSITY);
00102 fprintf(stderr," < .wfreq > .vocab\n");
00103 exit(1);
00104 }
00105
00106 cutoff = pc_intarg( &argc, argv, "-gt",-1);
00107 vocab_size = pc_intarg(&argc, argv, "-top",-1);
00108 num_recs = pc_intarg(&argc, argv, "-records",DEFAULT_MAX_RECORDS);
00109 verbosity = pc_intarg(&argc, argv, "-verbosity",DEFAULT_VERBOSITY);
00110
00111 pc_report_unk_args(&argc,argv,verbosity);
00112
00113 if (cutoff != -1) {
00114 gt_set = 1;
00115 }
00116 else {
00117 gt_set = 0;
00118 cutoff = 0;
00119 }
00120
00121 if (vocab_size != -1) {
00122 top_set = 1;
00123 }
00124 else {
00125 top_set = 0;
00126 vocab_size = 0;
00127 }
00128
00129 if (gt_set && top_set) {
00130 quit(-1,"wfreq2vocab : Error : Can't use both the -top and the -gt options.\n");
00131 }
00132
00133
00134
00135 if (!gt_set && !top_set) {
00136 vocab_size = 20000;
00137 }
00138
00139 if (gt_set) {
00140 pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing all words which\n occurred more that %d times. Reading wfreq stream from stdin...\n",cutoff);
00141 }
00142 else {
00143 pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing the most\n frequent %d words. Reading wfreq stream from stdin...\n",vocab_size);
00144 }
00145
00146
00147 records = (word_rec *) rr_malloc(sizeof(word_rec)*num_recs);
00148
00149 current_rec = 0;
00150 num_above_threshold = 0;
00151
00152 while (!rr_feof(stdin)) {
00153
00154 if (scanf("%s %d",temp_word,&(records[current_rec].count)) != 2) {
00155 if (!rr_feof(stdin)) {
00156 quit(-1,"Error reading unigram counts from standard input.\n");
00157 }
00158 }
00159 else {
00160 records[current_rec].word = salloc(temp_word);
00161 if (gt_set && records[current_rec].count > cutoff) {
00162 num_above_threshold++;
00163 }
00164 current_rec++;
00165 }
00166 }
00167
00168
00169
00170 qsort((void*) records,(size_t) current_rec, sizeof(word_rec),sort_by_count);
00171
00172 if (gt_set) {
00173 num_to_output = num_above_threshold;
00174 }
00175 else {
00176 num_to_output = vocab_size;
00177 }
00178
00179 if (current_rec<num_to_output) {
00180 num_to_output = current_rec;
00181 }
00182
00183
00184
00185 qsort((void*) records,(size_t) num_to_output, sizeof(word_rec),sort_alpha);
00186
00187 if (gt_set) {
00188 pc_message(verbosity,2,"Size of vocabulary = %d\n",num_to_output);
00189 }
00190
00191 if (num_to_output>65535) {
00192 pc_message(verbosity,1,"Warning : Vocab size exceeds 65535. This will cause problems with \nother tools, since word id's are stored in 2 bytes.\n");
00193 }
00194
00195 if (num_to_output == 0) {
00196 pc_message(verbosity,1,"Warning : Vocab size = 0.\n");
00197 }
00198
00199
00200
00201 printf("## Vocab generated by v2 of the CMU-Cambridge Statistcal\n");
00202 printf("## Language Modeling toolkit.\n");
00203 printf("##\n");
00204 printf("## Includes %d words ",num_to_output);
00205 printf("##\n");
00206
00207 for (i=0;i<=num_to_output-1;i++) {
00208 printf("%s\n",records[i].word);
00209 }
00210
00211 pc_message(verbosity,0,"wfreq2vocab : Done.\n");
00212
00213 exit(0);
00214
00215 }
00216
00217
00218
00219