00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
/* Default for the -hash option (requested vocabulary hash table size). */
#define DEFAULT_HASH_SIZE   200000

/* Default for the -files option (passed on to merge_tempfiles). */
#define DEFAULT_MAX_FILES   20

/* NOTE(review): not referenced in the visible part of this file; presumably
   an upper bound on n - confirm before relying on it. */
#define MAX_N               20

/* Leading part of every temporary file name; the host name, process id and
   file number are appended to it. */
#define TEMP_FILE_ROOT      "wngram2idngram.temp."
00029
00030 #include <stdio.h>
00031 #include <stdlib.h>
00032 #include <string.h>
00033 #include <unistd.h>
00034 #include <sys/utsname.h>
00035 #include <sys/types.h>
00036 #include "toolkit.h"
00037 #include "rr_libs/general.h"
00038 #include "pc_libs/pc_general.h"
00039 #include "idngram.h"
00040
/*
 * One buffered n-gram together with its count.  Records of this type are
 * held in an array that is sorted with qsort() before being written out.
 */
typedef struct {
  unsigned short *word;   /* Array of word ids, one per position in the n-gram. */
  int             count;  /* Count read alongside the n-gram. */
} ngram_rec;
00045
00046 int compare_ngrams2(const void *ngram1,
00047 const void *ngram2) {
00048
00049 int temp;
00050 int i;
00051 ngram_rec *r1;
00052 ngram_rec *r2;
00053
00054 r1 = (ngram_rec *) ngram1;
00055 r2 = (ngram_rec *) ngram2;
00056
00057 temp = 0;
00058
00059 for (i=0;i<=n-1;i++) {
00060 if ((r1->word[i]) < (r2->word[i])) {
00061 temp = -1;
00062 i = n;
00063 }
00064 else {
00065 if ((r1->word[i]) > (r2->word[i])) {
00066 temp = 1;
00067 i = n;
00068 }
00069 }
00070 }
00071
00072 return(temp);
00073
00074 }
00075
00076 void main(int argc, char *argv[]) {
00077
00078 int verbosity;
00079 int vocab_size;
00080 FILE *vocab_file;
00081 int buffer_size;
00082 flag write_ascii;
00083 int max_files;
00084 int number_of_tempfiles;
00085 char *vocab_filename;
00086 char tempfiles_directory[1000];
00087 char temp_word[MAX_WORD_LENGTH];
00088 char temp_word2[MAX_WORD_LENGTH];
00089 char temp_word3[MAX_WORD_LENGTH];
00090 flag contains_unks;
00091 int position_in_buffer;
00092 FILE *tempfile;
00093 FILE *non_unk_fp;
00094 ngram_rec *buffer;
00095 flag same_ngram;
00096 int i;
00097 int j;
00098 int nlines;
00099 int fof_size;
00100 int size_of_rec;
00101 char *temp_file_root;
00102 char *temp_file_ext;
00103 char *host_name;
00104 struct utsname uname_info;
00105 int proc_id;
00106
00107
00108
00109 struct hash_table vocabulary;
00110 unsigned long hash_size;
00111 unsigned long M;
00112
00113 unsigned short *current_ngram;
00114 int current_count;
00115 unsigned short *sort_ngram;
00116 int sort_count;
00117
00118
00119
00120 report_version(&argc,argv);
00121
00122 if (pc_flagarg( &argc, argv,"-help") || argc==1) {
00123 fprintf(stderr,"wngram2idngram - Convert a word n-gram file to an id n-gram file.\n");
00124 fprintf(stderr,"Usage : wngram2idngram -vocab .vocab\n");
00125 fprintf(stderr," [ -buffer %d ] \n",STD_MEM);
00126 fprintf(stderr," [ -hash %d ]\n",DEFAULT_HASH_SIZE);
00127 fprintf(stderr," [ -temp %s ]\n",DEFAULT_TEMP);
00128 fprintf(stderr," [ -files %d ]\n",DEFAULT_MAX_FILES);
00129 fprintf(stderr," [ -gzip | -compress ]\n");
00130 fprintf(stderr," [ -verbosity 2 ]\n");
00131 fprintf(stderr," [ -n 3 ]\n");
00132 fprintf(stderr," [ -write_ascii ]\n");
00133 fprintf(stderr," [ -fof_size 10 ]\n");
00134 fprintf(stderr," < .wngram > .idngram\n");
00135 exit(1);
00136 }
00137
00138 n = pc_intarg( &argc, argv, "-n",DEFAULT_N);
00139
00140 hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
00141 buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);
00142
00143 write_ascii = pc_flagarg(&argc,argv,"-write_ascii");
00144
00145 verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
00146
00147 max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);
00148 fof_size = pc_intarg(&argc,argv,"-fof_size",10);
00149 vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
00150
00151 if (!strcmp("",vocab_filename)) {
00152 quit(-1,"Error : Must specify a vocabulary file.\n");
00153 }
00154
00155 strcpy(tempfiles_directory,pc_stringarg( &argc, argv, "-temp",DEFAULT_TEMP));
00156
00157
00158 if (pc_flagarg(&argc,argv,"-compress")) {
00159 temp_file_ext = salloc(".Z");
00160 }
00161 else {
00162 if (pc_flagarg(&argc,argv,"-gzip")) {
00163 temp_file_ext = salloc(".gz");
00164 }
00165 else {
00166 temp_file_ext = salloc("");
00167 }
00168 }
00169
00170 uname(&uname_info);
00171
00172 host_name = salloc(uname_info.nodename);
00173
00174 proc_id = getpid();
00175
00176 sprintf(temp_word,"%s%s.%d.",TEMP_FILE_ROOT,host_name,proc_id);
00177
00178 temp_file_root = salloc(temp_word);
00179
00180
00181 pc_report_unk_args(&argc,argv,verbosity);
00182
00183
00184
00185 if (tempfiles_directory[strlen(tempfiles_directory)-1] != '/') {
00186 strcat(tempfiles_directory,"/");
00187 }
00188
00189 pc_message(verbosity,2,"Vocab : %s\n",vocab_filename);
00190 pc_message(verbosity,2,"Buffer size : %d\n",buffer_size);
00191 pc_message(verbosity,2,"Hash table size : %d\n",hash_size);
00192 pc_message(verbosity,2,"Temp directory : %s\n",tempfiles_directory);
00193 pc_message(verbosity,2,"Max open files : %d\n",max_files);
00194 pc_message(verbosity,2,"n : %d\n",n);
00195 pc_message(verbosity,2,"FOF size : %d\n",fof_size);
00196
00197 size_of_rec = (sizeof(unsigned short) * n) + 16 -
00198 ((n*sizeof(unsigned short)) % 16);
00199
00200 buffer_size *= (1000000/((sizeof(ngram_rec) + size_of_rec)));
00201 fprintf(stderr,"buffer size = %d\n",buffer_size);
00202
00203
00204
00205 fprintf(stderr,"Initialising hash table...\n");
00206
00207 M = nearest_prime(hash_size);
00208
00209 new_hashtable(&vocabulary,M);
00210
00211
00212
00213 vocab_size = 0;
00214
00215 vocab_file = rr_iopen(vocab_filename);
00216
00217 pc_message(verbosity,2,"Reading vocabulary...\n");
00218
00219 while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
00220 if (strncmp(temp_word,"##",2)==0) continue;
00221 sscanf (temp_word, "%s ",temp_word2);
00222
00223
00224
00225 if (vocab_size > 0 && strcmp(temp_word2,temp_word3)<0) {
00226 quit(-1,"wngram2idngram : Error : Vocabulary is not alphabetically ordered.\n");
00227 }
00228
00229
00230
00231 if (index2(&vocabulary,temp_word2) != 0) {
00232 fprintf(stderr,"======================================================\n");
00233 fprintf(stderr,"WARNING: word %s is repeated in the vocabulary.\n",temp_word);
00234 fprintf(stderr,"=======================================================\n");
00235 }
00236
00237 if (strncmp(temp_word,"#",1)==0) {
00238 fprintf(stderr,"\n\n===========================================================\n");
00239 fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
00240 fprintf(stderr, ">>> %s <<<\n",temp_word);
00241 fprintf(stderr, " '%s' will be included in the context cues list\n",temp_word2);
00242 fprintf(stderr, " (comments must start with '##')\n");
00243 fprintf(stderr,"===========================================================\n\n");
00244 }
00245 vocab_size++;
00246 add_to_hashtable(&vocabulary,hash(temp_word2,M),temp_word2,vocab_size);
00247 strcpy(temp_word3,temp_word2);
00248 }
00249
00250 if (vocab_size > MAX_VOCAB_SIZE) {
00251 quit(-1,"Error : Vocabulary size exceeds maximum.\n");
00252 }
00253
00254 pc_message(verbosity,2,"Allocating memory for the buffer...\n");
00255
00256 buffer=(ngram_rec *) rr_malloc((buffer_size+1)*sizeof(ngram_rec));
00257
00258 for (i=0;i<=buffer_size;i++) {
00259 buffer[i].word = (unsigned short *) rr_malloc(n*sizeof(unsigned short));
00260 }
00261
00262
00263
00264 sprintf(temp_word,"%s%s1%s",tempfiles_directory,temp_file_root,temp_file_ext);
00265
00266 non_unk_fp = rr_fopen(temp_word,"w");
00267
00268 pc_message(verbosity,2,"Writing non-OOV counts to temporary file %s\n",
00269 temp_word);
00270 number_of_tempfiles = 1;
00271
00272 current_ngram = (unsigned short *) rr_malloc(n*sizeof(unsigned short));
00273 sort_ngram = (unsigned short *) rr_malloc(n*sizeof(unsigned short));
00274
00275
00276
00277 nlines = 0;
00278
00279 position_in_buffer = 0;
00280
00281 while (!rr_feof(stdin)) {
00282
00283 for (i=0;i<=n-1;i++) {
00284 get_word(stdin,temp_word);
00285 current_ngram[i]=index2(&vocabulary,temp_word);
00286 }
00287 if (scanf("%d",¤t_count) != 1) {
00288 if (!rr_feof(stdin)) {
00289 quit(-1,"Error reading n-gram count from stdin.\n");
00290 }
00291 }
00292
00293 if (!rr_feof(stdin)) {
00294
00295 contains_unks = 0;
00296 for (i=0;i<=n-1;i++) {
00297 if (!current_ngram[i]) {
00298 contains_unks = 1;
00299 }
00300 }
00301
00302 if (contains_unks) {
00303
00304
00305
00306 position_in_buffer++;
00307
00308 if (position_in_buffer >= buffer_size) {
00309
00310
00311
00312 pc_message(verbosity,2,
00313 "Sorting n-grams which include an OOV word...\n");
00314
00315 qsort((void*) buffer,(size_t) position_in_buffer,
00316 sizeof(ngram_rec),compare_ngrams2);
00317
00318 pc_message(verbosity,2,"Done.\n");
00319
00320
00321
00322 number_of_tempfiles++;
00323
00324 sprintf(temp_word,"%s%s%hu%s",tempfiles_directory,temp_file_root,
00325 number_of_tempfiles,temp_file_ext);
00326
00327 pc_message(verbosity,2,
00328 "Writing sorted OOV-counts buffer to temporary file %s\n",
00329 temp_word);
00330
00331 tempfile = rr_fopen(temp_word,"w");
00332
00333 for (i=0;i<=n-1;i++) {
00334 sort_ngram[i] = buffer[0].word[i];
00335 }
00336 sort_count = buffer[0].count;
00337
00338 for (i=0;i<=position_in_buffer-2;i++) {
00339
00340 same_ngram = 1;
00341 for (j=n-1;j>=0;j--) {
00342 if (buffer[i].word[j] != sort_ngram[j]) {
00343 same_ngram = 0;
00344 j = -1;
00345 }
00346 }
00347
00348 if (same_ngram) {
00349 sort_count += buffer[i].count;
00350 }
00351 else {
00352 for (j=0;j<=n-1;j++) {
00353 rr_fwrite(&sort_ngram[j],sizeof(unsigned short),1,
00354 tempfile,"temporary n-gram ids");
00355 sort_ngram[j] = buffer[i].word[j];
00356 }
00357 rr_fwrite(&sort_count,sizeof(int),1,tempfile,
00358 "temporary n-gram counts");
00359 sort_count = buffer[i].count;
00360 }
00361 }
00362 for (j=0;j<=n-1;j++) {
00363 rr_fwrite(&sort_ngram[j],sizeof(unsigned short),1,
00364 tempfile,"temporary n-gram ids");
00365 }
00366 rr_fwrite(&sort_count,sizeof(int),1,tempfile,
00367 "temporary n-gram counts");
00368 rr_oclose(tempfile);
00369 position_in_buffer = 1;
00370
00371 }
00372
00373 for (i=0;i<=n-1;i++) {
00374 buffer[position_in_buffer-1].word[i] = current_ngram[i];
00375 }
00376
00377 buffer[position_in_buffer-1].count = current_count;
00378
00379 }
00380
00381 else {
00382
00383
00384
00385 for (i=0;i<=n-1;i++) {
00386 rr_fwrite(¤t_ngram[i],sizeof(unsigned short),1,
00387 non_unk_fp,"temporary n-gram ids");
00388
00389 }
00390 rr_fwrite(¤t_count,sizeof(int),1,non_unk_fp,
00391 "temporary n-gram counts");
00392
00393 }
00394
00395 }
00396
00397 }
00398
00399 if (position_in_buffer > 0) {
00400
00401
00402
00403
00404
00405 pc_message(verbosity,2,"Sorting final buffer...\n");
00406
00407 qsort((void*) buffer,(size_t) position_in_buffer,
00408 sizeof(ngram_rec),compare_ngrams2);
00409
00410
00411
00412 number_of_tempfiles++;
00413
00414 sprintf(temp_word,"%s%s%hu%s",tempfiles_directory,temp_file_root,
00415 number_of_tempfiles,temp_file_ext);
00416
00417 pc_message(verbosity,2,
00418 "Writing sorted buffer to temporary file %s\n",
00419 temp_word);
00420
00421 tempfile = rr_fopen(temp_word,"w");
00422
00423 for (i=0;i<=n-1;i++) {
00424 sort_ngram[i] = buffer[0].word[i];
00425 }
00426 sort_count = buffer[0].count;
00427
00428 for (i=1;i<=position_in_buffer-1;i++) {
00429
00430 same_ngram = 1;
00431 for (j=n-1;j>=0;j--) {
00432 if (buffer[i].word[j] != sort_ngram[j]) {
00433 same_ngram = 0;
00434 j = -1;
00435 }
00436 }
00437
00438 if (same_ngram) {
00439 sort_count += buffer[i].count;
00440 }
00441 else {
00442 for (j=0;j<=n-1;j++) {
00443 rr_fwrite(&sort_ngram[j],sizeof(unsigned short),1,
00444 tempfile,"temporary n-gram ids");
00445 sort_ngram[j] = buffer[i].word[j];
00446 }
00447 rr_fwrite(&sort_count,sizeof(int),1,tempfile,
00448 "temporary n-gram counts");
00449 sort_count = buffer[i].count;
00450 }
00451 }
00452 for (j=0;j<=n-1;j++) {
00453 rr_fwrite(&sort_ngram[j],sizeof(unsigned short),1,
00454 tempfile,"temporary n-gram ids");
00455 }
00456 rr_fwrite(&sort_count,sizeof(int),1,tempfile,
00457 "temporary n-gram counts");
00458 fclose(tempfile);
00459
00460
00461
00462 fclose(non_unk_fp);
00463
00464 pc_message(verbosity,2,"Merging temporary files...\n");
00465
00466 merge_tempfiles(1,
00467 number_of_tempfiles,
00468 temp_file_root,
00469 temp_file_ext,
00470 max_files,
00471 tempfiles_directory,
00472 stdout,
00473 write_ascii,
00474 fof_size);
00475 }
00476
00477 else {
00478
00479
00480
00481 fclose(non_unk_fp);
00482
00483 merge_tempfiles(1,
00484 1,
00485 temp_file_root,
00486 temp_file_ext,
00487 max_files,
00488 tempfiles_directory,
00489 stdout,
00490 write_ascii,
00491 fof_size);
00492 }
00493
00494 pc_message(verbosity,0,"wngram2idngram : Done.\n");
00495
00496 exit(0);
00497
00498 }
00499