Main Page   Compound List   File List   Compound Members   File Members  

wngram2idngram.c

Go to the documentation of this file.
00001 
00002 /*=====================================================================
00003                 =======   COPYRIGHT NOTICE   =======
00004 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00005 Ronald Rosenfeld and Philip Clarkson.
00006 
00007 All rights reserved.
00008 
00009 This software is made available for research purposes only.  It may be
00010 redistributed freely for this purpose, in full or in part, provided
00011 that this entire copyright notice is included on any copies of this
00012 software and applications and derivations thereof.
00013 
00014 This software is provided on an "as is" basis, without warranty of any
00015 kind, either expressed or implied, as to any matter including, but not
00016 limited to warranty of fitness of purpose, or merchantability, or
00017 results obtained from use of this software.
00018 ======================================================================*/
00019 
00025 #define DEFAULT_HASH_SIZE 200000  /* default for the -hash option (vocabulary hash table size) */
00026 #define DEFAULT_MAX_FILES 20  /* default for the -files option, passed on to merge_tempfiles() */
00027 #define MAX_N 20  /* not referenced in this file; presumably an upper bound on n - TODO confirm */
00028 #define TEMP_FILE_ROOT "wngram2idngram.temp."  /* leading part of every temporary file name */
00029 
00030 #include <stdio.h>
00031 #include <stdlib.h>
00032 #include <string.h>
00033 #include <unistd.h>
00034 #include <sys/utsname.h>
00035 #include <sys/types.h>
00036 #include "toolkit.h"
00037 #include "rr_libs/general.h"
00038 #include "pc_libs/pc_general.h"
00039 #include "idngram.h"
00040 
00041 typedef struct {  /* one buffered n-gram: its word ids plus its count */
00042   unsigned short *word;  /* word ids of the n-gram; main() allocates n entries per record */
00043   int count;  /* count read from the input for this n-gram */
00044 } ngram_rec;
00045 
00046 int compare_ngrams2(const void *ngram1,
00047                     const void *ngram2) {
00048 
00049   int temp;
00050   int i;
00051   ngram_rec *r1;
00052   ngram_rec *r2;
00053 
00054   r1 = (ngram_rec *) ngram1;
00055   r2 = (ngram_rec *) ngram2;
00056   
00057   temp = 0;
00058 
00059   for (i=0;i<=n-1;i++) {
00060     if ((r1->word[i]) < (r2->word[i])) {
00061       temp = -1;
00062       i = n;
00063     }
00064     else {
00065       if ((r1->word[i]) > (r2->word[i])) {
00066         temp = 1;
00067         i = n;
00068       }
00069     }
00070   }
00071 
00072   return(temp);
00073 
00074 }
00075 
00076 void main(int argc, char *argv[]) {
00077 
00078   int verbosity;
00079   int vocab_size;
00080   FILE *vocab_file;
00081   int buffer_size;
00082   flag write_ascii;
00083   int max_files;
00084   int number_of_tempfiles;
00085   char *vocab_filename;
00086   char tempfiles_directory[1000];
00087   char temp_word[MAX_WORD_LENGTH];
00088   char temp_word2[MAX_WORD_LENGTH];
00089   char temp_word3[MAX_WORD_LENGTH];
00090   flag contains_unks;
00091   int position_in_buffer;
00092   FILE *tempfile;
00093   FILE *non_unk_fp;
00094   ngram_rec *buffer;
00095   flag same_ngram;
00096   int i;
00097   int j;
00098   int nlines;
00099   int fof_size;
00100   int size_of_rec;
00101   char *temp_file_root;
00102   char *temp_file_ext;
00103   char *host_name;
00104   struct utsname uname_info;
00105   int proc_id;
00106 
00107   /* Vocab hash table things */
00108 
00109   struct hash_table vocabulary;
00110   unsigned long hash_size;
00111   unsigned long M;
00112 
00113   unsigned short *current_ngram;
00114   int current_count;
00115   unsigned short *sort_ngram;
00116   int sort_count;
00117   
00118   /* Process command line */
00119 
00120   report_version(&argc,argv);
00121   
00122   if (pc_flagarg( &argc, argv,"-help") || argc==1) {
00123     fprintf(stderr,"wngram2idngram - Convert a word n-gram file to an id n-gram file.\n");
00124     fprintf(stderr,"Usage : wngram2idngram -vocab .vocab\n");
00125     fprintf(stderr,"                     [ -buffer %d ] \n",STD_MEM);
00126     fprintf(stderr,"                     [ -hash %d ]\n",DEFAULT_HASH_SIZE);
00127     fprintf(stderr,"                     [ -temp %s ]\n",DEFAULT_TEMP);
00128     fprintf(stderr,"                     [ -files %d ]\n",DEFAULT_MAX_FILES);
00129     fprintf(stderr,"                     [ -gzip | -compress ]\n");
00130     fprintf(stderr,"                     [ -verbosity 2 ]\n");
00131     fprintf(stderr,"                     [ -n 3 ]\n");
00132     fprintf(stderr,"                     [ -write_ascii ]\n");
00133     fprintf(stderr,"                     [ -fof_size 10 ]\n");
00134     fprintf(stderr,"                     < .wngram > .idngram\n");
00135     exit(1);
00136   }
00137 
00138   n = pc_intarg( &argc, argv, "-n",DEFAULT_N);
00139 
00140   hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
00141   buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);
00142 
00143   write_ascii = pc_flagarg(&argc,argv,"-write_ascii");
00144 
00145   verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
00146 
00147   max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);
00148   fof_size = pc_intarg(&argc,argv,"-fof_size",10);
00149   vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
00150   
00151   if (!strcmp("",vocab_filename)) {
00152     quit(-1,"Error : Must specify a vocabulary file.\n");
00153   }
00154     
00155   strcpy(tempfiles_directory,pc_stringarg( &argc, argv, "-temp",DEFAULT_TEMP));
00156 
00157 
00158   if (pc_flagarg(&argc,argv,"-compress")) {
00159     temp_file_ext = salloc(".Z");
00160   }
00161   else {
00162     if (pc_flagarg(&argc,argv,"-gzip")) {
00163       temp_file_ext = salloc(".gz");
00164     }
00165     else {
00166       temp_file_ext = salloc("");
00167     }
00168   }
00169 
00170   uname(&uname_info);
00171 
00172   host_name = salloc(uname_info.nodename);
00173 
00174   proc_id = getpid();
00175 
00176   sprintf(temp_word,"%s%s.%d.",TEMP_FILE_ROOT,host_name,proc_id);
00177 
00178   temp_file_root = salloc(temp_word);
00179 
00180 
00181   pc_report_unk_args(&argc,argv,verbosity);
00182   
00183   /* If the last charactor in the directory name isn't a / then add one. */
00184   
00185   if (tempfiles_directory[strlen(tempfiles_directory)-1] != '/') {
00186     strcat(tempfiles_directory,"/");
00187   }
00188   
00189   pc_message(verbosity,2,"Vocab           : %s\n",vocab_filename);
00190   pc_message(verbosity,2,"Buffer size     : %d\n",buffer_size);
00191   pc_message(verbosity,2,"Hash table size : %d\n",hash_size);
00192   pc_message(verbosity,2,"Temp directory  : %s\n",tempfiles_directory);
00193   pc_message(verbosity,2,"Max open files  : %d\n",max_files);
00194   pc_message(verbosity,2,"n               : %d\n",n);
00195   pc_message(verbosity,2,"FOF size               : %d\n",fof_size);  
00196 
00197   size_of_rec = (sizeof(unsigned short) * n) + 16 - 
00198     ((n*sizeof(unsigned short)) % 16);
00199 
00200   buffer_size *= (1000000/((sizeof(ngram_rec) + size_of_rec)));
00201   fprintf(stderr,"buffer size = %d\n",buffer_size);
00202 
00203   /* Allocate memory for hash table */
00204 
00205   fprintf(stderr,"Initialising hash table...\n");
00206 
00207   M = nearest_prime(hash_size);
00208 
00209   new_hashtable(&vocabulary,M);
00210 
00211   /* Read in the vocabulary */
00212 
00213   vocab_size = 0;
00214 
00215   vocab_file = rr_iopen(vocab_filename);
00216 
00217   pc_message(verbosity,2,"Reading vocabulary...\n");
00218 
00219   while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
00220     if (strncmp(temp_word,"##",2)==0) continue;
00221     sscanf (temp_word, "%s ",temp_word2);
00222 
00223     /* Check for vocabulary order */
00224 
00225     if (vocab_size > 0 && strcmp(temp_word2,temp_word3)<0) {
00226       quit(-1,"wngram2idngram : Error : Vocabulary is not alphabetically ordered.\n");
00227     }
00228 
00229     /* Check for repeated words in the vocabulary */
00230 
00231     if (index2(&vocabulary,temp_word2) != 0) {
00232       fprintf(stderr,"======================================================\n");
00233       fprintf(stderr,"WARNING: word %s is repeated in the vocabulary.\n",temp_word);
00234       fprintf(stderr,"=======================================================\n");
00235     }
00236 
00237     if (strncmp(temp_word,"#",1)==0) {
00238       fprintf(stderr,"\n\n===========================================================\n");
00239       fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
00240       fprintf(stderr,     ">>> %s <<<\n",temp_word);
00241       fprintf(stderr,     "         '%s' will be included in the context cues list\n",temp_word2);
00242       fprintf(stderr,     "         (comments must start with '##')\n");
00243       fprintf(stderr,"===========================================================\n\n");
00244     }
00245     vocab_size++;
00246     add_to_hashtable(&vocabulary,hash(temp_word2,M),temp_word2,vocab_size);
00247     strcpy(temp_word3,temp_word2);
00248   }
00249 
00250   if (vocab_size > MAX_VOCAB_SIZE) {
00251     quit(-1,"Error : Vocabulary size exceeds maximum.\n");
00252   }   
00253   
00254   pc_message(verbosity,2,"Allocating memory for the buffer...\n");
00255 
00256   buffer=(ngram_rec *) rr_malloc((buffer_size+1)*sizeof(ngram_rec));
00257   
00258   for (i=0;i<=buffer_size;i++) {
00259     buffer[i].word = (unsigned short *) rr_malloc(n*sizeof(unsigned short));
00260   }
00261 
00262   /* Open the "non-OOV" tempfile */
00263 
00264   sprintf(temp_word,"%s%s1%s",tempfiles_directory,temp_file_root,temp_file_ext);
00265   
00266   non_unk_fp = rr_fopen(temp_word,"w");
00267 
00268   pc_message(verbosity,2,"Writing non-OOV counts to temporary file %s\n",
00269              temp_word);
00270   number_of_tempfiles = 1;
00271 
00272   current_ngram = (unsigned short *) rr_malloc(n*sizeof(unsigned short));
00273   sort_ngram = (unsigned short *) rr_malloc(n*sizeof(unsigned short));
00274 
00275   /* Read text into buffer */
00276 
00277   nlines = 0;
00278   
00279   position_in_buffer = 0;
00280 
00281   while (!rr_feof(stdin)) {
00282     
00283     for (i=0;i<=n-1;i++) {
00284       get_word(stdin,temp_word);
00285       current_ngram[i]=index2(&vocabulary,temp_word);
00286     }
00287     if (scanf("%d",&current_count) != 1) {
00288       if (!rr_feof(stdin)) {
00289         quit(-1,"Error reading n-gram count from stdin.\n");
00290       }
00291     }
00292 
00293     if (!rr_feof(stdin)) {
00294 
00295       contains_unks = 0;
00296       for (i=0;i<=n-1;i++) {
00297         if (!current_ngram[i]) {
00298           contains_unks = 1;
00299         }
00300       }
00301 
00302       if (contains_unks) {
00303 
00304         /* Write to buffer */
00305 
00306         position_in_buffer++;
00307 
00308         if (position_in_buffer >= buffer_size) {
00309 
00310           /* Sort buffer */
00311 
00312           pc_message(verbosity,2,
00313                      "Sorting n-grams which include an OOV word...\n");
00314 
00315           qsort((void*) buffer,(size_t) position_in_buffer,
00316                 sizeof(ngram_rec),compare_ngrams2);
00317 
00318           pc_message(verbosity,2,"Done.\n");
00319 
00320           /* Write buffer to temporary file */
00321 
00322           number_of_tempfiles++;
00323           
00324           sprintf(temp_word,"%s%s%hu%s",tempfiles_directory,temp_file_root,
00325                   number_of_tempfiles,temp_file_ext);
00326           
00327           pc_message(verbosity,2,
00328                      "Writing sorted OOV-counts buffer to temporary file %s\n",
00329                      temp_word);
00330 
00331           tempfile = rr_fopen(temp_word,"w");
00332           
00333           for (i=0;i<=n-1;i++) {
00334             sort_ngram[i] = buffer[0].word[i];
00335           }
00336           sort_count = buffer[0].count;
00337 
00338           for (i=0;i<=position_in_buffer-2;i++) {
00339             
00340             same_ngram = 1;
00341             for (j=n-1;j>=0;j--) {
00342               if (buffer[i].word[j] != sort_ngram[j]) {
00343                 same_ngram = 0;
00344                 j = -1;
00345               }
00346             }
00347 
00348             if (same_ngram) {
00349               sort_count += buffer[i].count;
00350             }
00351             else {
00352               for (j=0;j<=n-1;j++) {
00353                 rr_fwrite(&sort_ngram[j],sizeof(unsigned short),1,
00354                           tempfile,"temporary n-gram ids");
00355                 sort_ngram[j] = buffer[i].word[j];
00356               }
00357               rr_fwrite(&sort_count,sizeof(int),1,tempfile,
00358                         "temporary n-gram counts");
00359               sort_count = buffer[i].count;
00360             }
00361           }         
00362           for (j=0;j<=n-1;j++) {
00363             rr_fwrite(&sort_ngram[j],sizeof(unsigned short),1,
00364                       tempfile,"temporary n-gram ids");
00365           }
00366           rr_fwrite(&sort_count,sizeof(int),1,tempfile,
00367                     "temporary n-gram counts");
00368           rr_oclose(tempfile);
00369           position_in_buffer = 1;
00370 
00371         }
00372         
00373         for (i=0;i<=n-1;i++) {
00374           buffer[position_in_buffer-1].word[i] = current_ngram[i];
00375         }
00376 
00377         buffer[position_in_buffer-1].count = current_count;
00378 
00379       }
00380 
00381       else {
00382 
00383         /* Write to temporary file */
00384 
00385         for (i=0;i<=n-1;i++) {
00386           rr_fwrite(&current_ngram[i],sizeof(unsigned short),1,
00387                     non_unk_fp,"temporary n-gram ids");
00388 
00389         }
00390         rr_fwrite(&current_count,sizeof(int),1,non_unk_fp,
00391                   "temporary n-gram counts");
00392 
00393       }
00394 
00395     }
00396 
00397   }
00398 
00399   if (position_in_buffer > 0) {
00400 
00401     /* Only do this bit if we have actually seen some OOVs */
00402 
00403     /* Sort final buffer */
00404     
00405     pc_message(verbosity,2,"Sorting final buffer...\n");
00406 
00407     qsort((void*) buffer,(size_t) position_in_buffer,
00408           sizeof(ngram_rec),compare_ngrams2);
00409     
00410     /* Write final buffer */
00411     
00412     number_of_tempfiles++;
00413   
00414     sprintf(temp_word,"%s%s%hu%s",tempfiles_directory,temp_file_root,
00415             number_of_tempfiles,temp_file_ext);
00416     
00417     pc_message(verbosity,2,
00418                "Writing sorted buffer to temporary file %s\n",
00419                temp_word);
00420 
00421     tempfile = rr_fopen(temp_word,"w");
00422     
00423     for (i=0;i<=n-1;i++) {
00424       sort_ngram[i] = buffer[0].word[i];
00425     }
00426     sort_count = buffer[0].count;
00427     
00428     for (i=1;i<=position_in_buffer-1;i++) {
00429       
00430       same_ngram = 1;
00431       for (j=n-1;j>=0;j--) {
00432         if (buffer[i].word[j] != sort_ngram[j]) {
00433           same_ngram = 0;
00434           j = -1;
00435         }
00436       }
00437       
00438       if (same_ngram) {
00439         sort_count += buffer[i].count;
00440       }
00441       else {
00442         for (j=0;j<=n-1;j++) {
00443           rr_fwrite(&sort_ngram[j],sizeof(unsigned short),1,
00444                     tempfile,"temporary n-gram ids");
00445           sort_ngram[j] = buffer[i].word[j];
00446         }
00447         rr_fwrite(&sort_count,sizeof(int),1,tempfile,
00448                   "temporary n-gram counts");
00449         sort_count = buffer[i].count;
00450       }
00451     }       
00452     for (j=0;j<=n-1;j++) {
00453       rr_fwrite(&sort_ngram[j],sizeof(unsigned short),1,
00454                 tempfile,"temporary n-gram ids");
00455     }
00456     rr_fwrite(&sort_count,sizeof(int),1,tempfile,
00457               "temporary n-gram counts");
00458     fclose(tempfile);
00459     
00460     /* Merge the temporary files, and output the result to standard output */
00461 
00462     fclose(non_unk_fp);
00463 
00464     pc_message(verbosity,2,"Merging temporary files...\n");
00465     
00466     merge_tempfiles(1,
00467                     number_of_tempfiles,
00468                     temp_file_root,
00469                     temp_file_ext,
00470                     max_files,
00471                     tempfiles_directory,
00472                     stdout,
00473                     write_ascii,
00474                     fof_size); 
00475   }
00476 
00477   else {
00478 
00479     /* Just write out the none OOV buffer to stdout */
00480 
00481     fclose(non_unk_fp);
00482 
00483     merge_tempfiles(1,
00484                     1,
00485                     temp_file_root,
00486                     temp_file_ext,
00487                     max_files,
00488                     tempfiles_directory,
00489                     stdout,
00490                     write_ascii,
00491                     fof_size); 
00492   }
00493 
00494   pc_message(verbosity,0,"wngram2idngram : Done.\n");
00495 
00496   exit(0);
00497 
00498 }
00499 

Generated on Tue Dec 21 13:54:46 2004 by doxygen1.2.18