idngram.h Source File

00001 /*=====================================================================
00002                 =======   COPYRIGHT NOTICE   =======
00003 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00004 Ronald Rosenfeld and Philip Clarkson.
00005 
00006 All rights reserved.
00007 
00008 This software is made available for research purposes only.  It may be
00009 redistributed freely for this purpose, in full or in part, provided
00010 that this entire copyright notice is included on any copies of this
00011 software and applications and derivations thereof.
00012 
00013 This software is provided on an "as is" basis, without warranty of any
00014 kind, either expressed or implied, as to any matter including, but not
00015 limited to warranty of fitness of purpose, or merchantability, or
00016 results obtained from use of this software.
00017 ======================================================================*/
00018 
00019 
00024 struct node {
00025   char *word;
00026   unsigned short ind;
00027   struct node *next;
00028 };
00029 
00030 struct hash_table {
00031 
00032   int size;
00033   struct node **chain;
00034 
00035 };
00036 
00037 
00038 int n; /* Declare it globally, so doesn't need to be passed
00039                      as a parameter to compare_ngrams. which might
00040                      cause qsort to choke */
00041 
00042 
00043 int compare_ngrams(const void *ngram1,
00044                    const void *ngram2) {
00045 
00046   int i;
00047 
00048   unsigned short *ngram_pointer1;
00049   unsigned short *ngram_pointer2;
00050 
00051   ngram_pointer1 = (unsigned short *) ngram1;
00052   ngram_pointer2 = (unsigned short *) ngram2;
00053  
00054 
00055   for (i=0;i<=n-1;i++) {
00056     if (ngram_pointer1[i]<ngram_pointer2[i]) {
00057       return(-1);
00058     }
00059     else {
00060       if (ngram_pointer1[i]>ngram_pointer2[i]) {
00061         return(1);
00062       }
00063     }
00064   }
00065 
00066   return(0);
00067 
00068 }
00069 
00070 
00071 
00072 int get_word( FILE *fp , char *word ) {
00073 
00074   /* read word from stream, checking for read errors and EOF */
00075 
00076   int nread;
00077   int rt_val;
00078 
00079   rt_val = 0;
00080 
00081   nread = fscanf( fp,MAX_WORD_LENGTH_FORMAT, word );
00082 
00083   if ( nread == 1 ) {
00084     rt_val = 1;
00085   }
00086   else {
00087     if ( nread != EOF ) {
00088       quit(-1, "Error reading file" );
00089     }
00090   }
00091   return(rt_val);
00092 }
00093 
00094 
00095 void merge_tempfiles (int start_file, 
00096                       int end_file, 
00097                       char *temp_file_root,
00098                       char *temp_file_ext,
00099                       int max_files,
00100                       char *tempfiles_directory, 
00101                       FILE *outfile,
00102                       flag write_ascii,
00103                       int fof_size) {
00104 
00105   FILE *new_temp_file;
00106   char temp_string[1000];
00107   char *new_temp_filename;
00108   
00109   FILE **temp_file;
00110   char **temp_filename;
00111   unsigned short **current_ngram;
00112   int *current_ngram_count;
00113   flag *finished;
00114   flag all_finished;
00115   unsigned short *smallest_ngram;
00116   int temp_count;
00117   int i,j,t;
00118 
00119   flag first_ngram;
00120   int **fof_array;
00121   int *num_kgrams;
00122   unsigned short *previous_ngram;
00123   int *ng_count;
00124   int pos_of_novelty;
00125   
00126   pos_of_novelty = n; /* Simply for warning-free compilation */
00127   num_kgrams = (int *) rr_calloc(n-1,sizeof(int));
00128   ng_count = (int *) rr_calloc(n-1,sizeof(int));
00129   first_ngram = 1;
00130   
00131   previous_ngram = (unsigned short *) rr_calloc(n,sizeof(unsigned short));
00132   temp_file = (FILE **) rr_malloc(sizeof(FILE *) * (end_file-start_file+1));
00133   temp_filename = (char **) rr_malloc(sizeof(char *) * 
00134                                       (end_file-start_file+1));
00135   current_ngram = (unsigned short **) rr_malloc(sizeof(unsigned short *) * 
00136                                                 (end_file-start_file+1));
00137 
00138   for (i=0;i<=end_file-start_file;i++) {
00139     current_ngram[i] = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
00140   }
00141 
00142   current_ngram_count = (int *) rr_malloc(sizeof(int)*(end_file-start_file+1));
00143   finished = (flag *) rr_malloc(sizeof(flag)*(end_file-start_file+1));
00144   smallest_ngram = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
00145   fof_array = (int **) rr_malloc(sizeof(int *)*(n-1));
00146   for (i=0;i<=n-2;i++) {
00147     fof_array[i] = (int *) rr_calloc(fof_size+1,sizeof(int));
00148   }
00149 
00150 
00151   if (end_file-start_file+1 > max_files) {
00152     sprintf(temp_string,"%s%s%hu%s",tempfiles_directory,temp_file_root,
00153             end_file+1,temp_file_ext);
00154     new_temp_filename = salloc(temp_string);
00155     new_temp_file = rr_oopen(new_temp_filename);
00156     merge_tempfiles(start_file,start_file+max_files-1,
00157                     temp_file_root,temp_file_ext,max_files,
00158                     tempfiles_directory,new_temp_file,write_ascii,0);
00159     merge_tempfiles(start_file+max_files,end_file+1,
00160                     temp_file_root,temp_file_ext,max_files,
00161                     tempfiles_directory,outfile,write_ascii,0);
00162   }
00163   
00164   else {
00165 
00166     /* Open all the temp files for reading */
00167     for (i=0;i<=end_file-start_file;i++) {
00168       sprintf(temp_string,"%s%s%hu%s",tempfiles_directory,temp_file_root,
00169               i+start_file,temp_file_ext);
00170       temp_filename[i] = salloc(temp_string);
00171       temp_file[i] = rr_iopen(temp_filename[i]);
00172     }
00173     
00174     /* Now go through the files simultaneously, and write out the appropriate
00175        ngram counts to the output file. */
00176 
00177     for (i=end_file-start_file;i>=0;i--) {
00178       finished[i] = 0;
00179       if (!rr_feof(temp_file[i])) {
00180         for (j=0;j<=n-1;j++) {
00181           rr_fread(&current_ngram[i][j], sizeof(unsigned short),1,
00182                    temp_file[i],"temporary n-gram ids",0);
00183         }    
00184         rr_fread(&current_ngram_count[i], sizeof(int),1,
00185                  temp_file[i],"temporary n-gram counts",0);
00186       }
00187     }
00188     
00189     all_finished = 0;
00190 
00191     while (!all_finished) {
00192  
00193       /* Find the smallest current ngram */
00194 
00195       for (i=0;i<=n-1;i++) {
00196         smallest_ngram[i] = MAX_VOCAB_SIZE;
00197       }
00198 
00199       for (i=0;i<=end_file-start_file;i++) {
00200         if (!finished[i]) {
00201           if (compare_ngrams(smallest_ngram,current_ngram[i]) > 0) {
00202             for (j=0;j<=n-1;j++) {
00203               smallest_ngram[j] = current_ngram[i][j];
00204             }
00205           }
00206         }
00207       }
00208 
00209       for (i=0;i<=n-1;i++) {
00210         if (smallest_ngram[i] > MAX_VOCAB_SIZE) {
00211           quit(-1,"Error : Temporary files corrupted, invalid n-gram found.\n");
00212         }
00213       }
00214           
00215       /* For each of the files that are currently holding this ngram,
00216          add its count to the temporary count, and read in a new ngram
00217          from the files. */
00218 
00219       temp_count = 0;
00220 
00221       for (i=0;i<=end_file-start_file;i++) {
00222         if (!finished[i]) {
00223           if (compare_ngrams(smallest_ngram,current_ngram[i]) == 0) {
00224             temp_count = temp_count + current_ngram_count[i];
00225             if (!rr_feof(temp_file[i])) {
00226               for (j=0;j<=n-1;j++) {
00227                 rr_fread(&current_ngram[i][j],sizeof(unsigned short),1,
00228                          temp_file[i],"temporary n-gram ids",0);
00229               }
00230               rr_fread(&current_ngram_count[i],sizeof(int),1,
00231                        temp_file[i],"temporary n-gram count",0);
00232             }
00233             else {
00234               finished[i] = 1;
00235               all_finished = 1;
00236               for (j=0;j<=end_file-start_file;j++) {
00237                 if (!finished[j]) {
00238                   all_finished = 0;
00239                 }
00240               }
00241             }
00242           }
00243         }
00244       }
00245       
00246       if (write_ascii) {
00247         for (i=0;i<=n-1;i++) {
00248           if (fprintf(outfile,"%hu ",smallest_ngram[i]) < 0) {
00249             quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
00250           }
00251         }
00252         if (fprintf(outfile,"%d\n",temp_count) < 0)  {
00253           quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
00254         }
00255       }
00256       else {
00257         for (i=0;i<=n-1;i++) {
00258           rr_fwrite(&smallest_ngram[i],sizeof(unsigned short),1,
00259                     outfile,"n-gram ids");
00260         }
00261         rr_fwrite(&temp_count,sizeof(int),1,outfile,"n-gram counts");
00262         
00263            
00264       }
00265 
00266       if (fof_size > 0 && n>1) { /* Add stuff to fof arrays */
00267         
00268         /* Code from idngram2stats */
00269         
00270 
00271         pos_of_novelty = n;
00272           
00273         for (i=0;i<=n-1;i++) {
00274           if (smallest_ngram[i] > previous_ngram[i]) {
00275             pos_of_novelty = i;
00276             i=n;
00277           }
00278         }
00279           
00280         /* Add new N-gram */
00281           
00282         num_kgrams[n-2]++;
00283         if (temp_count <= fof_size) {
00284           fof_array[n-2][temp_count]++;
00285         }
00286 
00287         if (!first_ngram) {
00288           for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
00289             num_kgrams[i-1]++;
00290             if (ng_count[i-1] <= fof_size) {
00291               fof_array[i-1][ng_count[i-1]]++;
00292             }
00293             ng_count[i-1] = temp_count;
00294           }
00295         }
00296         else {
00297           for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
00298             ng_count[i-1] = temp_count;
00299           }
00300           first_ngram = 0;
00301         }
00302           
00303         for (i=0;i<=pos_of_novelty-2;i++) {
00304           ng_count[i] += temp_count;
00305         }
00306         for (i=0;i<=n-1;i++) {
00307           previous_ngram[i]=smallest_ngram[i];
00308         }
00309          
00310       }
00311     }
00312     
00313     for (i=0;i<=end_file-start_file;i++) {
00314       fclose(temp_file[i]);
00315       remove(temp_filename[i]); 
00316     }
00317     
00318   }    
00319 
00320 
00321 
00322   if (fof_size > 0 && n>1) { /* Display fof arrays */
00323 
00324     /* Process last ngram */
00325 
00326     for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
00327       num_kgrams[i-1]++;
00328       if (ng_count[i-1] <= fof_size) {
00329         fof_array[i-1][ng_count[i-1]]++;
00330       }
00331       ng_count[i-1] = temp_count;
00332     }
00333     
00334     for (i=0;i<=pos_of_novelty-2;i++) {
00335       ng_count[i] += temp_count;
00336     }
00337 
00338     for (i=0;i<=n-2;i++) {
00339       fprintf(stderr,"\n%d-grams occurring:\tN times\t\t> N times\tSug. -spec_num value\n",i+2);
00340       fprintf(stderr,"%7d\t\t\t\t\t\t%7d\t\t%7d\n",0,num_kgrams[i],((int)(num_kgrams[i]*1.01))+10);
00341       t = num_kgrams[i];
00342       for (j=1;j<=fof_size;j++) {
00343         t -= fof_array[i][j];
00344         fprintf(stderr,"%7d\t\t\t\t%7d\t\t%7d\t\t%7d\n",j,
00345                 fof_array[i][j],t,((int)(t*1.01))+10);
00346       }
00347     }
00348 
00349   }
00350 
00351 }
00352 
00353 
00354 /* Hashing functions, by Gary Cook (gdc@eng.cam.ac.uk).  Could use the
00355    sih functions used in idngrma2lm, but these are much faster. */
00356 
00357 /* return the nearest prime not smaller than 'num' */
00358 int nearest_prime(int num)
00359 {
00360   int div;
00361   int num_has_divisor = 1;
00362   
00363   if ( num / 2 * 2 == num ) num++; 
00364   for (; num_has_divisor; num += 2 ) {
00365      num_has_divisor=0;
00366      for ( div = 3; div <= num / 3; div++ ) {
00367         if ( ( num / div) * div == num ) {
00368            num_has_divisor = 1;
00369            break;
00370         }
00371      }
00372   }
00373   num -= 2;
00374   return( num );
00375 }
00376 
00377 /* generate a hash table address from a variable length character */
00378 /* string - from R. Sedgewick, "Algorithms in C++". */
00379 int hash( char *key, int M )
00380 {
00381   unsigned int h; 
00382   char *t = key;
00383 
00384   for( h = 0; *t; t++ ) {
00385     h = ( 256 * h + *t ) % M;
00386   }
00387   return h;
00388 }
00389 
00390 /* create a new node, and sets the index to ind */
00391 struct node *new_node( char *key ,unsigned short ind)
00392 {
00393   struct node *x;
00394 
00395   x = (struct node *) rr_malloc( sizeof( struct node ) );
00396   x->word = (char *) rr_malloc( (strlen( key ) + 1) * sizeof( char ) );
00397   strcpy( x->word, key );
00398   x->ind = ind;
00399   return x;
00400 }
00401 
00402 /* create hash table */
00403 void new_hashtable( struct hash_table *table, int M )
00404 {
00405   int i;
00406   
00407   table->size = M;
00408   table->chain = (struct node **) rr_malloc( M * sizeof( struct node *) );
00409   for( i = 0; i < M; i++ ) {
00410     table->chain[i] = new_node( "HEAD_NODE" , 0);
00411     table->chain[i]->next = (struct node *) NULL;
00412   }
00413 }
00414 
00415 /* update linked list */
00416 int update_chain( struct node *t, char *key ,unsigned short ind)
00417 {
00418   struct node *x;
00419 
00420   /* Move to end of list */ 
00421 
00422   while( t->next != NULL ) {
00423     t = t->next;
00424   }
00425 
00426   /* add node at end */
00427   x = new_node( key,ind );
00428   x->next = (struct node *) NULL;
00429   t->next = x;
00430   return 0;
00431 }
00432 
00433 void add_to_hashtable( struct hash_table *table,
00434                        unsigned long position,
00435                        char *vocab_item,
00436                        unsigned short ind) {
00437 
00438   update_chain( table->chain[position], vocab_item,ind );
00439 }
00440 
00441 unsigned short index2(struct hash_table *vocab,
00442                       char *word) {
00443   
00444   unsigned long chain;
00445   struct node *chain_pos;
00446 
00447   chain = hash( word, vocab->size );
00448   if ( chain < 0 || chain >= vocab->size ) {
00449     fprintf( stderr, "WARNING : invalid hash address\n" );
00450     fprintf( stderr, "%s ignored\n", word );
00451     return(0);
00452   }
00453 
00454   chain_pos = vocab->chain[chain];
00455   while (chain_pos->next != NULL) {
00456     if (strcmp(word,chain_pos->next->word) ) {
00457       chain_pos = chain_pos->next;
00458     }
00459     else {
00460       return (chain_pos->next->ind);
00461     }
00462   }
00463   return (0);
00464 
00465 }
00466