00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00024 struct node {
00025 char *word;
00026 unsigned short ind;
00027 struct node *next;
00028 };
00029
00030 struct hash_table {
00031
00032 int size;
00033 struct node **chain;
00034
00035 };
00036
00037
00038 int n;
00039
00040
00041
00042
00043 int compare_ngrams(const void *ngram1,
00044 const void *ngram2) {
00045
00046 int i;
00047
00048 unsigned short *ngram_pointer1;
00049 unsigned short *ngram_pointer2;
00050
00051 ngram_pointer1 = (unsigned short *) ngram1;
00052 ngram_pointer2 = (unsigned short *) ngram2;
00053
00054
00055 for (i=0;i<=n-1;i++) {
00056 if (ngram_pointer1[i]<ngram_pointer2[i]) {
00057 return(-1);
00058 }
00059 else {
00060 if (ngram_pointer1[i]>ngram_pointer2[i]) {
00061 return(1);
00062 }
00063 }
00064 }
00065
00066 return(0);
00067
00068 }
00069
00070
00071
00072 int get_word( FILE *fp , char *word ) {
00073
00074
00075
00076 int nread;
00077 int rt_val;
00078
00079 rt_val = 0;
00080
00081 nread = fscanf( fp,MAX_WORD_LENGTH_FORMAT, word );
00082
00083 if ( nread == 1 ) {
00084 rt_val = 1;
00085 }
00086 else {
00087 if ( nread != EOF ) {
00088 quit(-1, "Error reading file" );
00089 }
00090 }
00091 return(rt_val);
00092 }
00093
00094
00095 void merge_tempfiles (int start_file,
00096 int end_file,
00097 char *temp_file_root,
00098 char *temp_file_ext,
00099 int max_files,
00100 char *tempfiles_directory,
00101 FILE *outfile,
00102 flag write_ascii,
00103 int fof_size) {
00104
00105 FILE *new_temp_file;
00106 char temp_string[1000];
00107 char *new_temp_filename;
00108
00109 FILE **temp_file;
00110 char **temp_filename;
00111 unsigned short **current_ngram;
00112 int *current_ngram_count;
00113 flag *finished;
00114 flag all_finished;
00115 unsigned short *smallest_ngram;
00116 int temp_count;
00117 int i,j,t;
00118
00119 flag first_ngram;
00120 int **fof_array;
00121 int *num_kgrams;
00122 unsigned short *previous_ngram;
00123 int *ng_count;
00124 int pos_of_novelty;
00125
00126 pos_of_novelty = n;
00127 num_kgrams = (int *) rr_calloc(n-1,sizeof(int));
00128 ng_count = (int *) rr_calloc(n-1,sizeof(int));
00129 first_ngram = 1;
00130
00131 previous_ngram = (unsigned short *) rr_calloc(n,sizeof(unsigned short));
00132 temp_file = (FILE **) rr_malloc(sizeof(FILE *) * (end_file-start_file+1));
00133 temp_filename = (char **) rr_malloc(sizeof(char *) *
00134 (end_file-start_file+1));
00135 current_ngram = (unsigned short **) rr_malloc(sizeof(unsigned short *) *
00136 (end_file-start_file+1));
00137
00138 for (i=0;i<=end_file-start_file;i++) {
00139 current_ngram[i] = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
00140 }
00141
00142 current_ngram_count = (int *) rr_malloc(sizeof(int)*(end_file-start_file+1));
00143 finished = (flag *) rr_malloc(sizeof(flag)*(end_file-start_file+1));
00144 smallest_ngram = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
00145 fof_array = (int **) rr_malloc(sizeof(int *)*(n-1));
00146 for (i=0;i<=n-2;i++) {
00147 fof_array[i] = (int *) rr_calloc(fof_size+1,sizeof(int));
00148 }
00149
00150
00151 if (end_file-start_file+1 > max_files) {
00152 sprintf(temp_string,"%s%s%hu%s",tempfiles_directory,temp_file_root,
00153 end_file+1,temp_file_ext);
00154 new_temp_filename = salloc(temp_string);
00155 new_temp_file = rr_oopen(new_temp_filename);
00156 merge_tempfiles(start_file,start_file+max_files-1,
00157 temp_file_root,temp_file_ext,max_files,
00158 tempfiles_directory,new_temp_file,write_ascii,0);
00159 merge_tempfiles(start_file+max_files,end_file+1,
00160 temp_file_root,temp_file_ext,max_files,
00161 tempfiles_directory,outfile,write_ascii,0);
00162 }
00163
00164 else {
00165
00166
00167 for (i=0;i<=end_file-start_file;i++) {
00168 sprintf(temp_string,"%s%s%hu%s",tempfiles_directory,temp_file_root,
00169 i+start_file,temp_file_ext);
00170 temp_filename[i] = salloc(temp_string);
00171 temp_file[i] = rr_iopen(temp_filename[i]);
00172 }
00173
00174
00175
00176
00177 for (i=end_file-start_file;i>=0;i--) {
00178 finished[i] = 0;
00179 if (!rr_feof(temp_file[i])) {
00180 for (j=0;j<=n-1;j++) {
00181 rr_fread(¤t_ngram[i][j], sizeof(unsigned short),1,
00182 temp_file[i],"temporary n-gram ids",0);
00183 }
00184 rr_fread(¤t_ngram_count[i], sizeof(int),1,
00185 temp_file[i],"temporary n-gram counts",0);
00186 }
00187 }
00188
00189 all_finished = 0;
00190
00191 while (!all_finished) {
00192
00193
00194
00195 for (i=0;i<=n-1;i++) {
00196 smallest_ngram[i] = MAX_VOCAB_SIZE;
00197 }
00198
00199 for (i=0;i<=end_file-start_file;i++) {
00200 if (!finished[i]) {
00201 if (compare_ngrams(smallest_ngram,current_ngram[i]) > 0) {
00202 for (j=0;j<=n-1;j++) {
00203 smallest_ngram[j] = current_ngram[i][j];
00204 }
00205 }
00206 }
00207 }
00208
00209 for (i=0;i<=n-1;i++) {
00210 if (smallest_ngram[i] > MAX_VOCAB_SIZE) {
00211 quit(-1,"Error : Temporary files corrupted, invalid n-gram found.\n");
00212 }
00213 }
00214
00215
00216
00217
00218
00219 temp_count = 0;
00220
00221 for (i=0;i<=end_file-start_file;i++) {
00222 if (!finished[i]) {
00223 if (compare_ngrams(smallest_ngram,current_ngram[i]) == 0) {
00224 temp_count = temp_count + current_ngram_count[i];
00225 if (!rr_feof(temp_file[i])) {
00226 for (j=0;j<=n-1;j++) {
00227 rr_fread(¤t_ngram[i][j],sizeof(unsigned short),1,
00228 temp_file[i],"temporary n-gram ids",0);
00229 }
00230 rr_fread(¤t_ngram_count[i],sizeof(int),1,
00231 temp_file[i],"temporary n-gram count",0);
00232 }
00233 else {
00234 finished[i] = 1;
00235 all_finished = 1;
00236 for (j=0;j<=end_file-start_file;j++) {
00237 if (!finished[j]) {
00238 all_finished = 0;
00239 }
00240 }
00241 }
00242 }
00243 }
00244 }
00245
00246 if (write_ascii) {
00247 for (i=0;i<=n-1;i++) {
00248 if (fprintf(outfile,"%hu ",smallest_ngram[i]) < 0) {
00249 quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
00250 }
00251 }
00252 if (fprintf(outfile,"%d\n",temp_count) < 0) {
00253 quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
00254 }
00255 }
00256 else {
00257 for (i=0;i<=n-1;i++) {
00258 rr_fwrite(&smallest_ngram[i],sizeof(unsigned short),1,
00259 outfile,"n-gram ids");
00260 }
00261 rr_fwrite(&temp_count,sizeof(int),1,outfile,"n-gram counts");
00262
00263
00264 }
00265
00266 if (fof_size > 0 && n>1) {
00267
00268
00269
00270
00271 pos_of_novelty = n;
00272
00273 for (i=0;i<=n-1;i++) {
00274 if (smallest_ngram[i] > previous_ngram[i]) {
00275 pos_of_novelty = i;
00276 i=n;
00277 }
00278 }
00279
00280
00281
00282 num_kgrams[n-2]++;
00283 if (temp_count <= fof_size) {
00284 fof_array[n-2][temp_count]++;
00285 }
00286
00287 if (!first_ngram) {
00288 for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
00289 num_kgrams[i-1]++;
00290 if (ng_count[i-1] <= fof_size) {
00291 fof_array[i-1][ng_count[i-1]]++;
00292 }
00293 ng_count[i-1] = temp_count;
00294 }
00295 }
00296 else {
00297 for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
00298 ng_count[i-1] = temp_count;
00299 }
00300 first_ngram = 0;
00301 }
00302
00303 for (i=0;i<=pos_of_novelty-2;i++) {
00304 ng_count[i] += temp_count;
00305 }
00306 for (i=0;i<=n-1;i++) {
00307 previous_ngram[i]=smallest_ngram[i];
00308 }
00309
00310 }
00311 }
00312
00313 for (i=0;i<=end_file-start_file;i++) {
00314 fclose(temp_file[i]);
00315 remove(temp_filename[i]);
00316 }
00317
00318 }
00319
00320
00321
00322 if (fof_size > 0 && n>1) {
00323
00324
00325
00326 for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
00327 num_kgrams[i-1]++;
00328 if (ng_count[i-1] <= fof_size) {
00329 fof_array[i-1][ng_count[i-1]]++;
00330 }
00331 ng_count[i-1] = temp_count;
00332 }
00333
00334 for (i=0;i<=pos_of_novelty-2;i++) {
00335 ng_count[i] += temp_count;
00336 }
00337
00338 for (i=0;i<=n-2;i++) {
00339 fprintf(stderr,"\n%d-grams occurring:\tN times\t\t> N times\tSug. -spec_num value\n",i+2);
00340 fprintf(stderr,"%7d\t\t\t\t\t\t%7d\t\t%7d\n",0,num_kgrams[i],((int)(num_kgrams[i]*1.01))+10);
00341 t = num_kgrams[i];
00342 for (j=1;j<=fof_size;j++) {
00343 t -= fof_array[i][j];
00344 fprintf(stderr,"%7d\t\t\t\t%7d\t\t%7d\t\t%7d\n",j,
00345 fof_array[i][j],t,((int)(t*1.01))+10);
00346 }
00347 }
00348
00349 }
00350
00351 }
00352
00353
00354
00355
00356
00357
00358 int nearest_prime(int num)
00359 {
00360 int div;
00361 int num_has_divisor = 1;
00362
00363 if ( num / 2 * 2 == num ) num++;
00364 for (; num_has_divisor; num += 2 ) {
00365 num_has_divisor=0;
00366 for ( div = 3; div <= num / 3; div++ ) {
00367 if ( ( num / div) * div == num ) {
00368 num_has_divisor = 1;
00369 break;
00370 }
00371 }
00372 }
00373 num -= 2;
00374 return( num );
00375 }
00376
00377
00378
00379 int hash( char *key, int M )
00380 {
00381 unsigned int h;
00382 char *t = key;
00383
00384 for( h = 0; *t; t++ ) {
00385 h = ( 256 * h + *t ) % M;
00386 }
00387 return h;
00388 }
00389
00390
00391 struct node *new_node( char *key ,unsigned short ind)
00392 {
00393 struct node *x;
00394
00395 x = (struct node *) rr_malloc( sizeof( struct node ) );
00396 x->word = (char *) rr_malloc( (strlen( key ) + 1) * sizeof( char ) );
00397 strcpy( x->word, key );
00398 x->ind = ind;
00399 return x;
00400 }
00401
00402
00403 void new_hashtable( struct hash_table *table, int M )
00404 {
00405 int i;
00406
00407 table->size = M;
00408 table->chain = (struct node **) rr_malloc( M * sizeof( struct node *) );
00409 for( i = 0; i < M; i++ ) {
00410 table->chain[i] = new_node( "HEAD_NODE" , 0);
00411 table->chain[i]->next = (struct node *) NULL;
00412 }
00413 }
00414
00415
00416 int update_chain( struct node *t, char *key ,unsigned short ind)
00417 {
00418 struct node *x;
00419
00420
00421
00422 while( t->next != NULL ) {
00423 t = t->next;
00424 }
00425
00426
00427 x = new_node( key,ind );
00428 x->next = (struct node *) NULL;
00429 t->next = x;
00430 return 0;
00431 }
00432
00433 void add_to_hashtable( struct hash_table *table,
00434 unsigned long position,
00435 char *vocab_item,
00436 unsigned short ind) {
00437
00438 update_chain( table->chain[position], vocab_item,ind );
00439 }
00440
00441 unsigned short index2(struct hash_table *vocab,
00442 char *word) {
00443
00444 unsigned long chain;
00445 struct node *chain_pos;
00446
00447 chain = hash( word, vocab->size );
00448 if ( chain < 0 || chain >= vocab->size ) {
00449 fprintf( stderr, "WARNING : invalid hash address\n" );
00450 fprintf( stderr, "%s ignored\n", word );
00451 return(0);
00452 }
00453
00454 chain_pos = vocab->chain[chain];
00455 while (chain_pos->next != NULL) {
00456 if (strcmp(word,chain_pos->next->word) ) {
00457 chain_pos = chain_pos->next;
00458 }
00459 else {
00460 return (chain_pos->next->ind);
00461 }
00462 }
00463 return (0);
00464
00465 }
00466