00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00027 #include <stdio.h>
00028 #include "rr_libs/general.h"
00029 #include "rr_libs/sih.h"
00030 #include "ngram.h"
00031 #include "evallm.h"
00032 #include <string.h>
00033 #include <stdlib.h>
00034 #include "idngram2lm.h"
00035 
00036 #define BBO_FILE_VERSION 970314
00037 
00038 
00039 void load_lm(ng_t *ng,
00040              char *lm_filename) {
00041 
00042   int i;
00043 
00044   ng->bin_fp = rr_iopen(lm_filename);
00045 
00046   rr_fread(&ng->version,sizeof(int),1,ng->bin_fp,"from lm file",0);
00047 
00048   if (ng->version != BBO_FILE_VERSION) {
00049     quit(-1,"Error : Language model file %s appears to be corrupted.\n",
00050          lm_filename);
00051   }
00052 
00053   
00054 
00055   rr_fread(&ng->n,sizeof(unsigned short),1,ng->bin_fp,"n",0);
00056 
00057   rr_fread(&ng->vocab_size,sizeof(unsigned short),1,ng->bin_fp,"vocab_size",0);
00058   rr_fread(&ng->no_of_ccs,sizeof(unsigned short),1,ng->bin_fp,"no_of_ccs",0);
00059 
00060   rr_fread(&ng->vocab_type,sizeof(unsigned short),1,ng->bin_fp,"vocab_type",0);
00061 
00062   rr_fread(&ng->count_table_size,sizeof(count_ind_t),1,
00063             ng->bin_fp,"count_table_size",0);
00064   rr_fread(&ng->discounting_method,sizeof(unsigned short),1,
00065             ng->bin_fp,"discounting_method",0);
00066  
00067   rr_fread(&ng->min_alpha,sizeof(double),
00068             1,ng->bin_fp,"min_alpha",0);
00069   rr_fread(&ng->max_alpha,sizeof(double),
00070             1,ng->bin_fp,"max_alpha",0);
00071   rr_fread(&ng->out_of_range_alphas,sizeof(unsigned short),
00072             1,ng->bin_fp,"out_of_range_alphas",0);
00073 
00074   rr_fread(&ng->size_of_alpha_array,sizeof(unsigned short),
00075            1,ng->bin_fp,"size_of_alpha_array",0);
00076 
00077 
00078   
00079   rr_fread(&ng->n_unigrams,sizeof(int),1,ng->bin_fp,"n_unigrams",0);
00080   rr_fread(&ng->zeroton_fraction,sizeof(double),1,
00081             ng->bin_fp,"zeroton_fraction",0);
00082 
00083   rr_fread(&ng->oov_fraction,sizeof(double),1,
00084            ng->bin_fp,"oov_fraction",0);
00085   rr_fread(&ng->four_byte_counts,sizeof(flag),1,
00086            ng->bin_fp,"four_byte_counts",0); 
00087   rr_fread(&ng->four_byte_alphas,sizeof(flag),1,
00088            ng->bin_fp,"four_byte_alphas",0);
00089   rr_fread(&ng->first_id,sizeof(unsigned short),1,
00090            ng->bin_fp,"first_id",0);
00091 
00092   ng->vocab_ht = (sih_t *) rr_malloc(sizeof(sih_t));
00093   sih_val_read_from_file(ng->vocab_ht,ng->bin_fp,lm_filename,0);
00094   get_vocab_from_vocab_ht(ng->vocab_ht,ng->vocab_size,0,&ng->vocab);
00095   ng->vocab[0] = salloc("<UNK>");
00096 
00097   if (ng->four_byte_counts) {
00098     ng->marg_counts4 = (int *) 
00099       rr_malloc(sizeof(int)*(ng->vocab_size+1));
00100     rr_fread(ng->marg_counts4,sizeof(int),ng->vocab_size+1,
00101              ng->bin_fp,"marg_counts",0);
00102   } 
00103   else {
00104     ng->marg_counts = (count_ind_t *) 
00105       rr_malloc(sizeof(count_ind_t)*(ng->vocab_size+1));
00106     rr_fread(ng->marg_counts,sizeof(count_ind_t),ng->vocab_size+1,
00107              ng->bin_fp,"marg_counts",0);
00108   }
00109 
00110   ng->alpha_array = (double *) 
00111     rr_malloc(sizeof(double)*(ng->size_of_alpha_array));
00112   rr_fread(ng->alpha_array,sizeof(double),
00113            ng->size_of_alpha_array,ng->bin_fp,"alpha_array",0);
00114 
00115   
00116 
00117   ng->count_table = (count_t **) rr_malloc(sizeof(count_t *)*ng->n);
00118   if (!ng->four_byte_counts) {
00119     for (i=0;i<=ng->n-1;i++) {
00120       ng->count_table[i] = (count_t *) 
00121         rr_malloc(sizeof(count_t)*(ng->count_table_size+1));
00122       rr_fread(ng->count_table[i],sizeof(count_t),
00123                ng->count_table_size+1,ng->bin_fp,"count_table",0);
00124     } 
00125   }
00126 
00127   ng->ptr_table_size = (unsigned short *) 
00128     rr_malloc(sizeof(unsigned short)*ng->n);
00129   rr_fread(ng->ptr_table_size,sizeof(unsigned short),
00130            ng->n,ng->bin_fp,"ptr_table_size",0);
00131 
00132   ng->ptr_table = (ptr_tab_t **) rr_malloc(sizeof(ptr_tab_t *)*ng->n);
00133 
00134   for (i=0;i<=ng->n-1;i++) {
00135     ng->ptr_table[i] = (ptr_tab_t *) 
00136       rr_malloc(sizeof(ptr_tab_t)*ng->ptr_table_size[i]);
00137     rr_fread(ng->ptr_table[i],sizeof(ptr_tab_t),
00138              ng->ptr_table_size[i],ng->bin_fp,"ptr_table",0);
00139   }
00140 
00141   ng->uni_probs = (uni_probs_t *) 
00142     rr_malloc(sizeof(uni_probs_t)*(ng->vocab_size+1));
00143   ng->uni_log_probs = (uni_probs_t *) 
00144     rr_malloc(sizeof(uni_probs_t)*(ng->vocab_size+1));
00145   ng->context_cue = (flag *) 
00146     rr_malloc(sizeof(flag)*(ng->vocab_size+1));
00147 
00148   rr_fread(ng->uni_probs,sizeof(uni_probs_t),ng->vocab_size+1,
00149           ng->bin_fp,"uni_probs",0);
00150   rr_fread(ng->uni_log_probs,sizeof(uni_probs_t),ng->vocab_size+1,
00151           ng->bin_fp,"uni_log_probs",0);
00152   rr_fread(ng->context_cue,sizeof(flag),ng->vocab_size+1,
00153           ng->bin_fp,"context_cue",0);
00154 
00155   ng->cutoffs = (cutoff_t *) rr_malloc(sizeof(cutoff_t)*ng->n);
00156   rr_fread(ng->cutoffs,sizeof(cutoff_t),ng->n,ng->bin_fp,"cutoffs",0);
00157 
00158   switch (ng->discounting_method) {
00159   case GOOD_TURING:
00160     ng->fof_size = (unsigned short *) rr_malloc(sizeof(unsigned short)*ng->n);
00161     ng->disc_range = (unsigned short *) 
00162       rr_malloc(sizeof(unsigned short)*ng->n);
00163     rr_fread(ng->fof_size,sizeof(unsigned short),ng->n,
00164              ng->bin_fp,"fof_size",0);
00165     rr_fread(ng->disc_range,sizeof(unsigned short),ng->n,
00166               ng->bin_fp,"disc_range",0);
00167     ng->freq_of_freq = (int **) rr_malloc(sizeof(int *)*ng->n);
00168     for (i=0;i<=ng->n-1;i++) {
00169       ng->freq_of_freq[i] = (int *) rr_calloc(ng->fof_size[i]+1,sizeof(int));
00170     }
00171     ng->gt_disc_ratio = (disc_val_t **) rr_malloc(sizeof(disc_val_t *)*ng->n);
00172     for (i=0;i<=ng->n-1;i++){
00173       ng->gt_disc_ratio[i] = (disc_val_t *) 
00174         rr_malloc(sizeof(disc_val_t)*(ng->disc_range[i]+1));
00175     }
00176     for (i=0;i<=ng->n-1;i++) {
00177       rr_fread(ng->freq_of_freq[i],sizeof(int),
00178                ng->fof_size[i]+1,ng->bin_fp,"freq_of_freq",0);
00179 
00180     }    
00181     for (i=0;i<=ng->n-1;i++) {
00182       rr_fread(ng->gt_disc_ratio[i],sizeof(disc_val_t),
00183                ng->disc_range[i]+1,ng->bin_fp,"gt_disc_ratio",0);
00184     }    
00185     break;
00186   case WITTEN_BELL:
00187     break;
00188   case LINEAR:
00189     ng->lin_disc_ratio = (disc_val_t *) rr_malloc(sizeof(disc_val_t)*ng->n);
00190     rr_fread(ng->lin_disc_ratio,sizeof(disc_val_t),ng->n,ng->bin_fp,"lin_disc_ratio",0);
00191     break;
00192   case ABSOLUTE:
00193     ng->abs_disc_const = (double *) rr_malloc(sizeof(double)*ng->n);
00194     rr_fread(ng->abs_disc_const,sizeof(double),ng->n,ng->bin_fp,"abs_disc_const",0);
00195     break;
00196   }
00197 
00198   
00199 
00200   ng->num_kgrams = (int *) rr_malloc(sizeof(int)*ng->n);
00201   rr_fread(ng->num_kgrams,sizeof(int),ng->n,ng->bin_fp,"num_kgrams",0);
00202 
00203   ng->count = (count_ind_t **) rr_malloc(sizeof(count_ind_t *)*ng->n);
00204   ng->count4 = (int **) rr_malloc(sizeof(int *)*ng->n);
00205 
00206   if (ng->four_byte_counts) {
00207     ng->count4[0] = (int *) rr_malloc(sizeof(int)*(ng->vocab_size+1));
00208     for (i=1;i<=ng->n-1;i++) {
00209       ng->count4[i] = (int *) rr_malloc(sizeof(int)*ng->num_kgrams[i]);
00210     }
00211   }
00212   else {
00213     ng->count[0] = (count_ind_t *) 
00214       rr_malloc(sizeof(count_ind_t)*(ng->vocab_size+1));
00215     for (i=1;i<=ng->n-1;i++) {
00216       ng->count[i] = (count_ind_t *) 
00217         rr_malloc(sizeof(count_ind_t)*ng->num_kgrams[i]);
00218     }
00219   }  
00220   
00221   if (ng->four_byte_alphas) {
00222     ng->bo_weight4 = (four_byte_t **) rr_malloc(sizeof(four_byte_t *)*ng->n);
00223     ng->bo_weight4[0] = (four_byte_t *) 
00224       rr_malloc(sizeof(four_byte_t)*(ng->vocab_size+1)); 
00225     for (i=1;i<=ng->n-2;i++) {
00226       ng->bo_weight4[i] = (four_byte_t *) 
00227         rr_malloc(sizeof(four_byte_t)*ng->num_kgrams[i]);
00228     }
00229   }
00230  
00231   else {
00232 
00233     ng->bo_weight = (bo_weight_t **) rr_malloc(sizeof(bo_weight_t *)*ng->n);
00234     ng->bo_weight[0] = (bo_weight_t *)
00235       rr_malloc(sizeof(bo_weight_t)*(ng->vocab_size+1));
00236     for (i=1;i<=ng->n-2;i++) {
00237       ng->bo_weight[i] = (bo_weight_t *) 
00238         rr_malloc(sizeof(bo_weight_t)*ng->num_kgrams[i]);
00239     }
00240   }
00241 
00242   ng->ind = (index__t **) rr_malloc(sizeof(index__t *)*ng->n);
00243   ng->ind[0] = (index__t *)
00244     rr_malloc(sizeof(index__t)*(ng->vocab_size+1));
00245   for (i=1;i<=ng->n-2;i++) {
00246     ng->ind[i] = (index__t *) 
00247       rr_malloc(sizeof(index__t)*ng->num_kgrams[i]);
00248   }
00249   
00250   ng->word_id = (id__t **) rr_malloc(sizeof(id__t *)*ng->n);
00251   for (i=1;i<=ng->n-1;i++) {
00252     ng->word_id[i] = (id__t *) 
00253       rr_malloc(sizeof(id__t)*ng->num_kgrams[i]);
00254   }
00255   
00256   if (ng->four_byte_counts) {
00257     rr_fread(ng->count4[0],sizeof(int),ng->vocab_size+1,
00258              ng->bin_fp,"unigram counts",0); 
00259   }
00260   else {
00261     rr_fread(ng->count[0],sizeof(count_ind_t),ng->vocab_size+1,
00262              ng->bin_fp,"unigram counts",0);
00263   }
00264   if (ng->four_byte_alphas) {
00265     rr_fread(ng->bo_weight4[0],sizeof(four_byte_t),ng->vocab_size+1,
00266              ng->bin_fp,"unigram backoff weights",0);
00267   }
00268   else {
00269     rr_fread(ng->bo_weight[0],sizeof(bo_weight_t),ng->vocab_size+1,
00270              ng->bin_fp,"unigram backoff weights",0);
00271   }
00272 
00273   if (ng->n > 1) {
00274     rr_fread(ng->ind[0],sizeof(index__t),ng->vocab_size+1,
00275              ng->bin_fp,"unigram -> bigram pointers",0);
00276   }
00277 
00278   for(i=1;i<=ng->n-1;i++) {
00279     rr_fread(ng->word_id[i],sizeof(id__t),ng->num_kgrams[i],
00280              ng->bin_fp,"word ids",0);
00281   }
00282 
00283   if (ng->four_byte_counts) {
00284     for(i=1;i<=ng->n-1;i++) {
00285       rr_fread(ng->count4[i],sizeof(int),ng->num_kgrams[i],
00286                ng->bin_fp,"counts",0);
00287     }
00288   }
00289   else {
00290     for(i=1;i<=ng->n-1;i++) {
00291       rr_fread(ng->count[i],sizeof(count_ind_t),ng->num_kgrams[i],
00292                ng->bin_fp,"counts",0);
00293     }
00294   }
00295 
00296   for(i=1;i<=ng->n-2;i++) {
00297     if (ng->four_byte_alphas) {
00298       rr_fread(ng->bo_weight4[i],sizeof(four_byte_t),ng->num_kgrams[i],
00299                ng->bin_fp,"back off weights",0);
00300     }
00301     else {
00302       rr_fread(ng->bo_weight[i],sizeof(bo_weight_t),ng->num_kgrams[i],
00303                ng->bin_fp,"back off weights",0);
00304     }
00305   }
00306 
00307   for(i=1;i<=ng->n-2;i++) {
00308     rr_fread(ng->ind[i],sizeof(index__t),ng->num_kgrams[i],
00309              ng->bin_fp,"indices",0);
00310   }
00311   
00312   rr_iclose(ng->bin_fp);
00313 
00314 }
00315 
00316 void load_arpa_lm(arpa_lm_t *arpa_lm,
00317                   char *lm_filename) {
00318 
00319   
00320 
00321 
00322   FILE *arpa_fp;
00323   char *in_line;
00324   char *input_line;
00325   char temp_word[15][1024];
00326   int i,j,k;
00327   int num_of_args;
00328   int pos_of_novelty;
00329   char *input_line_ptr_orig;
00330   char *word_copy;
00331   id__t *previous_ngram;
00332   id__t *current_ngram;
00333   int temp_id;
00334   int *pos_in_list;
00335   int previd;
00336   flag first_one;
00337 
00338   in_line = (char *) rr_malloc(1024*sizeof(char));
00339   input_line = (char *) rr_malloc(1024*sizeof(char));
00340 
00341   input_line_ptr_orig = input_line;
00342   
00343   
00344 
00345   
00346 
00347   arpa_fp = rr_iopen(lm_filename);
00348 
00349   
00350   
00351   while (strncmp("\\data\\",in_line,6)) {
00352     if (!rr_feof(arpa_fp)) {
00353       fgets(in_line,1024,arpa_fp);
00354     }
00355     else {
00356       quit(-1,"Error reading arpa language model file. Unexpected end of file.\n");
00357     }
00358   }
00359 
00360   
00361   
00362 
00363   arpa_lm->table_sizes = (int *) rr_malloc(sizeof(int)*11);
00364   arpa_lm->num_kgrams = (int *) rr_malloc(sizeof(int)*11);
00365 
00366   fgets(in_line,1024,arpa_fp);
00367 
00368   i = 0;
00369 
00370   while (strncmp("\\1-grams",in_line,8)) {
00371     if (sscanf(in_line,"%s %s",temp_word[1],temp_word[2]) == 2) {
00372       if (!strcmp("ngram",temp_word[1])) {
00373         i = temp_word[2][0]-48;
00374         arpa_lm->table_sizes[i-1]=atoi(&(temp_word[2][2]));
00375       }
00376     }
00377 
00378     fgets(in_line,1024,arpa_fp);
00379 
00380   }
00381 
00382   if (i==0) {
00383     quit(-1,"Error parsing ARPA format language model.\n");
00384   }
00385 
00386   arpa_lm->n = i;
00387 
00388   previous_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));
00389   current_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));
00390 
00391   printf("Reading in a %d-gram language model.\n",arpa_lm->n);
00392   for (i=0;i<=arpa_lm->n-1;i++) {
00393     printf("Number of %d-grams = %d.\n",i+1,arpa_lm->table_sizes[i]);
00394     arpa_lm->num_kgrams[i]=arpa_lm->table_sizes[i];
00395   }
00396 
00397   
00398 
00399   pos_in_list = (int *) rr_malloc(sizeof(int) * arpa_lm->n);
00400 
00401   arpa_lm->word_id = (id__t **) rr_malloc(sizeof(id__t *) * arpa_lm->n);
00402   for (i=1;i<=arpa_lm->n-1;i++) { 
00403     arpa_lm->word_id[i] = (id__t *) rr_malloc(sizeof(id__t) * 
00404                                              arpa_lm->table_sizes[i]);
00405   }
00406 
00407   arpa_lm->bo_weight = (bo_t **) rr_malloc(sizeof(bo_t *) * (arpa_lm->n-1));
00408   for (i=0;i<=arpa_lm->n-2;i++) {
00409     arpa_lm->bo_weight[i] = (bo_t *) rr_malloc(sizeof(bo_t) * 
00410                                              arpa_lm->table_sizes[i]);
00411   }
00412 
00413   arpa_lm->ind = (index__t **) rr_malloc(sizeof(index__t *) * (arpa_lm->n-1));
00414   for (i=0;i<=arpa_lm->n-2;i++) {
00415     arpa_lm->ind[i] = (index__t *) rr_malloc(sizeof(index__t) * 
00416                                            arpa_lm->table_sizes[i]);
00417   }
00418 
00419   arpa_lm->probs = (prob_t **) rr_malloc(sizeof(prob_t *) * arpa_lm->n);
00420   for (i=0;i<=arpa_lm->n-1;i++) {
00421     arpa_lm->probs[i] = (prob_t *) rr_malloc(sizeof(prob_t) * 
00422                                              arpa_lm->table_sizes[i]);
00423   }
00424 
00425   arpa_lm->ptr_table = (int **) rr_malloc(sizeof(int *)*arpa_lm->n);
00426   arpa_lm->ptr_table_size = (unsigned short *) 
00427     rr_calloc(arpa_lm->n,sizeof(unsigned short));
00428 
00429   for (i=0;i<=arpa_lm->n-1;i++) {
00430     arpa_lm->ptr_table[i] = (int *) rr_calloc(65535,sizeof(int));
00431   }
00432 
00433   arpa_lm->vocab_ht = sih_create(1000,0.5,2.0,1);
00434   arpa_lm->vocab = (char **) rr_malloc(sizeof(char *)*
00435                                        (arpa_lm->table_sizes[0]+1));
00436   arpa_lm->vocab_size = arpa_lm->table_sizes[0];
00437 
00438   
00439 
00440   printf("Reading unigrams...\n");
00441   
00442   i=0;
00443 
00444   fgets(in_line,1024,arpa_fp);
00445   
00446   if (arpa_lm->n > 1) {
00447 
00448     while (strncmp("\\2-grams",in_line,8)) {
00449       if (sscanf(in_line,"%f %s %f",&arpa_lm->probs[0][i],
00450                  temp_word[1],&arpa_lm->bo_weight[0][i]) == 3) {
00451         word_copy = salloc(temp_word[1]);
00452         
00453         
00454         
00455         if (i==0) {
00456           if (strcmp("<UNK>",word_copy)) {
00457             
00458             
00459             
00460             i++;
00461             arpa_lm->vocab_type = CLOSED_VOCAB;
00462             arpa_lm->first_id = 1;
00463             
00464           }
00465           else {
00466             
00467             
00468             
00469             arpa_lm->vocab_type = OPEN_VOCAB_1;
00470             arpa_lm->first_id = 0;
00471             arpa_lm->vocab_size--;
00472             
00473           }
00474         }
00475         
00476         arpa_lm->vocab[i] = word_copy;
00477         sih_add(arpa_lm->vocab_ht,word_copy,i);
00478         i++;
00479         if (((arpa_lm->vocab_type == OPEN_VOCAB_1) && 
00480              (i>arpa_lm->table_sizes[0])) || 
00481             ((arpa_lm->vocab_type == CLOSED_VOCAB) &&
00482              (i>arpa_lm->table_sizes[0]+1))){
00483           quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d unigrams needed to be stored.\n",arpa_lm->table_sizes[0]);
00484         }
00485       }
00486       else {
00487         if (strlen(in_line)>1) {
00488           fprintf(stderr,"Warning, reading line -%s- gave unexpected input.\n",in_line);
00489         }
00490       }
00491       fgets(in_line,1024,arpa_fp);
00492       
00493     }
00494   }
00495   else {
00496 
00497     while (strncmp("\\end\\",in_line,5)) {
00498       if (sscanf(in_line,"%f %s",&arpa_lm->probs[0][i],
00499                  temp_word[1]) == 2) {
00500         word_copy = salloc(temp_word[1]);
00501         
00502         
00503         
00504         if (i==0) {
00505           if (strcmp("<UNK>",word_copy)) {
00506             
00507             
00508             
00509             i++;
00510             arpa_lm->vocab_type = CLOSED_VOCAB;
00511             arpa_lm->first_id = 1;
00512             
00513           }
00514           else {
00515             
00516             
00517             
00518             arpa_lm->vocab_type = OPEN_VOCAB_1;
00519             arpa_lm->first_id = 0;
00520             arpa_lm->vocab_size--;
00521             
00522           }
00523         }
00524         
00525         arpa_lm->vocab[i] = word_copy;
00526         sih_add(arpa_lm->vocab_ht,word_copy,i);
00527         i++;
00528         if (((arpa_lm->vocab_type == OPEN_VOCAB_1) && 
00529              (i>arpa_lm->table_sizes[0])) || 
00530             ((arpa_lm->vocab_type == CLOSED_VOCAB) &&
00531              (i>arpa_lm->table_sizes[0]+1))){
00532           quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d unigrams needed to be stored.\n",arpa_lm->table_sizes[0]);
00533         }
00534       }
00535       else {
00536         if (strlen(in_line)>1) {
00537           fprintf(stderr,"Warning, reading line -%s- gave unexpected input.\n",in_line);
00538         }
00539       }
00540       fgets(in_line,1024,arpa_fp);
00541       
00542     }
00543   }
00544     
00545   if (arpa_lm->n > 1) {
00546 
00547     
00548 
00549     previd = -1;
00550 
00551     for (i=2;i<=arpa_lm->n-1;i++) {
00552 
00553       printf("\nReading %d-grams...\n",i);
00554 
00555       previd = -1;
00556 
00557       j=0;
00558 
00559       for (k=0;k<=arpa_lm->n-1;k++) {
00560         pos_in_list[k] = 0;
00561       }
00562                                 
00563       sprintf(temp_word[14],"\\%d-grams",i+1);
00564       first_one=1;
00565       while (strncmp(temp_word[14],temp_word[0],8)) {
00566       
00567         
00568 
00569         num_of_args = 0;
00570 
00571         for (k=0;k<=i+1;k++) {
00572           if (strncmp(temp_word[0],temp_word[14],8)) {
00573             fscanf(arpa_fp,"%s",temp_word[k]);
00574           }
00575         }
00576   
00577         if (strncmp(temp_word[0],temp_word[14],8)) {
00578 
00579           arpa_lm->probs[i-1][j] = (prob_t) atof(temp_word[0]);
00580           arpa_lm->bo_weight[i-1][j] = (bo_t) atof(temp_word[i+1]);
00581         
00582           sih_lookup(arpa_lm->vocab_ht,temp_word[i],&temp_id);
00583           arpa_lm->word_id[i-1][j] = temp_id;
00584         
00585           if (j % 20000 == 0) {
00586             if (j % 1000000 == 0) {
00587               if (j != 0) {
00588                 fprintf(stderr,".\n");
00589               }
00590             }
00591             else {
00592               fprintf(stderr,".");
00593             }
00594           }
00595         
00596           j++;
00597           if (j>arpa_lm->table_sizes[i-1]) {
00598             quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d %d-grams needed to be stored.\n",arpa_lm->table_sizes[i-1],i);
00599           }
00600 
00601           
00602 
00603 
00604           for (k=0;k<=i-1;k++) {
00605             previous_ngram[k] = current_ngram[k];
00606             sih_lookup(arpa_lm->vocab_ht,temp_word[k+1],&temp_id);
00607             if (temp_id == 0 && strcmp(temp_word[k+1],"<UNK>")) {
00608               quit(-1,"Error - found unknown word in n-gram file : %s\n",
00609                    temp_word[k+1]);
00610             }
00611             current_ngram[k] = temp_id;
00612           }
00613 
00614           
00615 
00616           if (first_one) {
00617             pos_of_novelty = 0;
00618             first_one = 0;
00619           }
00620           else {
00621       
00622             pos_of_novelty = i;
00623 
00624 
00625             for (k=0;k<=i-1;k++) {
00626               if (current_ngram[k] > previous_ngram[k]) {
00627                 pos_of_novelty = k;
00628                 k = arpa_lm->n;
00629               }
00630               else {
00631                 if ((current_ngram[k] < previous_ngram[k]) && (j > 0)) {
00632                   quit(-1,"Error : n-grams are not correctly ordered.\n");
00633                 }
00634               }
00635             }
00636 
00637             if (pos_of_novelty > i) {
00638               fprintf(stderr,"pos of novelty 2 = %d\n",pos_of_novelty);
00639             }
00640 
00641 
00642             if (pos_of_novelty == i && j != 1) {
00643               quit(-1,"Error - Repeated %d-gram in ARPA format language model.\n",
00644                    i);
00645             }
00646           }
00647 
00648           
00649 
00650 
00651         
00652           if (pos_of_novelty != i-1) {
00653             if (i==2) {
00654               
00655 
00656               for (k = previd + 1; k <= current_ngram[0]; k++) {
00657                 arpa_lm->ind[0][k] = new_index(j-1,
00658                                                arpa_lm->ptr_table[0],
00659                                                &(arpa_lm->ptr_table_size[0]),
00660                                                k);
00661               }
00662               previd = current_ngram[0];
00663             }
00664             else {
00665 
00666 
00667               
00668 
00669               
00670               for (k=0;k<=i-2;k++) {
00671 
00672                 
00673 
00674                 if (k == 0) {
00675                   pos_in_list[0] = current_ngram[0];
00676                 }
00677                 else {
00678                   pos_in_list[k] = get_full_index(arpa_lm->ind[k-1][pos_in_list[k-1]],
00679                                                   arpa_lm->ptr_table[k-1],   
00680                                                   arpa_lm->ptr_table_size[k-1],   
00681                                                   pos_in_list[k-1]);
00682                   while (arpa_lm->word_id[k][pos_in_list[k]] < 
00683                          current_ngram[k]) {
00684                     pos_in_list[k]++;
00685 
00686                   }
00687                   if (arpa_lm->word_id[k][pos_in_list[k]] != 
00688                       current_ngram[k]) {
00689 
00690                     quit(-1,"Error in the ARPA format language model. \nA %d-gram exists, but not the stem %d-gram.",k+2,k+1);
00691                   }
00692                 }
00693               }
00694               for (k = previd + 1; k <= pos_in_list[i-2]; k++) {
00695                 
00696                 arpa_lm->ind[i-2][k] = 
00697                   new_index(j-1,
00698                             arpa_lm->ptr_table[i-2],
00699                             &(arpa_lm->ptr_table_size[i-2]),
00700                             k);
00701 
00702               }
00703               previd = pos_in_list[i-2];        
00704             }
00705           }
00706         }
00707       }
00708 
00709       
00710 
00711       if (i==2) {
00712 
00713         for (k = previd + 1; k <= arpa_lm->vocab_size; k++) {
00714           
00715           arpa_lm->ind[0][k] = new_index(arpa_lm->num_kgrams[1],
00716                                          arpa_lm->ptr_table[0],
00717                                          &(arpa_lm->ptr_table_size[0]),
00718                                          k);
00719         }      
00720       }
00721       else {
00722         for (k = previd + 1; k <= arpa_lm->num_kgrams[i-2]-1;k++) {
00723           arpa_lm->ind[i-2][k] = new_index(j,
00724                                            arpa_lm->ptr_table[i-2],
00725                                            &(arpa_lm->ptr_table_size[i-2]),
00726                                            k);
00727         }
00728       }
00729                                            
00730                                            
00731     }
00732 
00733 
00734 
00735     printf("\nReading %d-grams...\n",arpa_lm->n);
00736     
00737     first_one = 1;
00738     j = 0;
00739     previd = 0;
00740     
00741     arpa_lm->ind[arpa_lm->n-2][0] = 0;
00742 
00743     for (k=0;k<=arpa_lm->n-1;k++) {
00744       pos_in_list[k] = 0;
00745     }
00746   
00747     while (strncmp("\\end\\",temp_word[0],5)) {
00748     
00749       
00750 
00751       for (k=0;k<=arpa_lm->n;k++) {
00752         if (strncmp(temp_word[0],"\\end\\",5)) {
00753           fscanf(arpa_fp,"%s",temp_word[k]);
00754         }
00755       }
00756     
00757       if (strncmp(temp_word[0],"\\end\\",5)) {
00758       
00759         if (j % 20000 == 0) {
00760           if (j % 1000000 == 0) {
00761             if (j != 0) {
00762               fprintf(stderr,".\n");
00763             }
00764           }
00765           else {
00766             fprintf(stderr,".");
00767           }
00768         }
00769       
00770         arpa_lm->probs[arpa_lm->n-1][j] = atof(temp_word[0]);
00771         sih_lookup(arpa_lm->vocab_ht,temp_word[arpa_lm->n],&temp_id);
00772       
00773         arpa_lm->word_id[arpa_lm->n-1][j] = temp_id;
00774       
00775         j++;
00776       
00777         for (k=0;k<=arpa_lm->n-1;k++) {
00778           previous_ngram[k] = current_ngram[k];
00779           sih_lookup(arpa_lm->vocab_ht,temp_word[k+1],&temp_id);
00780           if (temp_id == 0 && strcmp(temp_word[k+1],"<UNK>")) {
00781             quit(-1,"Error - found unknown word in n-gram file : %s\n",
00782                  temp_word[k+1]);
00783           }
00784           current_ngram[k] = temp_id;
00785         }
00786       
00787         
00788         
00789         if (first_one) {
00790           pos_of_novelty = 0;
00791           first_one = 0;
00792         }
00793         else {
00794       
00795           pos_of_novelty = arpa_lm->n+1;
00796 
00797           for (k=0;k<=arpa_lm->n-1;k++) {
00798             if (current_ngram[k] > previous_ngram[k]) {
00799               pos_of_novelty = k;
00800               k = arpa_lm->n;
00801             }
00802             else {
00803               if ((current_ngram[k] < previous_ngram[k]) && (j>0)) {
00804                 quit(-1,"Error : n-grams are not correctly ordered.\n");
00805               }
00806             }
00807           }
00808       
00809           if ( pos_of_novelty == arpa_lm->n+1 && j != 1 ) {
00810             quit(-1,"Error : Same %d-gram occurs twice in ARPA format LM.\n",
00811                  arpa_lm->n);
00812           }
00813         }
00814         if (pos_of_novelty != arpa_lm->n-1) {
00815         
00816           
00817 
00818           for (k=0;k<=arpa_lm->n-2;k++) {
00819 
00820             if (k == 0) {
00821               pos_in_list[0] = current_ngram[0];
00822             }
00823             else {
00824               pos_in_list[k] = get_full_index(arpa_lm->ind[k-1][pos_in_list[k-1]],
00825                                               arpa_lm->ptr_table[k-1],   
00826                                               arpa_lm->ptr_table_size[k-1],   
00827                                               pos_in_list[k-1]);
00828               while (arpa_lm->word_id[k][pos_in_list[k]] < 
00829                      current_ngram[k]) {
00830                 pos_in_list[k]++;
00831               }
00832               
00833               if (arpa_lm->word_id[k][pos_in_list[k]] != current_ngram[k]) {
00834                 quit(-1,"Error in the ARPA format language model. \nA %d-gram exists, but not the stem %d-gram.",k+2,k+1);
00835               }
00836             }
00837           }
00838           for (k = previd + 1; k <= pos_in_list[arpa_lm->n-2]; k++) {
00839 
00840             arpa_lm->ind[arpa_lm->n-2][k] = 
00841               new_index(j-1,
00842                         arpa_lm->ptr_table[arpa_lm->n-2],
00843                         &(arpa_lm->ptr_table_size[arpa_lm->n-2]),
00844                         k);
00845           }
00846           previd = pos_in_list[arpa_lm->n-2];
00847         }
00848         
00849         if (j>arpa_lm->table_sizes[arpa_lm->n-1]) {
00850           quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d %d-grams needed to be stored.\n",arpa_lm->table_sizes[arpa_lm->n-1],arpa_lm->n-1);
00851         }
00852       }
00853     }
00854 
00855     
00856 
00857     for (k = previd + 1; k <= arpa_lm->num_kgrams[arpa_lm->n-2]; k++) {
00858       arpa_lm->ind[arpa_lm->n-2][k] = 
00859         new_index(j,
00860                   arpa_lm->ptr_table[i-2],
00861                   &(arpa_lm->ptr_table_size[i-2]),
00862                   k);
00863     }
00864   
00865   }
00866 
00867   
00868 
00869 
00870   free(previous_ngram);
00871   free(current_ngram);
00872   free(in_line);
00873   free(input_line);
00874   rr_iclose(arpa_fp);
00875 
00876 }
00877 
00878 
00879 
00880 
00881 
00882 
00883 
00884 
00885 
00886