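/* Load a language model into memory, either from a binary-format model
   file (load_lm) or from an ARPA format text file (load_arpa_lm).

   Illustrative usage sketch (the file names and the surrounding driver
   code are hypothetical, not part of this file):

     ng_t ng;
     arpa_lm_t arpa_lm;

     load_lm(&ng, "model.binlm");
     load_arpa_lm(&arpa_lm, "model.arpa");
*/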
#include <stdio.h>
#include "rr_libs/general.h"
#include "rr_libs/sih.h"
#include "ngram.h"
#include "evallm.h"
#include <string.h>
#include <stdlib.h>
#include "idngram2lm.h"

#define BBO_FILE_VERSION 970314

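/*
 * load_lm : read a language model stored in binary format into the ng_t
 * structure.  The fixed header fields are read first, followed by the
 * vocabulary hash table, the count and back-off weight lookup tables, the
 * discounting parameters, and finally the per-order count, back-off
 * weight, pointer and word-id arrays.
 */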
void load_lm(ng_t *ng,
             char *lm_filename) {

  int i;

  ng->bin_fp = rr_iopen(lm_filename);

  rr_fread(&ng->version,sizeof(int),1,ng->bin_fp,"from lm file",0);

  if (ng->version != BBO_FILE_VERSION) {
    quit(-1,"Error : Language model file %s appears to be corrupted.\n",
         lm_filename);
  }

  /* Read the fixed-size header fields describing the model. */

  rr_fread(&ng->n,sizeof(unsigned short),1,ng->bin_fp,"n",0);

  rr_fread(&ng->vocab_size,sizeof(unsigned short),1,ng->bin_fp,"vocab_size",0);
  rr_fread(&ng->no_of_ccs,sizeof(unsigned short),1,ng->bin_fp,"no_of_ccs",0);

  rr_fread(&ng->vocab_type,sizeof(unsigned short),1,ng->bin_fp,"vocab_type",0);

  rr_fread(&ng->count_table_size,sizeof(count_ind_t),1,
           ng->bin_fp,"count_table_size",0);
  rr_fread(&ng->discounting_method,sizeof(unsigned short),1,
           ng->bin_fp,"discounting_method",0);

  rr_fread(&ng->min_alpha,sizeof(double),
           1,ng->bin_fp,"min_alpha",0);
  rr_fread(&ng->max_alpha,sizeof(double),
           1,ng->bin_fp,"max_alpha",0);
  rr_fread(&ng->out_of_range_alphas,sizeof(unsigned short),
           1,ng->bin_fp,"out_of_range_alphas",0);

  rr_fread(&ng->size_of_alpha_array,sizeof(unsigned short),
           1,ng->bin_fp,"size_of_alpha_array",0);

  rr_fread(&ng->n_unigrams,sizeof(int),1,ng->bin_fp,"n_unigrams",0);
  rr_fread(&ng->zeroton_fraction,sizeof(double),1,
           ng->bin_fp,"zeroton_fraction",0);

  rr_fread(&ng->oov_fraction,sizeof(double),1,
           ng->bin_fp,"oov_fraction",0);
  rr_fread(&ng->four_byte_counts,sizeof(flag),1,
           ng->bin_fp,"four_byte_counts",0);
  rr_fread(&ng->four_byte_alphas,sizeof(flag),1,
           ng->bin_fp,"four_byte_alphas",0);
  rr_fread(&ng->first_id,sizeof(unsigned short),1,
           ng->bin_fp,"first_id",0);

  /* Read the vocabulary hash table and build the id -> word array.
     Entry 0 of the vocabulary is set to <UNK>. */

  ng->vocab_ht = (sih_t *) rr_malloc(sizeof(sih_t));
  sih_val_read_from_file(ng->vocab_ht,ng->bin_fp,lm_filename,0);
  get_vocab_from_vocab_ht(ng->vocab_ht,ng->vocab_size,0,&ng->vocab);
  ng->vocab[0] = salloc("<UNK>");

  /* Marginal unigram counts, stored either as 4-byte ints or as indices
     into the count table. */

  if (ng->four_byte_counts) {
    ng->marg_counts4 = (int *)
      rr_malloc(sizeof(int)*(ng->vocab_size+1));
    rr_fread(ng->marg_counts4,sizeof(int),ng->vocab_size+1,
             ng->bin_fp,"marg_counts",0);
  }
  else {
    ng->marg_counts = (count_ind_t *)
      rr_malloc(sizeof(count_ind_t)*(ng->vocab_size+1));
    rr_fread(ng->marg_counts,sizeof(count_ind_t),ng->vocab_size+1,
             ng->bin_fp,"marg_counts",0);
  }

  /* Array of stored back-off weight (alpha) values. */

  ng->alpha_array = (double *)
    rr_malloc(sizeof(double)*(ng->size_of_alpha_array));
  rr_fread(ng->alpha_array,sizeof(double),
           ng->size_of_alpha_array,ng->bin_fp,"alpha_array",0);

  /* Count lookup tables, only stored when counts are kept as indices
     rather than as 4-byte values. */

  ng->count_table = (count_t **) rr_malloc(sizeof(count_t *)*ng->n);
  if (!ng->four_byte_counts) {
    for (i=0;i<=ng->n-1;i++) {
      ng->count_table[i] = (count_t *)
        rr_malloc(sizeof(count_t)*(ng->count_table_size+1));
      rr_fread(ng->count_table[i],sizeof(count_t),
               ng->count_table_size+1,ng->bin_fp,"count_table",0);
    }
  }

  /* Pointer compression tables used to store the n-gram indices compactly. */

  ng->ptr_table_size = (unsigned short *)
    rr_malloc(sizeof(unsigned short)*ng->n);
  rr_fread(ng->ptr_table_size,sizeof(unsigned short),
           ng->n,ng->bin_fp,"ptr_table_size",0);

  ng->ptr_table = (ptr_tab_t **) rr_malloc(sizeof(ptr_tab_t *)*ng->n);

  for (i=0;i<=ng->n-1;i++) {
    ng->ptr_table[i] = (ptr_tab_t *)
      rr_malloc(sizeof(ptr_tab_t)*ng->ptr_table_size[i]);
    rr_fread(ng->ptr_table[i],sizeof(ptr_tab_t),
             ng->ptr_table_size[i],ng->bin_fp,"ptr_table",0);
  }

  /* Unigram probabilities and context-cue flags. */

  ng->uni_probs = (uni_probs_t *)
    rr_malloc(sizeof(uni_probs_t)*(ng->vocab_size+1));
  ng->uni_log_probs = (uni_probs_t *)
    rr_malloc(sizeof(uni_probs_t)*(ng->vocab_size+1));
  ng->context_cue = (flag *)
    rr_malloc(sizeof(flag)*(ng->vocab_size+1));

  rr_fread(ng->uni_probs,sizeof(uni_probs_t),ng->vocab_size+1,
           ng->bin_fp,"uni_probs",0);
  rr_fread(ng->uni_log_probs,sizeof(uni_probs_t),ng->vocab_size+1,
           ng->bin_fp,"uni_log_probs",0);
  rr_fread(ng->context_cue,sizeof(flag),ng->vocab_size+1,
           ng->bin_fp,"context_cue",0);

  ng->cutoffs = (cutoff_t *) rr_malloc(sizeof(cutoff_t)*ng->n);
  rr_fread(ng->cutoffs,sizeof(cutoff_t),ng->n,ng->bin_fp,"cutoffs",0);

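  /* Discounting parameters.  Which arrays are present in the file depends
     on the discounting method that was used to build the model. */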
  switch (ng->discounting_method) {
  case GOOD_TURING:
    ng->fof_size = (unsigned short *) rr_malloc(sizeof(unsigned short)*ng->n);
    ng->disc_range = (unsigned short *)
      rr_malloc(sizeof(unsigned short)*ng->n);
    rr_fread(ng->fof_size,sizeof(unsigned short),ng->n,
             ng->bin_fp,"fof_size",0);
    rr_fread(ng->disc_range,sizeof(unsigned short),ng->n,
             ng->bin_fp,"disc_range",0);
    ng->freq_of_freq = (int **) rr_malloc(sizeof(int *)*ng->n);
    for (i=0;i<=ng->n-1;i++) {
      ng->freq_of_freq[i] = (int *) rr_calloc(ng->fof_size[i]+1,sizeof(int));
    }
    ng->gt_disc_ratio = (disc_val_t **) rr_malloc(sizeof(disc_val_t *)*ng->n);
    for (i=0;i<=ng->n-1;i++){
      ng->gt_disc_ratio[i] = (disc_val_t *)
        rr_malloc(sizeof(disc_val_t)*(ng->disc_range[i]+1));
    }
    for (i=0;i<=ng->n-1;i++) {
      rr_fread(ng->freq_of_freq[i],sizeof(int),
               ng->fof_size[i]+1,ng->bin_fp,"freq_of_freq",0);
    }
    for (i=0;i<=ng->n-1;i++) {
      rr_fread(ng->gt_disc_ratio[i],sizeof(disc_val_t),
               ng->disc_range[i]+1,ng->bin_fp,"gt_disc_ratio",0);
    }
    break;
  case WITTEN_BELL:
    break;
  case LINEAR:
    ng->lin_disc_ratio = (disc_val_t *) rr_malloc(sizeof(disc_val_t)*ng->n);
    rr_fread(ng->lin_disc_ratio,sizeof(disc_val_t),ng->n,ng->bin_fp,"lin_disc_ratio",0);
    break;
  case ABSOLUTE:
    ng->abs_disc_const = (double *) rr_malloc(sizeof(double)*ng->n);
    rr_fread(ng->abs_disc_const,sizeof(double),ng->n,ng->bin_fp,"abs_disc_const",0);
    break;
  }

  /* Allocate the main n-gram tables.  Level 0 (unigrams) is indexed by
     word id, so it has vocab_size+1 entries; levels 1..n-1 hold
     num_kgrams[i] entries each. */

  ng->num_kgrams = (int *) rr_malloc(sizeof(int)*ng->n);
  rr_fread(ng->num_kgrams,sizeof(int),ng->n,ng->bin_fp,"num_kgrams",0);

  ng->count = (count_ind_t **) rr_malloc(sizeof(count_ind_t *)*ng->n);
  ng->count4 = (int **) rr_malloc(sizeof(int *)*ng->n);

  if (ng->four_byte_counts) {
    ng->count4[0] = (int *) rr_malloc(sizeof(int)*(ng->vocab_size+1));
    for (i=1;i<=ng->n-1;i++) {
      ng->count4[i] = (int *) rr_malloc(sizeof(int)*ng->num_kgrams[i]);
    }
  }
  else {
    ng->count[0] = (count_ind_t *)
      rr_malloc(sizeof(count_ind_t)*(ng->vocab_size+1));
    for (i=1;i<=ng->n-1;i++) {
      ng->count[i] = (count_ind_t *)
        rr_malloc(sizeof(count_ind_t)*ng->num_kgrams[i]);
    }
  }

  /* Back-off weights are only needed for orders 1..n-1. */

  if (ng->four_byte_alphas) {
    ng->bo_weight4 = (four_byte_t **) rr_malloc(sizeof(four_byte_t *)*ng->n);
    ng->bo_weight4[0] = (four_byte_t *)
      rr_malloc(sizeof(four_byte_t)*(ng->vocab_size+1));
    for (i=1;i<=ng->n-2;i++) {
      ng->bo_weight4[i] = (four_byte_t *)
        rr_malloc(sizeof(four_byte_t)*ng->num_kgrams[i]);
    }
  }
  else {
    ng->bo_weight = (bo_weight_t **) rr_malloc(sizeof(bo_weight_t *)*ng->n);
    ng->bo_weight[0] = (bo_weight_t *)
      rr_malloc(sizeof(bo_weight_t)*(ng->vocab_size+1));
    for (i=1;i<=ng->n-2;i++) {
      ng->bo_weight[i] = (bo_weight_t *)
        rr_malloc(sizeof(bo_weight_t)*ng->num_kgrams[i]);
    }
  }

  /* ind[i] holds the pointers from the (i+1)-gram table into the
     (i+2)-gram table. */

  ng->ind = (index__t **) rr_malloc(sizeof(index__t *)*ng->n);
  ng->ind[0] = (index__t *)
    rr_malloc(sizeof(index__t)*(ng->vocab_size+1));
  for (i=1;i<=ng->n-2;i++) {
    ng->ind[i] = (index__t *)
      rr_malloc(sizeof(index__t)*ng->num_kgrams[i]);
  }

  ng->word_id = (id__t **) rr_malloc(sizeof(id__t *)*ng->n);
  for (i=1;i<=ng->n-1;i++) {
    ng->word_id[i] = (id__t *)
      rr_malloc(sizeof(id__t)*ng->num_kgrams[i]);
  }

  /* Unigram counts and back-off weights. */

  if (ng->four_byte_counts) {
    rr_fread(ng->count4[0],sizeof(int),ng->vocab_size+1,
             ng->bin_fp,"unigram counts",0);
  }
  else {
    rr_fread(ng->count[0],sizeof(count_ind_t),ng->vocab_size+1,
             ng->bin_fp,"unigram counts",0);
  }
  if (ng->four_byte_alphas) {
    rr_fread(ng->bo_weight4[0],sizeof(four_byte_t),ng->vocab_size+1,
             ng->bin_fp,"unigram backoff weights",0);
  }
  else {
    rr_fread(ng->bo_weight[0],sizeof(bo_weight_t),ng->vocab_size+1,
             ng->bin_fp,"unigram backoff weights",0);
  }

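  /* Higher-order tables: the unigram->bigram pointers, then for each order
     the word ids, counts, back-off weights, and pointers into the next
     level. */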
  if (ng->n > 1) {
    rr_fread(ng->ind[0],sizeof(index__t),ng->vocab_size+1,
             ng->bin_fp,"unigram -> bigram pointers",0);
  }

  for (i=1;i<=ng->n-1;i++) {
    rr_fread(ng->word_id[i],sizeof(id__t),ng->num_kgrams[i],
             ng->bin_fp,"word ids",0);
  }

  if (ng->four_byte_counts) {
    for (i=1;i<=ng->n-1;i++) {
      rr_fread(ng->count4[i],sizeof(int),ng->num_kgrams[i],
               ng->bin_fp,"counts",0);
    }
  }
  else {
    for (i=1;i<=ng->n-1;i++) {
      rr_fread(ng->count[i],sizeof(count_ind_t),ng->num_kgrams[i],
               ng->bin_fp,"counts",0);
    }
  }

  for (i=1;i<=ng->n-2;i++) {
    if (ng->four_byte_alphas) {
      rr_fread(ng->bo_weight4[i],sizeof(four_byte_t),ng->num_kgrams[i],
               ng->bin_fp,"back off weights",0);
    }
    else {
      rr_fread(ng->bo_weight[i],sizeof(bo_weight_t),ng->num_kgrams[i],
               ng->bin_fp,"back off weights",0);
    }
  }

  for (i=1;i<=ng->n-2;i++) {
    rr_fread(ng->ind[i],sizeof(index__t),ng->num_kgrams[i],
             ng->bin_fp,"indices",0);
  }

  rr_iclose(ng->bin_fp);

}

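/*
 * load_arpa_lm : read a language model in ARPA text format into the
 * arpa_lm_t structure.  The header is parsed to obtain n and the table
 * sizes, the unigram section defines the vocabulary, and each higher-order
 * section is read in turn while the back-off pointer arrays (ind) linking
 * each order to the next are constructed on the fly.
 */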
void load_arpa_lm(arpa_lm_t *arpa_lm,
                  char *lm_filename) {

  FILE *arpa_fp;
  char *in_line;
  char *input_line;
  char temp_word[15][1024];
  int i,j,k;
  int num_of_args;
  int pos_of_novelty;
  char *input_line_ptr_orig;
  char *word_copy;
  id__t *previous_ngram;
  id__t *current_ngram;
  int temp_id;
  int *pos_in_list;
  int previd;
  flag first_one;

  in_line = (char *) rr_malloc(1024*sizeof(char));
  input_line = (char *) rr_malloc(1024*sizeof(char));

  input_line_ptr_orig = input_line;

  /* temp_word[0] is tested before it is first written to, so make sure it
     starts out as an empty string. */
  temp_word[0][0] = '\0';

  arpa_fp = rr_iopen(lm_filename);

  /* Skip everything up to the \data\ marker. */

  while (strncmp("\\data\\",in_line,6)) {
    if (!rr_feof(arpa_fp)) {
      fgets(in_line,1024,arpa_fp);
    }
    else {
      quit(-1,"Error reading arpa language model file. Unexpected end of file.\n");
    }
  }

  /* Parse the header lines of the form "ngram k=count" to find n and the
     number of k-grams for each order. */

  arpa_lm->table_sizes = (int *) rr_malloc(sizeof(int)*11);
  arpa_lm->num_kgrams = (int *) rr_malloc(sizeof(int)*11);

  fgets(in_line,1024,arpa_fp);

  i = 0;

  while (strncmp("\\1-grams",in_line,8)) {
    if (sscanf(in_line,"%s %s",temp_word[1],temp_word[2]) == 2) {
      if (!strcmp("ngram",temp_word[1])) {
        i = temp_word[2][0]-'0';
        arpa_lm->table_sizes[i-1]=atoi(&(temp_word[2][2]));
      }
    }

    fgets(in_line,1024,arpa_fp);

  }

  if (i==0) {
    quit(-1,"Error parsing ARPA format language model.\n");
  }

  arpa_lm->n = i;

  previous_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));
  current_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));

  printf("Reading in a %d-gram language model.\n",arpa_lm->n);
  for (i=0;i<=arpa_lm->n-1;i++) {
    printf("Number of %d-grams = %d.\n",i+1,arpa_lm->table_sizes[i]);
    arpa_lm->num_kgrams[i]=arpa_lm->table_sizes[i];
  }

  pos_in_list = (int *) rr_malloc(sizeof(int) * arpa_lm->n);

  arpa_lm->word_id = (id__t **) rr_malloc(sizeof(id__t *) * arpa_lm->n);
  for (i=1;i<=arpa_lm->n-1;i++) {
    arpa_lm->word_id[i] = (id__t *) rr_malloc(sizeof(id__t) *
                                              arpa_lm->table_sizes[i]);
  }

  arpa_lm->bo_weight = (bo_t **) rr_malloc(sizeof(bo_t *) * (arpa_lm->n-1));
  for (i=0;i<=arpa_lm->n-2;i++) {
    arpa_lm->bo_weight[i] = (bo_t *) rr_malloc(sizeof(bo_t) *
                                               arpa_lm->table_sizes[i]);
  }

  arpa_lm->ind = (index__t **) rr_malloc(sizeof(index__t *) * (arpa_lm->n-1));
  for (i=0;i<=arpa_lm->n-2;i++) {
    arpa_lm->ind[i] = (index__t *) rr_malloc(sizeof(index__t) *
                                             arpa_lm->table_sizes[i]);
  }

  arpa_lm->probs = (prob_t **) rr_malloc(sizeof(prob_t *) * arpa_lm->n);
  for (i=0;i<=arpa_lm->n-1;i++) {
    arpa_lm->probs[i] = (prob_t *) rr_malloc(sizeof(prob_t) *
                                             arpa_lm->table_sizes[i]);
  }

  arpa_lm->ptr_table = (int **) rr_malloc(sizeof(int *)*arpa_lm->n);
  arpa_lm->ptr_table_size = (unsigned short *)
    rr_calloc(arpa_lm->n,sizeof(unsigned short));

  for (i=0;i<=arpa_lm->n-1;i++) {
    arpa_lm->ptr_table[i] = (int *) rr_calloc(65535,sizeof(int));
  }

  arpa_lm->vocab_ht = sih_create(1000,0.5,2.0,1);
  arpa_lm->vocab = (char **) rr_malloc(sizeof(char *)*
                                       (arpa_lm->table_sizes[0]+1));
  arpa_lm->vocab_size = arpa_lm->table_sizes[0];

  /* Read the unigram section.  This also defines the vocabulary: the first
     unigram decides whether the model has an open vocabulary (first word is
     <UNK>, which gets id 0) or a closed one (ids start at 1). */

  printf("Reading unigrams...\n");

  i=0;

  fgets(in_line,1024,arpa_fp);

  if (arpa_lm->n > 1) {

    while (strncmp("\\2-grams",in_line,8)) {
      if (sscanf(in_line,"%f %s %f",&arpa_lm->probs[0][i],
                 temp_word[1],&arpa_lm->bo_weight[0][i]) == 3) {
        word_copy = salloc(temp_word[1]);

        if (i==0) {
          if (strcmp("<UNK>",word_copy)) {

            /* First word is not <UNK>, so this is a closed vocabulary. */

            i++;
            arpa_lm->vocab_type = CLOSED_VOCAB;
            arpa_lm->first_id = 1;

          }
          else {

            /* First word is <UNK>, so this is an open vocabulary. */

            arpa_lm->vocab_type = OPEN_VOCAB_1;
            arpa_lm->first_id = 0;
            arpa_lm->vocab_size--;

          }
        }

        arpa_lm->vocab[i] = word_copy;
        sih_add(arpa_lm->vocab_ht,word_copy,i);
        i++;
        if (((arpa_lm->vocab_type == OPEN_VOCAB_1) &&
             (i>arpa_lm->table_sizes[0])) ||
            ((arpa_lm->vocab_type == CLOSED_VOCAB) &&
             (i>arpa_lm->table_sizes[0]+1))){
          quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d unigrams needed to be stored.\n",arpa_lm->table_sizes[0]);
        }
      }
      else {
        if (strlen(in_line)>1) {
          fprintf(stderr,"Warning, reading line -%s- gave unexpected input.\n",in_line);
        }
      }
      fgets(in_line,1024,arpa_fp);

    }
  }
  else {

    /* Unigram-only model: the unigram section runs straight to \end\ and
       there are no back-off weights. */

    while (strncmp("\\end\\",in_line,5)) {
      if (sscanf(in_line,"%f %s",&arpa_lm->probs[0][i],
                 temp_word[1]) == 2) {
        word_copy = salloc(temp_word[1]);

        if (i==0) {
          if (strcmp("<UNK>",word_copy)) {

            /* Closed vocabulary. */

            i++;
            arpa_lm->vocab_type = CLOSED_VOCAB;
            arpa_lm->first_id = 1;

          }
          else {

            /* Open vocabulary; <UNK> gets id 0. */

            arpa_lm->vocab_type = OPEN_VOCAB_1;
            arpa_lm->first_id = 0;
            arpa_lm->vocab_size--;

          }
        }

        arpa_lm->vocab[i] = word_copy;
        sih_add(arpa_lm->vocab_ht,word_copy,i);
        i++;
        if (((arpa_lm->vocab_type == OPEN_VOCAB_1) &&
             (i>arpa_lm->table_sizes[0])) ||
            ((arpa_lm->vocab_type == CLOSED_VOCAB) &&
             (i>arpa_lm->table_sizes[0]+1))){
          quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d unigrams needed to be stored.\n",arpa_lm->table_sizes[0]);
        }
      }
      else {
        if (strlen(in_line)>1) {
          fprintf(stderr,"Warning, reading line -%s- gave unexpected input.\n",in_line);
        }
      }
      fgets(in_line,1024,arpa_fp);

    }
  }

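  /* Read the 2-gram to (n-1)-gram sections.  Each entry supplies a
     probability, the words of the n-gram, and a back-off weight.  While
     reading, the ind arrays are filled in so that each (i-1)-gram context
     points to the start of its block of i-grams, using the compressed
     indices handled by new_index() and get_full_index(). */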
  if (arpa_lm->n > 1) {

    previd = -1;

    for (i=2;i<=arpa_lm->n-1;i++) {

      printf("\nReading %d-grams...\n",i);

      previd = -1;

      j=0;

      for (k=0;k<=arpa_lm->n-1;k++) {
        pos_in_list[k] = 0;
      }

      /* temp_word[14] holds the header of the next section, which marks the
         end of the current one. */

      sprintf(temp_word[14],"\\%d-grams",i+1);
      first_one=1;
      while (strncmp(temp_word[14],temp_word[0],8)) {

        /* Read the i+2 tokens of the entry : the probability, i words, and
           the back-off weight. */

        num_of_args = 0;

        for (k=0;k<=i+1;k++) {
          if (strncmp(temp_word[0],temp_word[14],8)) {
            fscanf(arpa_fp,"%s",temp_word[k]);
          }
        }

        if (strncmp(temp_word[0],temp_word[14],8)) {

          arpa_lm->probs[i-1][j] = (prob_t) atof(temp_word[0]);
          arpa_lm->bo_weight[i-1][j] = (bo_t) atof(temp_word[i+1]);

          sih_lookup(arpa_lm->vocab_ht,temp_word[i],&temp_id);
          arpa_lm->word_id[i-1][j] = temp_id;

          /* Print a progress dot every 20000 entries. */

          if (j % 20000 == 0) {
            if (j % 1000000 == 0) {
              if (j != 0) {
                fprintf(stderr,".\n");
              }
            }
            else {
              fprintf(stderr,".");
            }
          }

          j++;
          if (j>arpa_lm->table_sizes[i-1]) {
            quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d %d-grams needed to be stored.\n",arpa_lm->table_sizes[i-1],i);
          }

          /* Map the words of this n-gram to ids, remembering the previous
             n-gram so that the position of novelty can be found. */

          for (k=0;k<=i-1;k++) {
            previous_ngram[k] = current_ngram[k];
            sih_lookup(arpa_lm->vocab_ht,temp_word[k+1],&temp_id);
            if (temp_id == 0 && strcmp(temp_word[k+1],"<UNK>")) {
              quit(-1,"Error - found unknown word in n-gram file : %s\n",
                   temp_word[k+1]);
            }
            current_ngram[k] = temp_id;
          }

          /* Find the position of novelty : the leftmost position at which
             this n-gram differs from the previous one. */

          if (first_one) {
            pos_of_novelty = 0;
            first_one = 0;
          }
          else {

            pos_of_novelty = i;

            for (k=0;k<=i-1;k++) {
              if (current_ngram[k] > previous_ngram[k]) {
                pos_of_novelty = k;
                k = arpa_lm->n;
              }
              else {
                if ((current_ngram[k] < previous_ngram[k]) && (j > 0)) {
                  quit(-1,"Error : n-grams are not correctly ordered.\n");
                }
              }
            }

            if (pos_of_novelty > i) {
              fprintf(stderr,"pos of novelty 2 = %d\n",pos_of_novelty);
            }

            /* If nothing changed, the same n-gram occurred twice. */

            if (pos_of_novelty == i && j != 1) {
              quit(-1,"Error - Repeated %d-gram in ARPA format language model.\n",
                   i);
            }
          }

          /* A new context has started, so fill in the back-off pointers for
             every context up to and including this one. */

          if (pos_of_novelty != i-1) {
            if (i==2) {

              /* For bigrams the context is just the first word id. */

              for (k = previd + 1; k <= current_ngram[0]; k++) {
                arpa_lm->ind[0][k] = new_index(j-1,
                                               arpa_lm->ptr_table[0],
                                               &(arpa_lm->ptr_table_size[0]),
                                               k);
              }
              previd = current_ngram[0];
            }
            else {

              /* For higher orders, walk down the existing tables to find the
                 position of the (i-1)-gram context in the (i-1)-gram table. */

              for (k=0;k<=i-2;k++) {

                if (k == 0) {
                  pos_in_list[0] = current_ngram[0];
                }
                else {
                  pos_in_list[k] = get_full_index(arpa_lm->ind[k-1][pos_in_list[k-1]],
                                                  arpa_lm->ptr_table[k-1],
                                                  arpa_lm->ptr_table_size[k-1],
                                                  pos_in_list[k-1]);
                  while (arpa_lm->word_id[k][pos_in_list[k]] <
                         current_ngram[k]) {
                    pos_in_list[k]++;
                  }
                  if (arpa_lm->word_id[k][pos_in_list[k]] !=
                      current_ngram[k]) {
                    quit(-1,"Error in the ARPA format language model. \nA %d-gram exists, but not the stem %d-gram.",k+2,k+1);
                  }
                }
              }
              for (k = previd + 1; k <= pos_in_list[i-2]; k++) {
                arpa_lm->ind[i-2][k] =
                  new_index(j-1,
                            arpa_lm->ptr_table[i-2],
                            &(arpa_lm->ptr_table_size[i-2]),
                            k);
              }
              previd = pos_in_list[i-2];
            }
          }
        }
      }

      /* The section is finished : point any remaining contexts just past
         the end of the i-gram table. */

      if (i==2) {

        for (k = previd + 1; k <= arpa_lm->vocab_size; k++) {
          arpa_lm->ind[0][k] = new_index(arpa_lm->num_kgrams[1],
                                         arpa_lm->ptr_table[0],
                                         &(arpa_lm->ptr_table_size[0]),
                                         k);
        }
      }
      else {
        for (k = previd + 1; k <= arpa_lm->num_kgrams[i-2]-1;k++) {
          arpa_lm->ind[i-2][k] = new_index(j,
                                           arpa_lm->ptr_table[i-2],
                                           &(arpa_lm->ptr_table_size[i-2]),
                                           k);
        }
      }

    }

    printf("\nReading %d-grams...\n",arpa_lm->n);

    first_one = 1;
    j = 0;
    previd = 0;

    arpa_lm->ind[arpa_lm->n-2][0] = 0;

    for (k=0;k<=arpa_lm->n-1;k++) {
      pos_in_list[k] = 0;
    }

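    /* Read the highest-order section.  These entries carry no back-off
       weight; only the probability and word ids are stored, and the
       (n-1)-gram ind array is completed as the contexts change. */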
    while (strncmp("\\end\\",temp_word[0],5)) {

      /* Read the n+1 tokens of the entry : the probability followed by
         n words. */

      for (k=0;k<=arpa_lm->n;k++) {
        if (strncmp(temp_word[0],"\\end\\",5)) {
          fscanf(arpa_fp,"%s",temp_word[k]);
        }
      }

      if (strncmp(temp_word[0],"\\end\\",5)) {

        if (j % 20000 == 0) {
          if (j % 1000000 == 0) {
            if (j != 0) {
              fprintf(stderr,".\n");
            }
          }
          else {
            fprintf(stderr,".");
          }
        }

        arpa_lm->probs[arpa_lm->n-1][j] = (prob_t) atof(temp_word[0]);
        sih_lookup(arpa_lm->vocab_ht,temp_word[arpa_lm->n],&temp_id);

        arpa_lm->word_id[arpa_lm->n-1][j] = temp_id;

        j++;

        for (k=0;k<=arpa_lm->n-1;k++) {
          previous_ngram[k] = current_ngram[k];
          sih_lookup(arpa_lm->vocab_ht,temp_word[k+1],&temp_id);
          if (temp_id == 0 && strcmp(temp_word[k+1],"<UNK>")) {
            quit(-1,"Error - found unknown word in n-gram file : %s\n",
                 temp_word[k+1]);
          }
          current_ngram[k] = temp_id;
        }

        if (first_one) {
          pos_of_novelty = 0;
          first_one = 0;
        }
        else {

          pos_of_novelty = arpa_lm->n+1;

          for (k=0;k<=arpa_lm->n-1;k++) {
            if (current_ngram[k] > previous_ngram[k]) {
              pos_of_novelty = k;
              k = arpa_lm->n;
            }
            else {
              if ((current_ngram[k] < previous_ngram[k]) && (j>0)) {
                quit(-1,"Error : n-grams are not correctly ordered.\n");
              }
            }
          }

          if ( pos_of_novelty == arpa_lm->n+1 && j != 1 ) {
            quit(-1,"Error : Same %d-gram occurs twice in ARPA format LM.\n",
                 arpa_lm->n);
          }
        }
        if (pos_of_novelty != arpa_lm->n-1) {

          for (k=0;k<=arpa_lm->n-2;k++) {

            if (k == 0) {
              pos_in_list[0] = current_ngram[0];
            }
            else {
              pos_in_list[k] = get_full_index(arpa_lm->ind[k-1][pos_in_list[k-1]],
                                              arpa_lm->ptr_table[k-1],
                                              arpa_lm->ptr_table_size[k-1],
                                              pos_in_list[k-1]);
              while (arpa_lm->word_id[k][pos_in_list[k]] <
                     current_ngram[k]) {
                pos_in_list[k]++;
              }

              if (arpa_lm->word_id[k][pos_in_list[k]] != current_ngram[k]) {
                quit(-1,"Error in the ARPA format language model. \nA %d-gram exists, but not the stem %d-gram.",k+2,k+1);
              }
            }
          }
          for (k = previd + 1; k <= pos_in_list[arpa_lm->n-2]; k++) {
            arpa_lm->ind[arpa_lm->n-2][k] =
              new_index(j-1,
                        arpa_lm->ptr_table[arpa_lm->n-2],
                        &(arpa_lm->ptr_table_size[arpa_lm->n-2]),
                        k);
          }
          previd = pos_in_list[arpa_lm->n-2];
        }

        if (j>arpa_lm->table_sizes[arpa_lm->n-1]) {
          quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d %d-grams needed to be stored.\n",arpa_lm->table_sizes[arpa_lm->n-1],arpa_lm->n);
        }
      }
    }

    /* Point any remaining (n-1)-gram contexts past the end of the n-gram
       table. */

    for (k = previd + 1; k <= arpa_lm->num_kgrams[arpa_lm->n-2]; k++) {
      arpa_lm->ind[arpa_lm->n-2][k] =
        new_index(j,
                  arpa_lm->ptr_table[arpa_lm->n-2],
                  &(arpa_lm->ptr_table_size[arpa_lm->n-2]),
                  k);
    }

  }

  free(previous_ngram);
  free(current_ngram);
  free(pos_in_list);
  free(in_line);
  free(input_line);
  rr_iclose(arpa_fp);

}