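/* Load a language model into memory, either from a binary-format model
   file (load_lm) or from an ARPA format text file (load_arpa_lm).

   Illustrative usage sketch (the file names and the surrounding driver
   code are hypothetical, not part of this file):

     ng_t ng;
     arpa_lm_t arpa_lm;

     load_lm(&ng, "model.binlm");
     load_arpa_lm(&arpa_lm, "model.arpa");
*/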
#include <stdio.h>
#include "rr_libs/general.h"
#include "rr_libs/sih.h"
#include "ngram.h"
#include "evallm.h"
#include <string.h>
#include <stdlib.h>
#include "idngram2lm.h"

#define BBO_FILE_VERSION 970314

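/*
 * load_lm : read a language model stored in binary format into the ng_t
 * structure.  The fixed header fields are read first, followed by the
 * vocabulary hash table, the count and back-off weight lookup tables, the
 * discounting parameters, and finally the per-order count, back-off
 * weight, pointer and word-id arrays.
 */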
void load_lm(ng_t *ng,
             char *lm_filename) {

  int i;

  ng->bin_fp = rr_iopen(lm_filename);

  rr_fread(&ng->version,sizeof(int),1,ng->bin_fp,"from lm file",0);

  if (ng->version != BBO_FILE_VERSION) {
    quit(-1,"Error : Language model file %s appears to be corrupted.\n",
         lm_filename);
  }

  /* Read the fixed-size header fields describing the model. */

  rr_fread(&ng->n,sizeof(unsigned short),1,ng->bin_fp,"n",0);

  rr_fread(&ng->vocab_size,sizeof(unsigned short),1,ng->bin_fp,"vocab_size",0);
  rr_fread(&ng->no_of_ccs,sizeof(unsigned short),1,ng->bin_fp,"no_of_ccs",0);

  rr_fread(&ng->vocab_type,sizeof(unsigned short),1,ng->bin_fp,"vocab_type",0);

  rr_fread(&ng->count_table_size,sizeof(count_ind_t),1,
           ng->bin_fp,"count_table_size",0);
  rr_fread(&ng->discounting_method,sizeof(unsigned short),1,
           ng->bin_fp,"discounting_method",0);

  rr_fread(&ng->min_alpha,sizeof(double),
           1,ng->bin_fp,"min_alpha",0);
  rr_fread(&ng->max_alpha,sizeof(double),
           1,ng->bin_fp,"max_alpha",0);
  rr_fread(&ng->out_of_range_alphas,sizeof(unsigned short),
           1,ng->bin_fp,"out_of_range_alphas",0);

  rr_fread(&ng->size_of_alpha_array,sizeof(unsigned short),
           1,ng->bin_fp,"size_of_alpha_array",0);

  rr_fread(&ng->n_unigrams,sizeof(int),1,ng->bin_fp,"n_unigrams",0);
  rr_fread(&ng->zeroton_fraction,sizeof(double),1,
           ng->bin_fp,"zeroton_fraction",0);

  rr_fread(&ng->oov_fraction,sizeof(double),1,
           ng->bin_fp,"oov_fraction",0);
  rr_fread(&ng->four_byte_counts,sizeof(flag),1,
           ng->bin_fp,"four_byte_counts",0);
  rr_fread(&ng->four_byte_alphas,sizeof(flag),1,
           ng->bin_fp,"four_byte_alphas",0);
  rr_fread(&ng->first_id,sizeof(unsigned short),1,
           ng->bin_fp,"first_id",0);

  /* Read the vocabulary hash table and build the id -> word array.
     Entry 0 of the vocabulary is set to <UNK>. */

  ng->vocab_ht = (sih_t *) rr_malloc(sizeof(sih_t));
  sih_val_read_from_file(ng->vocab_ht,ng->bin_fp,lm_filename,0);
  get_vocab_from_vocab_ht(ng->vocab_ht,ng->vocab_size,0,&ng->vocab);
  ng->vocab[0] = salloc("<UNK>");

  /* Marginal unigram counts, stored either as 4-byte ints or as indices
     into the count table. */

  if (ng->four_byte_counts) {
    ng->marg_counts4 = (int *)
      rr_malloc(sizeof(int)*(ng->vocab_size+1));
    rr_fread(ng->marg_counts4,sizeof(int),ng->vocab_size+1,
             ng->bin_fp,"marg_counts",0);
  }
  else {
    ng->marg_counts = (count_ind_t *)
      rr_malloc(sizeof(count_ind_t)*(ng->vocab_size+1));
    rr_fread(ng->marg_counts,sizeof(count_ind_t),ng->vocab_size+1,
             ng->bin_fp,"marg_counts",0);
  }

  /* Array of stored back-off weight (alpha) values. */

  ng->alpha_array = (double *)
    rr_malloc(sizeof(double)*(ng->size_of_alpha_array));
  rr_fread(ng->alpha_array,sizeof(double),
           ng->size_of_alpha_array,ng->bin_fp,"alpha_array",0);

  /* Count lookup tables, only stored when counts are kept as indices
     rather than as 4-byte values. */

  ng->count_table = (count_t **) rr_malloc(sizeof(count_t *)*ng->n);
  if (!ng->four_byte_counts) {
    for (i=0;i<=ng->n-1;i++) {
      ng->count_table[i] = (count_t *)
        rr_malloc(sizeof(count_t)*(ng->count_table_size+1));
      rr_fread(ng->count_table[i],sizeof(count_t),
               ng->count_table_size+1,ng->bin_fp,"count_table",0);
    }
  }

  /* Pointer compression tables used to store the n-gram indices compactly. */

  ng->ptr_table_size = (unsigned short *)
    rr_malloc(sizeof(unsigned short)*ng->n);
  rr_fread(ng->ptr_table_size,sizeof(unsigned short),
           ng->n,ng->bin_fp,"ptr_table_size",0);

  ng->ptr_table = (ptr_tab_t **) rr_malloc(sizeof(ptr_tab_t *)*ng->n);

  for (i=0;i<=ng->n-1;i++) {
    ng->ptr_table[i] = (ptr_tab_t *)
      rr_malloc(sizeof(ptr_tab_t)*ng->ptr_table_size[i]);
    rr_fread(ng->ptr_table[i],sizeof(ptr_tab_t),
             ng->ptr_table_size[i],ng->bin_fp,"ptr_table",0);
  }

  /* Unigram probabilities and context-cue flags. */

  ng->uni_probs = (uni_probs_t *)
    rr_malloc(sizeof(uni_probs_t)*(ng->vocab_size+1));
  ng->uni_log_probs = (uni_probs_t *)
    rr_malloc(sizeof(uni_probs_t)*(ng->vocab_size+1));
  ng->context_cue = (flag *)
    rr_malloc(sizeof(flag)*(ng->vocab_size+1));

  rr_fread(ng->uni_probs,sizeof(uni_probs_t),ng->vocab_size+1,
           ng->bin_fp,"uni_probs",0);
  rr_fread(ng->uni_log_probs,sizeof(uni_probs_t),ng->vocab_size+1,
           ng->bin_fp,"uni_log_probs",0);
  rr_fread(ng->context_cue,sizeof(flag),ng->vocab_size+1,
           ng->bin_fp,"context_cue",0);

  ng->cutoffs = (cutoff_t *) rr_malloc(sizeof(cutoff_t)*ng->n);
  rr_fread(ng->cutoffs,sizeof(cutoff_t),ng->n,ng->bin_fp,"cutoffs",0);

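  /* Discounting parameters.  Which arrays are present in the file depends
     on the discounting method that was used to build the model. */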
  switch (ng->discounting_method) {
  case GOOD_TURING:
    ng->fof_size = (unsigned short *) rr_malloc(sizeof(unsigned short)*ng->n);
    ng->disc_range = (unsigned short *)
      rr_malloc(sizeof(unsigned short)*ng->n);
    rr_fread(ng->fof_size,sizeof(unsigned short),ng->n,
             ng->bin_fp,"fof_size",0);
    rr_fread(ng->disc_range,sizeof(unsigned short),ng->n,
             ng->bin_fp,"disc_range",0);
    ng->freq_of_freq = (int **) rr_malloc(sizeof(int *)*ng->n);
    for (i=0;i<=ng->n-1;i++) {
      ng->freq_of_freq[i] = (int *) rr_calloc(ng->fof_size[i]+1,sizeof(int));
    }
    ng->gt_disc_ratio = (disc_val_t **) rr_malloc(sizeof(disc_val_t *)*ng->n);
    for (i=0;i<=ng->n-1;i++){
      ng->gt_disc_ratio[i] = (disc_val_t *)
        rr_malloc(sizeof(disc_val_t)*(ng->disc_range[i]+1));
    }
    for (i=0;i<=ng->n-1;i++) {
      rr_fread(ng->freq_of_freq[i],sizeof(int),
               ng->fof_size[i]+1,ng->bin_fp,"freq_of_freq",0);
    }
    for (i=0;i<=ng->n-1;i++) {
      rr_fread(ng->gt_disc_ratio[i],sizeof(disc_val_t),
               ng->disc_range[i]+1,ng->bin_fp,"gt_disc_ratio",0);
    }
    break;
  case WITTEN_BELL:
    break;
  case LINEAR:
    ng->lin_disc_ratio = (disc_val_t *) rr_malloc(sizeof(disc_val_t)*ng->n);
    rr_fread(ng->lin_disc_ratio,sizeof(disc_val_t),ng->n,ng->bin_fp,"lin_disc_ratio",0);
    break;
  case ABSOLUTE:
    ng->abs_disc_const = (double *) rr_malloc(sizeof(double)*ng->n);
    rr_fread(ng->abs_disc_const,sizeof(double),ng->n,ng->bin_fp,"abs_disc_const",0);
    break;
  }

  /* Allocate the main n-gram tables.  Level 0 (unigrams) is indexed by
     word id, so it has vocab_size+1 entries; levels 1..n-1 hold
     num_kgrams[i] entries each. */

  ng->num_kgrams = (int *) rr_malloc(sizeof(int)*ng->n);
  rr_fread(ng->num_kgrams,sizeof(int),ng->n,ng->bin_fp,"num_kgrams",0);

  ng->count = (count_ind_t **) rr_malloc(sizeof(count_ind_t *)*ng->n);
  ng->count4 = (int **) rr_malloc(sizeof(int *)*ng->n);

  if (ng->four_byte_counts) {
    ng->count4[0] = (int *) rr_malloc(sizeof(int)*(ng->vocab_size+1));
    for (i=1;i<=ng->n-1;i++) {
      ng->count4[i] = (int *) rr_malloc(sizeof(int)*ng->num_kgrams[i]);
    }
  }
  else {
    ng->count[0] = (count_ind_t *)
      rr_malloc(sizeof(count_ind_t)*(ng->vocab_size+1));
    for (i=1;i<=ng->n-1;i++) {
      ng->count[i] = (count_ind_t *)
        rr_malloc(sizeof(count_ind_t)*ng->num_kgrams[i]);
    }
  }

  /* Back-off weights are only needed for orders 1..n-1. */

  if (ng->four_byte_alphas) {
    ng->bo_weight4 = (four_byte_t **) rr_malloc(sizeof(four_byte_t *)*ng->n);
    ng->bo_weight4[0] = (four_byte_t *)
      rr_malloc(sizeof(four_byte_t)*(ng->vocab_size+1));
    for (i=1;i<=ng->n-2;i++) {
      ng->bo_weight4[i] = (four_byte_t *)
        rr_malloc(sizeof(four_byte_t)*ng->num_kgrams[i]);
    }
  }
  else {
    ng->bo_weight = (bo_weight_t **) rr_malloc(sizeof(bo_weight_t *)*ng->n);
    ng->bo_weight[0] = (bo_weight_t *)
      rr_malloc(sizeof(bo_weight_t)*(ng->vocab_size+1));
    for (i=1;i<=ng->n-2;i++) {
      ng->bo_weight[i] = (bo_weight_t *)
        rr_malloc(sizeof(bo_weight_t)*ng->num_kgrams[i]);
    }
  }

  /* ind[i] holds the pointers from the (i+1)-gram table into the
     (i+2)-gram table. */

  ng->ind = (index__t **) rr_malloc(sizeof(index__t *)*ng->n);
  ng->ind[0] = (index__t *)
    rr_malloc(sizeof(index__t)*(ng->vocab_size+1));
  for (i=1;i<=ng->n-2;i++) {
    ng->ind[i] = (index__t *)
      rr_malloc(sizeof(index__t)*ng->num_kgrams[i]);
  }

  ng->word_id = (id__t **) rr_malloc(sizeof(id__t *)*ng->n);
  for (i=1;i<=ng->n-1;i++) {
    ng->word_id[i] = (id__t *)
      rr_malloc(sizeof(id__t)*ng->num_kgrams[i]);
  }

  /* Unigram counts and back-off weights. */

  if (ng->four_byte_counts) {
    rr_fread(ng->count4[0],sizeof(int),ng->vocab_size+1,
             ng->bin_fp,"unigram counts",0);
  }
  else {
    rr_fread(ng->count[0],sizeof(count_ind_t),ng->vocab_size+1,
             ng->bin_fp,"unigram counts",0);
  }
  if (ng->four_byte_alphas) {
    rr_fread(ng->bo_weight4[0],sizeof(four_byte_t),ng->vocab_size+1,
             ng->bin_fp,"unigram backoff weights",0);
  }
  else {
    rr_fread(ng->bo_weight[0],sizeof(bo_weight_t),ng->vocab_size+1,
             ng->bin_fp,"unigram backoff weights",0);
  }

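  /* Higher-order tables: the unigram->bigram pointers, then for each order
     the word ids, counts, back-off weights, and pointers into the next
     level. */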
  if (ng->n > 1) {
    rr_fread(ng->ind[0],sizeof(index__t),ng->vocab_size+1,
             ng->bin_fp,"unigram -> bigram pointers",0);
  }

  for (i=1;i<=ng->n-1;i++) {
    rr_fread(ng->word_id[i],sizeof(id__t),ng->num_kgrams[i],
             ng->bin_fp,"word ids",0);
  }

  if (ng->four_byte_counts) {
    for (i=1;i<=ng->n-1;i++) {
      rr_fread(ng->count4[i],sizeof(int),ng->num_kgrams[i],
               ng->bin_fp,"counts",0);
    }
  }
  else {
    for (i=1;i<=ng->n-1;i++) {
      rr_fread(ng->count[i],sizeof(count_ind_t),ng->num_kgrams[i],
               ng->bin_fp,"counts",0);
    }
  }

  for (i=1;i<=ng->n-2;i++) {
    if (ng->four_byte_alphas) {
      rr_fread(ng->bo_weight4[i],sizeof(four_byte_t),ng->num_kgrams[i],
               ng->bin_fp,"back off weights",0);
    }
    else {
      rr_fread(ng->bo_weight[i],sizeof(bo_weight_t),ng->num_kgrams[i],
               ng->bin_fp,"back off weights",0);
    }
  }

  for (i=1;i<=ng->n-2;i++) {
    rr_fread(ng->ind[i],sizeof(index__t),ng->num_kgrams[i],
             ng->bin_fp,"indices",0);
  }

  rr_iclose(ng->bin_fp);

}

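/*
 * load_arpa_lm : read a language model in ARPA text format into the
 * arpa_lm_t structure.  The header is parsed to obtain n and the table
 * sizes, the unigram section defines the vocabulary, and each higher-order
 * section is read in turn while the back-off pointer arrays (ind) linking
 * each order to the next are constructed on the fly.
 */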
void load_arpa_lm(arpa_lm_t *arpa_lm,
                  char *lm_filename) {

  FILE *arpa_fp;
  char *in_line;
  char *input_line;
  char temp_word[15][1024];
  int i,j,k;
  int num_of_args;
  int pos_of_novelty;
  char *input_line_ptr_orig;
  char *word_copy;
  id__t *previous_ngram;
  id__t *current_ngram;
  int temp_id;
  int *pos_in_list;
  int previd;
  flag first_one;

  in_line = (char *) rr_malloc(1024*sizeof(char));
  input_line = (char *) rr_malloc(1024*sizeof(char));

  input_line_ptr_orig = input_line;

  /* temp_word[0] is tested before it is first written to, so make sure it
     starts out as an empty string. */
  temp_word[0][0] = '\0';

  arpa_fp = rr_iopen(lm_filename);

  /* Skip everything up to the \data\ marker. */

  while (strncmp("\\data\\",in_line,6)) {
    if (!rr_feof(arpa_fp)) {
      fgets(in_line,1024,arpa_fp);
    }
    else {
      quit(-1,"Error reading arpa language model file. Unexpected end of file.\n");
    }
  }

  /* Parse the header lines of the form "ngram k=count" to find n and the
     number of k-grams for each order. */

  arpa_lm->table_sizes = (int *) rr_malloc(sizeof(int)*11);
  arpa_lm->num_kgrams = (int *) rr_malloc(sizeof(int)*11);

  fgets(in_line,1024,arpa_fp);

  i = 0;

  while (strncmp("\\1-grams",in_line,8)) {
    if (sscanf(in_line,"%s %s",temp_word[1],temp_word[2]) == 2) {
      if (!strcmp("ngram",temp_word[1])) {
        i = temp_word[2][0]-'0';
        arpa_lm->table_sizes[i-1]=atoi(&(temp_word[2][2]));
      }
    }

    fgets(in_line,1024,arpa_fp);

  }

  if (i==0) {
    quit(-1,"Error parsing ARPA format language model.\n");
  }

  arpa_lm->n = i;

  previous_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));
  current_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));

  printf("Reading in a %d-gram language model.\n",arpa_lm->n);
  for (i=0;i<=arpa_lm->n-1;i++) {
    printf("Number of %d-grams = %d.\n",i+1,arpa_lm->table_sizes[i]);
    arpa_lm->num_kgrams[i]=arpa_lm->table_sizes[i];
  }

  pos_in_list = (int *) rr_malloc(sizeof(int) * arpa_lm->n);

  arpa_lm->word_id = (id__t **) rr_malloc(sizeof(id__t *) * arpa_lm->n);
  for (i=1;i<=arpa_lm->n-1;i++) {
    arpa_lm->word_id[i] = (id__t *) rr_malloc(sizeof(id__t) *
                                              arpa_lm->table_sizes[i]);
  }

  arpa_lm->bo_weight = (bo_t **) rr_malloc(sizeof(bo_t *) * (arpa_lm->n-1));
  for (i=0;i<=arpa_lm->n-2;i++) {
    arpa_lm->bo_weight[i] = (bo_t *) rr_malloc(sizeof(bo_t) *
                                               arpa_lm->table_sizes[i]);
  }

  arpa_lm->ind = (index__t **) rr_malloc(sizeof(index__t *) * (arpa_lm->n-1));
  for (i=0;i<=arpa_lm->n-2;i++) {
    arpa_lm->ind[i] = (index__t *) rr_malloc(sizeof(index__t) *
                                             arpa_lm->table_sizes[i]);
  }

  arpa_lm->probs = (prob_t **) rr_malloc(sizeof(prob_t *) * arpa_lm->n);
  for (i=0;i<=arpa_lm->n-1;i++) {
    arpa_lm->probs[i] = (prob_t *) rr_malloc(sizeof(prob_t) *
                                             arpa_lm->table_sizes[i]);
  }

  arpa_lm->ptr_table = (int **) rr_malloc(sizeof(int *)*arpa_lm->n);
  arpa_lm->ptr_table_size = (unsigned short *)
    rr_calloc(arpa_lm->n,sizeof(unsigned short));

  for (i=0;i<=arpa_lm->n-1;i++) {
    arpa_lm->ptr_table[i] = (int *) rr_calloc(65535,sizeof(int));
  }

  arpa_lm->vocab_ht = sih_create(1000,0.5,2.0,1);
  arpa_lm->vocab = (char **) rr_malloc(sizeof(char *)*
                                       (arpa_lm->table_sizes[0]+1));
  arpa_lm->vocab_size = arpa_lm->table_sizes[0];

  /* Read the unigram section.  This also defines the vocabulary: the first
     unigram decides whether the model has an open vocabulary (first word is
     <UNK>, which gets id 0) or a closed one (ids start at 1). */

  printf("Reading unigrams...\n");

  i=0;

  fgets(in_line,1024,arpa_fp);

  if (arpa_lm->n > 1) {

    while (strncmp("\\2-grams",in_line,8)) {
      if (sscanf(in_line,"%f %s %f",&arpa_lm->probs[0][i],
                 temp_word[1],&arpa_lm->bo_weight[0][i]) == 3) {
        word_copy = salloc(temp_word[1]);

        if (i==0) {
          if (strcmp("<UNK>",word_copy)) {

            /* First word is not <UNK>, so this is a closed vocabulary. */

            i++;
            arpa_lm->vocab_type = CLOSED_VOCAB;
            arpa_lm->first_id = 1;

          }
          else {

            /* First word is <UNK>, so this is an open vocabulary. */

            arpa_lm->vocab_type = OPEN_VOCAB_1;
            arpa_lm->first_id = 0;
            arpa_lm->vocab_size--;

          }
        }

        arpa_lm->vocab[i] = word_copy;
        sih_add(arpa_lm->vocab_ht,word_copy,i);
        i++;
        if (((arpa_lm->vocab_type == OPEN_VOCAB_1) &&
             (i>arpa_lm->table_sizes[0])) ||
            ((arpa_lm->vocab_type == CLOSED_VOCAB) &&
             (i>arpa_lm->table_sizes[0]+1))){
          quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d unigrams needed to be stored.\n",arpa_lm->table_sizes[0]);
        }
      }
      else {
        if (strlen(in_line)>1) {
          fprintf(stderr,"Warning, reading line -%s- gave unexpected input.\n",in_line);
        }
      }
      fgets(in_line,1024,arpa_fp);

    }
  }
  else {

    /* Unigram-only model: the unigram section runs straight to \end\ and
       there are no back-off weights. */

    while (strncmp("\\end\\",in_line,5)) {
      if (sscanf(in_line,"%f %s",&arpa_lm->probs[0][i],
                 temp_word[1]) == 2) {
        word_copy = salloc(temp_word[1]);

        if (i==0) {
          if (strcmp("<UNK>",word_copy)) {

            /* Closed vocabulary. */

            i++;
            arpa_lm->vocab_type = CLOSED_VOCAB;
            arpa_lm->first_id = 1;

          }
          else {

            /* Open vocabulary; <UNK> gets id 0. */

            arpa_lm->vocab_type = OPEN_VOCAB_1;
            arpa_lm->first_id = 0;
            arpa_lm->vocab_size--;

          }
        }

        arpa_lm->vocab[i] = word_copy;
        sih_add(arpa_lm->vocab_ht,word_copy,i);
        i++;
        if (((arpa_lm->vocab_type == OPEN_VOCAB_1) &&
             (i>arpa_lm->table_sizes[0])) ||
            ((arpa_lm->vocab_type == CLOSED_VOCAB) &&
             (i>arpa_lm->table_sizes[0]+1))){
          quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d unigrams needed to be stored.\n",arpa_lm->table_sizes[0]);
        }
      }
      else {
        if (strlen(in_line)>1) {
          fprintf(stderr,"Warning, reading line -%s- gave unexpected input.\n",in_line);
        }
      }
      fgets(in_line,1024,arpa_fp);

    }
  }

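  /* Read the 2-gram to (n-1)-gram sections.  Each entry supplies a
     probability, the words of the n-gram, and a back-off weight.  While
     reading, the ind arrays are filled in so that each (i-1)-gram context
     points to the start of its block of i-grams, using the compressed
     indices handled by new_index() and get_full_index(). */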
  if (arpa_lm->n > 1) {

    previd = -1;

    for (i=2;i<=arpa_lm->n-1;i++) {

      printf("\nReading %d-grams...\n",i);

      previd = -1;

      j=0;

      for (k=0;k<=arpa_lm->n-1;k++) {
        pos_in_list[k] = 0;
      }

      /* temp_word[14] holds the header of the next section, which marks the
         end of the current one. */

      sprintf(temp_word[14],"\\%d-grams",i+1);
      first_one=1;
      while (strncmp(temp_word[14],temp_word[0],8)) {

        /* Read the i+2 tokens of the entry : the probability, i words, and
           the back-off weight. */

        num_of_args = 0;

        for (k=0;k<=i+1;k++) {
          if (strncmp(temp_word[0],temp_word[14],8)) {
            fscanf(arpa_fp,"%s",temp_word[k]);
          }
        }

        if (strncmp(temp_word[0],temp_word[14],8)) {

          arpa_lm->probs[i-1][j] = (prob_t) atof(temp_word[0]);
          arpa_lm->bo_weight[i-1][j] = (bo_t) atof(temp_word[i+1]);

          sih_lookup(arpa_lm->vocab_ht,temp_word[i],&temp_id);
          arpa_lm->word_id[i-1][j] = temp_id;

          /* Print a progress dot every 20000 entries. */

          if (j % 20000 == 0) {
            if (j % 1000000 == 0) {
              if (j != 0) {
                fprintf(stderr,".\n");
              }
            }
            else {
              fprintf(stderr,".");
            }
          }

          j++;
          if (j>arpa_lm->table_sizes[i-1]) {
            quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d %d-grams needed to be stored.\n",arpa_lm->table_sizes[i-1],i);
          }

          /* Map the words of this n-gram to ids, remembering the previous
             n-gram so that the position of novelty can be found. */

          for (k=0;k<=i-1;k++) {
            previous_ngram[k] = current_ngram[k];
            sih_lookup(arpa_lm->vocab_ht,temp_word[k+1],&temp_id);
            if (temp_id == 0 && strcmp(temp_word[k+1],"<UNK>")) {
              quit(-1,"Error - found unknown word in n-gram file : %s\n",
                   temp_word[k+1]);
            }
            current_ngram[k] = temp_id;
          }

          /* Find the position of novelty : the leftmost position at which
             this n-gram differs from the previous one. */

          if (first_one) {
            pos_of_novelty = 0;
            first_one = 0;
          }
          else {

            pos_of_novelty = i;

            for (k=0;k<=i-1;k++) {
              if (current_ngram[k] > previous_ngram[k]) {
                pos_of_novelty = k;
                k = arpa_lm->n;
              }
              else {
                if ((current_ngram[k] < previous_ngram[k]) && (j > 0)) {
                  quit(-1,"Error : n-grams are not correctly ordered.\n");
                }
              }
            }

            if (pos_of_novelty > i) {
              fprintf(stderr,"pos of novelty 2 = %d\n",pos_of_novelty);
            }

            /* If nothing changed, the same n-gram occurred twice. */

            if (pos_of_novelty == i && j != 1) {
              quit(-1,"Error - Repeated %d-gram in ARPA format language model.\n",
                   i);
            }
          }

          /* A new context has started, so fill in the back-off pointers for
             every context up to and including this one. */

          if (pos_of_novelty != i-1) {
            if (i==2) {

              /* For bigrams the context is just the first word id. */

              for (k = previd + 1; k <= current_ngram[0]; k++) {
                arpa_lm->ind[0][k] = new_index(j-1,
                                               arpa_lm->ptr_table[0],
                                               &(arpa_lm->ptr_table_size[0]),
                                               k);
              }
              previd = current_ngram[0];
            }
            else {

              /* For higher orders, walk down the existing tables to find the
                 position of the (i-1)-gram context in the (i-1)-gram table. */

              for (k=0;k<=i-2;k++) {

                if (k == 0) {
                  pos_in_list[0] = current_ngram[0];
                }
                else {
                  pos_in_list[k] = get_full_index(arpa_lm->ind[k-1][pos_in_list[k-1]],
                                                  arpa_lm->ptr_table[k-1],
                                                  arpa_lm->ptr_table_size[k-1],
                                                  pos_in_list[k-1]);
                  while (arpa_lm->word_id[k][pos_in_list[k]] <
                         current_ngram[k]) {
                    pos_in_list[k]++;
                  }
                  if (arpa_lm->word_id[k][pos_in_list[k]] !=
                      current_ngram[k]) {
                    quit(-1,"Error in the ARPA format language model. \nA %d-gram exists, but not the stem %d-gram.",k+2,k+1);
                  }
                }
              }
              for (k = previd + 1; k <= pos_in_list[i-2]; k++) {
                arpa_lm->ind[i-2][k] =
                  new_index(j-1,
                            arpa_lm->ptr_table[i-2],
                            &(arpa_lm->ptr_table_size[i-2]),
                            k);
              }
              previd = pos_in_list[i-2];
            }
          }
        }
      }

      /* The section is finished : point any remaining contexts just past
         the end of the i-gram table. */

      if (i==2) {

        for (k = previd + 1; k <= arpa_lm->vocab_size; k++) {
          arpa_lm->ind[0][k] = new_index(arpa_lm->num_kgrams[1],
                                         arpa_lm->ptr_table[0],
                                         &(arpa_lm->ptr_table_size[0]),
                                         k);
        }
      }
      else {
        for (k = previd + 1; k <= arpa_lm->num_kgrams[i-2]-1;k++) {
          arpa_lm->ind[i-2][k] = new_index(j,
                                           arpa_lm->ptr_table[i-2],
                                           &(arpa_lm->ptr_table_size[i-2]),
                                           k);
        }
      }

    }

    printf("\nReading %d-grams...\n",arpa_lm->n);

    first_one = 1;
    j = 0;
    previd = 0;

    arpa_lm->ind[arpa_lm->n-2][0] = 0;

    for (k=0;k<=arpa_lm->n-1;k++) {
      pos_in_list[k] = 0;
    }

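    /* Read the highest-order section.  These entries carry no back-off
       weight; only the probability and word ids are stored, and the
       (n-1)-gram ind array is completed as the contexts change. */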
    while (strncmp("\\end\\",temp_word[0],5)) {

      /* Read the n+1 tokens of the entry : the probability followed by
         n words. */

      for (k=0;k<=arpa_lm->n;k++) {
        if (strncmp(temp_word[0],"\\end\\",5)) {
          fscanf(arpa_fp,"%s",temp_word[k]);
        }
      }

      if (strncmp(temp_word[0],"\\end\\",5)) {

        if (j % 20000 == 0) {
          if (j % 1000000 == 0) {
            if (j != 0) {
              fprintf(stderr,".\n");
            }
          }
          else {
            fprintf(stderr,".");
          }
        }

        arpa_lm->probs[arpa_lm->n-1][j] = (prob_t) atof(temp_word[0]);
        sih_lookup(arpa_lm->vocab_ht,temp_word[arpa_lm->n],&temp_id);

        arpa_lm->word_id[arpa_lm->n-1][j] = temp_id;

        j++;

        for (k=0;k<=arpa_lm->n-1;k++) {
          previous_ngram[k] = current_ngram[k];
          sih_lookup(arpa_lm->vocab_ht,temp_word[k+1],&temp_id);
          if (temp_id == 0 && strcmp(temp_word[k+1],"<UNK>")) {
            quit(-1,"Error - found unknown word in n-gram file : %s\n",
                 temp_word[k+1]);
          }
          current_ngram[k] = temp_id;
        }

        if (first_one) {
          pos_of_novelty = 0;
          first_one = 0;
        }
        else {

          pos_of_novelty = arpa_lm->n+1;

          for (k=0;k<=arpa_lm->n-1;k++) {
            if (current_ngram[k] > previous_ngram[k]) {
              pos_of_novelty = k;
              k = arpa_lm->n;
            }
            else {
              if ((current_ngram[k] < previous_ngram[k]) && (j>0)) {
                quit(-1,"Error : n-grams are not correctly ordered.\n");
              }
            }
          }

          if ( pos_of_novelty == arpa_lm->n+1 && j != 1 ) {
            quit(-1,"Error : Same %d-gram occurs twice in ARPA format LM.\n",
                 arpa_lm->n);
          }
        }
        if (pos_of_novelty != arpa_lm->n-1) {

          for (k=0;k<=arpa_lm->n-2;k++) {

            if (k == 0) {
              pos_in_list[0] = current_ngram[0];
            }
            else {
              pos_in_list[k] = get_full_index(arpa_lm->ind[k-1][pos_in_list[k-1]],
                                              arpa_lm->ptr_table[k-1],
                                              arpa_lm->ptr_table_size[k-1],
                                              pos_in_list[k-1]);
              while (arpa_lm->word_id[k][pos_in_list[k]] <
                     current_ngram[k]) {
                pos_in_list[k]++;
              }

              if (arpa_lm->word_id[k][pos_in_list[k]] != current_ngram[k]) {
                quit(-1,"Error in the ARPA format language model. \nA %d-gram exists, but not the stem %d-gram.",k+2,k+1);
              }
            }
          }
          for (k = previd + 1; k <= pos_in_list[arpa_lm->n-2]; k++) {
            arpa_lm->ind[arpa_lm->n-2][k] =
              new_index(j-1,
                        arpa_lm->ptr_table[arpa_lm->n-2],
                        &(arpa_lm->ptr_table_size[arpa_lm->n-2]),
                        k);
          }
          previd = pos_in_list[arpa_lm->n-2];
        }

        if (j>arpa_lm->table_sizes[arpa_lm->n-1]) {
          quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d %d-grams needed to be stored.\n",arpa_lm->table_sizes[arpa_lm->n-1],arpa_lm->n);
        }
      }
    }

    /* Point any remaining (n-1)-gram contexts past the end of the n-gram
       table. */

    for (k = previd + 1; k <= arpa_lm->num_kgrams[arpa_lm->n-2]; k++) {
      arpa_lm->ind[arpa_lm->n-2][k] =
        new_index(j,
                  arpa_lm->ptr_table[arpa_lm->n-2],
                  &(arpa_lm->ptr_table_size[arpa_lm->n-2]),
                  k);
    }

  }

  free(previous_ngram);
  free(current_ngram);
  free(pos_in_list);
  free(in_line);
  free(input_line);
  rr_iclose(arpa_fp);

}