00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include "toolkit.h"
00021 #include "ngram.h"
00022 #include "pc_libs/pc_general.h"
00023 #include "idngram2lm.h"
00024 #include "rr_libs/sih.h"
00025 #include "rr_libs/general.h"
00026 #include <stdio.h>
00027 #include <stdlib.h>
00028 #include <string.h>
00029
00035 void main (int argc, char **argv) {
00036
00037 int i,j;
00038 ng_t ng;
00039 int verbosity;
00040 int mem_alloc_method;
00041
00042 int buffer_size;
00043 flag is_ascii;
00044 flag ascii_flag;
00045 flag bin_flag;
00046 char current_cc[200];
00047 int current_cc_id;
00048 int test_cc_id;
00049 ngram current_ngram;
00050 ngram previous_ngram;
00051 int *ng_count;
00052
00053 int nlines;
00054 int pos_of_novelty;
00055 int prev_id1;
00056 flag contains_unks;
00057 int mem_alloced;
00058 char wlist_entry[1024];
00059 int end_size;
00060 int middle_size;
00061
00062 flag displayed_oov_warning;
00063
00064 flag context_set;
00065 flag oov_frac_set;
00066 flag disc_range_set;
00067
00068
00069
00070 report_version(&argc,argv);
00071
00072 if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {
00073
00074
00075
00076 fprintf(stderr,"idngram2lm : Convert an idngram file to a language model file.\n");
00077 fprintf(stderr,"Usage : \n");
00078 fprintf(stderr,"idngram2lm -idngram .idngram\n");
00079 fprintf(stderr," -vocab .vocab\n");
00080 fprintf(stderr," -arpa .arpa | -binary .binlm\n");
00081 fprintf(stderr," [ -context .ccs ]\n");
00082 fprintf(stderr," [ -calc_mem | -buffer 100 | -spec_num y ... z ]\n");
00083 fprintf(stderr," [ -vocab_type 1 ]\n");
00084 fprintf(stderr," [ -oov_fraction 0.5 ]\n");
00085 fprintf(stderr," [ -two_byte_bo_weights \n [ -min_bo_weight nnnnn] [ -max_bo_weight nnnnn] [ -out_of_range_bo_weights] ]\n");
00086 fprintf(stderr," [ -four_byte_counts ]\n");
00087 fprintf(stderr," [ -linear | -absolute | -good_turing | -witten_bell ]\n");
00088 fprintf(stderr," [ -disc_ranges 1 7 7 ]\n");
00089 fprintf(stderr," [ -cutoffs 0 ... 0 ]\n");
00090 fprintf(stderr," [ -min_unicount 0 ]\n");
00091 fprintf(stderr," [ -zeroton_fraction ]\n");
00092 fprintf(stderr," [ -ascii_input | -bin_input ]\n");
00093 fprintf(stderr," [ -n 3 ] \n");
00094 fprintf(stderr," [ -verbosity %d ]\n",DEFAULT_VERBOSITY);
00095 exit(1);
00096 }
00097
00098 verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);
00099 ng.n = pc_intarg(&argc, argv,"-n",DEFAULT_N);
00100
00101 if (ng.n<1) {
00102 quit(-1,"Error: Value of n must be positive.\n");
00103 }
00104
00105 ng.cutoffs = (cutoff_t *) pc_shortarrayarg(&argc, argv, "-cutoffs",ng.n-1,ng.n-1);
00106
00107 if (ng.cutoffs == NULL) {
00108 ng.cutoffs = (cutoff_t *) rr_calloc(ng.n-1,sizeof(cutoff_t));
00109 }
00110
00111 for (i=0;i<=ng.n-3;i++) {
00112 if (ng.cutoffs[i+1] < ng.cutoffs[i]) {
00113 quit(-1,"Error - cutoffs for (n+1)-gram must be greater than or equal to those for \nn-gram. You have %d-gram cutoff = %d > %d-gram cutoff = %d.\n",i+2,ng.cutoffs[i],i+3,ng.cutoffs[i+1]);
00114 }
00115 }
00116
00117 mem_alloc_method = 0;
00118
00119 if (pc_flagarg(&argc, argv,"-calc_mem")) {
00120 mem_alloc_method = TWO_PASSES;
00121 }
00122
00123 buffer_size = pc_intarg(&argc, argv,"-buffer",-1);
00124
00125 if (buffer_size != -1) {
00126 if (mem_alloc_method != 0) {
00127 quit(-1,"Assigned two contradictory methods of memory allocation.\n Use one of -calc_mem, -buffer, or -spec_num.\n");
00128 }
00129 mem_alloc_method = BUFFER;
00130 }
00131
00132 ng.table_sizes = pc_intarrayarg(&argc, argv, "-spec_num",ng.n-1,ng.n);
00133
00134 if (ng.table_sizes != NULL) {
00135
00136 if (mem_alloc_method != 0) {
00137 quit(-1,"Assigned two contradictory methods of memory allocation.\n Use one of -calc_mem, -guess, or -spec_num.\n");
00138 }
00139 mem_alloc_method = SPECIFIED;
00140 for (i=ng.n-1;i>=1;i--) {
00141 ng.table_sizes[i] = ng.table_sizes[i-1];
00142 }
00143 }
00144
00145 if (mem_alloc_method == 0) {
00146 mem_alloc_method = BUFFER;
00147 buffer_size = STD_MEM;
00148 }
00149
00150 ng.min_unicount = pc_intarg(&argc, argv, "-min_unicount",0);
00151
00152 ng.id_gram_filename = salloc(pc_stringarg(&argc, argv,"-idngram",""));
00153
00154 if (!strcmp(ng.id_gram_filename,"")) {
00155 quit(-1,"Error: id ngram file not specified. Use the -idngram flag.\n");
00156 }
00157
00158 if (!strcmp(ng.id_gram_filename,"-") && mem_alloc_method == TWO_PASSES) {
00159 quit(-1,"Error: If idngram is read from stdin, then cannot use -calc_mem option.\n");
00160 }
00161
00162 is_ascii = 0;
00163
00164 ascii_flag = pc_flagarg(&argc,argv,"-ascii_input");
00165 bin_flag = pc_flagarg(&argc,argv,"-bin_input");
00166
00167 if (ascii_flag ||
00168 !strcmp(&ng.id_gram_filename[strlen(ng.id_gram_filename)-6],".ascii")) {
00169 is_ascii = 1;
00170 }
00171 else {
00172 }
00173
00174 if (ascii_flag) {
00175
00176 if (bin_flag) {
00177 quit(-1,"Error : Specify only one of -bin_input and -ascii_input\n");
00178 }
00179
00180 if (!strcmp(&ng.id_gram_filename[strlen(ng.id_gram_filename)-4],".bin")) {
00181 quit(-1,"Error : -ascii_input flag specified, but input file has .bin extention.\n");
00182 }
00183
00184 }
00185
00186 if (bin_flag &&
00187 !strcmp(&ng.id_gram_filename[strlen(ng.id_gram_filename)-6],".ascii") ) {
00188 quit(-1,"Error : -bin_input flag specified, but input file has .ascii extention.\n");
00189 }
00190
00191 ng.arpa_filename = salloc(pc_stringarg(&argc, argv,"-arpa",""));
00192 ng.bin_filename = salloc(pc_stringarg(&argc, argv,"-binary",""));
00193
00194 ng.write_arpa = strcmp("",ng.arpa_filename);
00195 ng.write_bin = strcmp("",ng.bin_filename);
00196
00197 if (!(ng.write_arpa || ng.write_bin)) {
00198 quit(-1,"Error : must specify either an arpa, or a binary output file.\n");
00199 }
00200
00201 ng.count_table_size = DEFAULT_COUNT_TABLE_SIZE;
00202
00203 ng.vocab_filename = salloc(pc_stringarg(&argc,argv,"-vocab",""));
00204
00205
00206
00207 if (!strcmp("",ng.vocab_filename)) {
00208 quit(-1,"Error : vocabulary file not specified. Use the -vocab option.\n");
00209 }
00210
00211
00212 ng.context_cues_filename = salloc(pc_stringarg(&argc,argv,"-context",""));
00213
00214 context_set = strcmp("", ng.context_cues_filename);
00215
00216 ng.vocab_type = pc_intarg(&argc,argv,"-vocab_type",1);
00217
00218 ng.oov_fraction = pc_doublearg(&argc, argv,"-oov_fraction",-1.0);
00219
00220 if (ng.oov_fraction == -1.0) {
00221 oov_frac_set = 0;
00222 ng.oov_fraction=DEFAULT_OOV_FRACTION;
00223 }
00224 else {
00225 oov_frac_set = 1;
00226 if (ng.vocab_type != 2) {
00227 pc_message(verbosity,1,"Warning : OOV fraction specified, but will not be used, since vocab type is not 2.\n");
00228 }
00229 }
00230
00231 if (ng.vocab_type == 0) {
00232 ng.first_id = 1;
00233 }
00234 else {
00235 ng.first_id = 0;
00236 }
00237
00238
00239
00240
00241
00242 ng.min_alpha = pc_doublearg(&argc,argv,"-min_alpha",DEFAULT_MIN_ALPHA);
00243 ng.max_alpha = pc_doublearg(&argc,argv,"-max_alpha",DEFAULT_MAX_ALPHA);
00244 ng.out_of_range_alphas = pc_intarg(&argc,argv,"-out_of_range_alphas",
00245 DEFAULT_OUT_OF_RANGE_ALPHAS);
00246
00247 ng.min_alpha = pc_doublearg(&argc,argv,"-min_bo_weight",ng.min_alpha);
00248 ng.max_alpha = pc_doublearg(&argc,argv,"-max_bo_weight",ng.max_alpha);
00249 ng.out_of_range_alphas = pc_intarg(&argc,argv,"-out_of_range_bo_weights",
00250 ng.out_of_range_alphas);
00251
00252
00253
00254 if (ng.min_alpha >= ng.max_alpha) {
00255 quit(-1,"Error : Minimum of alpha range must be less than the maximum.\n");
00256 }
00257
00258 ng.discounting_method = 0;
00259
00260 if (pc_flagarg(&argc, argv,"-linear")) {
00261 ng.discounting_method = LINEAR;
00262 }
00263
00264 if (pc_flagarg(&argc,argv,"-absolute")) {
00265 if (ng.discounting_method != 0) {
00266 quit(-1,"Error : Assigned two contradictory discounting methods.\nSpecify one of -linear, -absolute, -good_turing or -witten_bell.\n");
00267 }
00268 ng.discounting_method = ABSOLUTE;
00269 }
00270
00271 if (pc_flagarg(&argc,argv,"-witten_bell")) {
00272 if (ng.discounting_method != 0) {
00273 quit(-1,"Error : Assigned two contradictory discounting methods.\nSpecify one of -linear, -absolute, -good_turing or -witten_bell.\n");
00274 }
00275 ng.discounting_method = WITTEN_BELL;
00276 }
00277
00278 if (pc_flagarg(&argc,argv,"-good_turing")) {
00279 if (ng.discounting_method != 0) {
00280 quit(-1,"Error : Assigned two contradictory discounting methods.\nSpecify one of -linear, -absolute, -good_turing or -witten_bell.\n");
00281 }
00282 ng.discounting_method = GOOD_TURING;
00283 }
00284
00285 if (ng.discounting_method == 0) {
00286 ng.discounting_method = GOOD_TURING;
00287 }
00288
00289 ng.disc_range = (unsigned short *) pc_shortarrayarg(&argc, argv, "-disc_ranges",ng.n,ng.n);
00290
00291 disc_range_set = (ng.disc_range != NULL);
00292
00293
00294 if (ng.discounting_method == GOOD_TURING) {
00295 if (!disc_range_set) {
00296 ng.disc_range = (unsigned short *) rr_malloc(sizeof(unsigned short) * ng.n);
00297 ng.disc_range[0] = DEFAULT_DISC_RANGE_1;
00298 for (i=1;i<=ng.n-1;i++) {
00299 ng.disc_range[i] = DEFAULT_DISC_RANGE_REST;
00300 }
00301 }
00302 ng.fof_size = (unsigned short *) rr_malloc(sizeof(unsigned short) * ng.n);
00303 for (i=0;i<=ng.n-1;i++) {
00304 ng.fof_size[i] = ng.disc_range[i]+1;
00305 }
00306 }
00307 else {
00308 if (disc_range_set) {
00309 pc_message(verbosity,2,"Warning : discount ranges specified will be ignored, since they only apply\nto Good Turing discounting.\n");
00310 }
00311 }
00312
00313 ng.four_byte_alphas = !(pc_flagarg(&argc, argv, "-two_byte_alphas") ||
00314 pc_flagarg(&argc, argv, "-two_byte_bo_weights"));
00315
00316 ng.four_byte_counts = pc_flagarg(&argc, argv, "-four_byte_counts");
00317
00318 ng.zeroton_fraction = pc_doublearg(&argc,argv,"-zeroton_fraction",1.0);
00319
00320
00321
00322 pc_message(verbosity,2," n : %d\n",ng.n);
00323 pc_message(verbosity,2," Input file : %s",ng.id_gram_filename);
00324 if (is_ascii) {
00325 pc_message(verbosity,2," (ascii format)\n");
00326 }
00327 else {
00328 pc_message(verbosity,2," (binary format)\n");
00329 }
00330 pc_message(verbosity,2," Output files :\n");
00331 if (ng.write_arpa) {
00332 pc_message(verbosity,2," ARPA format : %s\n",ng.arpa_filename);
00333 }
00334 if (ng.write_bin) {
00335 pc_message(verbosity,2," Binary format : %s\n",ng.bin_filename);
00336 }
00337
00338 pc_message(verbosity,2," Vocabulary file : %s\n",ng.vocab_filename);
00339 if (context_set) {
00340 pc_message(verbosity,2," Context cues file : %s\n",ng.context_cues_filename);
00341 }
00342 pc_message(verbosity,2," Cutoffs :\n ");
00343 for (i=0;i<=ng.n-2;i++) {
00344 pc_message(verbosity,2,"%d-gram : %d ",i+2,ng.cutoffs[i]);
00345 }
00346 pc_message(verbosity,2,"\n");
00347
00348 switch (ng.vocab_type) {
00349 case CLOSED_VOCAB:
00350 pc_message(verbosity,2," Vocabulary type : Closed\n");
00351 break;
00352 case OPEN_VOCAB_1:
00353 pc_message(verbosity,2," Vocabulary type : Open - type 1\n");
00354 break;
00355 case OPEN_VOCAB_2:
00356 pc_message(verbosity,2," Vocabulary type : Open - type 2\n");
00357 pc_message(verbosity,2," OOV fraction = %g\n",ng.oov_fraction);
00358 break;
00359 }
00360 pc_message(verbosity,2," Minimum unigram count : %d\n",ng.min_unicount);
00361 pc_message(verbosity,2," Zeroton fraction : %g\n",ng.zeroton_fraction);
00362 if (ng.four_byte_counts) {
00363 pc_message(verbosity,2," Counts will be stored in four bytes.\n");
00364 }
00365 else {
00366 pc_message(verbosity,2," Counts will be stored in two bytes.\n");
00367 pc_message(verbosity,2," Count table size : %d\n",ng.count_table_size);
00368 }
00369 pc_message(verbosity,2," Discounting method : ");
00370 switch (ng.discounting_method) {
00371 case GOOD_TURING:
00372 pc_message(verbosity,2,"Good-Turing\n");
00373 pc_message(verbosity,2," Discounting ranges :\n ");
00374 for (i=0;i<=ng.n-1;i++) {
00375 pc_message(verbosity,2,"%d-gram : %d ",i+1,ng.disc_range[i]);
00376 }
00377 pc_message(verbosity,2,"\n");
00378 break;
00379 case ABSOLUTE:
00380 pc_message(verbosity,2,"Absolute\n");
00381 break;
00382 case LINEAR:
00383 pc_message(verbosity,2,"Linear\n");
00384 break;
00385 case WITTEN_BELL:
00386 pc_message(verbosity,2,"Witten-Bell\n");
00387 break;
00388 }
00389 pc_message(verbosity,2," Memory allocation for tree structure : \n");
00390 switch(mem_alloc_method) {
00391 case TWO_PASSES:
00392 pc_message(verbosity,2," Perform a preliminary pass over the id n-gram file to determine \n the amount of memory to allocate\n");
00393 break;
00394 case BUFFER:
00395 pc_message(verbosity,2," Allocate %d MB of memory, shared equally between all n-gram tables.\n",buffer_size);
00396 break;
00397 case SPECIFIED:
00398 pc_message(verbosity,2," Memory requirement specified.\n ");
00399 for (i=0;i<=ng.n-2;i++) {
00400 pc_message(verbosity,2,"%d-gram : %d ",i+2,ng.table_sizes[i+1]);
00401 }
00402 pc_message(verbosity,2,"\n");
00403 break;
00404 }
00405 pc_message(verbosity,2," Back-off weight storage : \n");
00406 if (ng.four_byte_alphas) {
00407 pc_message(verbosity,2," Back-off weights will be stored in four bytes.\n");
00408 }
00409 else {
00410 pc_message(verbosity,2," Back-off weights will be stored in two bytes.\n");
00411 pc_message(verbosity,2," Minimum back-off weight : %g\n",ng.min_alpha);
00412 pc_message(verbosity,2," Maximum back-off weight : %g\n",ng.max_alpha);
00413 pc_message(verbosity,2," Maximum number of out of range back-off weights : %d\n",ng.out_of_range_alphas);
00414 }
00415
00416 pc_report_unk_args(&argc,argv,verbosity);
00417
00418
00419
00420
00421
00422
00423
00424 ng.id_gram_fp = rr_iopen(ng.id_gram_filename);
00425
00426
00427
00428
00429
00430
00431 if (context_set) {
00432 ng.context_cues_fp = rr_iopen(ng.context_cues_filename);
00433 }
00434
00435 if (ng.write_arpa) {
00436 ng.arpa_fp = rr_oopen(ng.arpa_filename);
00437 }
00438
00439 if (ng.write_bin) {
00440 ng.bin_fp = rr_oopen(ng.bin_filename);
00441 }
00442
00443
00444
00445
00446 pc_message(verbosity,2,"Reading vocabulary.\n");
00447
00448 ng.vocab_ht = sih_create(1000,0.5,2.0,1);
00449
00450 read_voc(ng.vocab_filename,verbosity,ng.vocab_ht,&ng.vocab,&(ng.vocab_size));
00451
00452
00453
00454 ng.no_of_ccs = 0;
00455 ng.context_cue = (flag *) rr_calloc(ng.vocab_size+1,sizeof(flag));
00456
00457 if (context_set) {
00458
00459 while (fgets (wlist_entry, sizeof (wlist_entry),ng.context_cues_fp)) {
00460 if (strncmp(wlist_entry,"##",2)==0) continue;
00461 sscanf (wlist_entry, "%s ",current_cc);
00462 if (strncmp(wlist_entry,"#",1)==0) {
00463 fprintf(stderr,"\n\n===========================================================\n");
00464 fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
00465 fprintf(stderr, ">>> %s <<<\n",wlist_entry);
00466 fprintf(stderr, " '%s' will be included in the context cues list\n",current_cc);
00467 fprintf(stderr, " (comments must start with '##')\n");
00468 fprintf(stderr,"===========================================================\n\n");
00469 }
00470
00471
00472 if (sih_lookup(ng.vocab_ht,current_cc,¤t_cc_id) == 0) {
00473 pc_message(verbosity,1,"Warning : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
00474 }
00475 else {
00476 ng.context_cue[(unsigned short) current_cc_id] = 1;
00477 pc_message(verbosity,2,"Context cue word : %s id = %d\n",current_cc,current_cc_id);
00478 ng.no_of_ccs++;
00479 }
00480 }
00481 rr_iclose(ng.context_cues_fp);
00482 }
00483
00484 if ((sih_lookup(ng.vocab_ht,"<s>",&test_cc_id) != 0)) {
00485 if (ng.context_cue[(unsigned short) test_cc_id] == 0) {
00486 fprintf(stderr,"WARNING: <s> appears as a vocabulary item, but is not labelled as a\ncontext cue.\n");
00487 }
00488 }
00489
00490 if ((sih_lookup(ng.vocab_ht,"<p>",&test_cc_id) != 0)) {
00491 if (ng.context_cue[(unsigned short) test_cc_id] == 0) {
00492 fprintf(stderr,"WARNING: <p> appears as a vocabulary item, but is not labelled as a\ncontext cue.\n");
00493 }
00494 }
00495
00496 if ((sih_lookup(ng.vocab_ht,"<art>",&test_cc_id) != 0)) {
00497 if (ng.context_cue[(unsigned short) test_cc_id] == 0) {
00498 fprintf(stderr,"WARNING: <art> appears as a vocabulary item, but is not labelled as a\ncontext cue.\n");
00499 }
00500 }
00501
00502
00503
00504
00505 if (ng.n>1) {
00506
00507 switch(mem_alloc_method) {
00508
00509 case TWO_PASSES:
00510 ng.table_sizes = (table_size_t *) rr_calloc(ng.n,sizeof(table_size_t));
00511 pc_message(verbosity,2,"Calculating memory requirement.\n");
00512 calc_mem_req(&ng,is_ascii);
00513 break;
00514 case BUFFER:
00515 ng.table_sizes = (table_size_t *) rr_malloc(ng.n*sizeof(table_size_t));
00516 middle_size = sizeof(count_ind_t) + sizeof(bo_weight_t) +
00517 sizeof(index__t) + sizeof(id__t);
00518 end_size = sizeof(count_ind_t) + sizeof(id__t);
00519 if (ng.four_byte_alphas) {
00520 middle_size += 2;
00521 }
00522 if (ng.four_byte_counts) {
00523 middle_size += 2;
00524 end_size += 2;
00525 }
00526
00527
00528 guess_mem(buffer_size,
00529 middle_size,
00530 end_size,
00531 ng.n,
00532 ng.table_sizes,
00533 verbosity);
00534 break;
00535 case SPECIFIED:
00536 break;
00537
00538 }
00539
00540 }
00541 else {
00542
00543 ng.table_sizes = (table_size_t *) rr_calloc(1,sizeof(table_size_t));
00544
00545 }
00546
00547
00548
00549 ng.table_sizes[0] = ng.vocab_size+1;
00550
00551
00552
00553 ng.count = (count_ind_t **) rr_malloc(sizeof(count_ind_t *)*ng.n);
00554 ng.count4 = (int **) rr_malloc(sizeof(int *)*ng.n);
00555
00556
00557 ng.count_table = (count_t **) rr_malloc(sizeof(count_t *)*ng.n);
00558
00559 if (!ng.four_byte_counts) {
00560 for (i=0;i<=ng.n-1;i++) {
00561 ng.count_table[i] = (count_t *) rr_calloc(ng.count_table_size,
00562 sizeof(count_t));
00563 }
00564 ng.marg_counts = (count_ind_t *) rr_malloc(sizeof(count_ind_t)*
00565 ng.table_sizes[0]);
00566 }
00567 else {
00568 ng.marg_counts4 = (int *) rr_malloc(sizeof(int)*ng.table_sizes[0]);
00569 }
00570
00571 ng.word_id = (id__t **) rr_malloc(sizeof(id__t *)*ng.n);
00572 if (ng.four_byte_alphas) {
00573 ng.bo_weight4 = (four_byte_t **) rr_malloc(sizeof(four_byte_t *)*ng.n);
00574 }
00575 else {
00576 ng.bo_weight = (bo_weight_t **) rr_malloc(sizeof(bo_weight_t *)*ng.n);
00577 }
00578
00579 ng.ind = (index__t **) rr_malloc(sizeof(index__t *)*ng.n);
00580
00581
00582
00583 if (ng.four_byte_counts) {
00584 ng.count4[0] = (int *) rr_calloc(ng.table_sizes[0],sizeof(int));
00585 }
00586 else {
00587 ng.count[0] = (count_ind_t *) rr_calloc(ng.table_sizes[0],
00588 sizeof(count_ind_t));
00589 }
00590 ng.uni_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)*
00591 ng.table_sizes[0]);
00592 ng.uni_log_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)*
00593 ng.table_sizes[0]);
00594 if (ng.four_byte_alphas) {
00595 ng.bo_weight4[0] = (four_byte_t *) rr_malloc(sizeof(four_byte_t)*
00596 ng.table_sizes[0]);
00597 }
00598 else {
00599 ng.bo_weight[0] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t)*
00600 ng.table_sizes[0]);
00601 }
00602
00603 if (ng.n >=2 ) {
00604 ng.ind[0] = (index__t *) rr_calloc(ng.table_sizes[0],sizeof(index__t));
00605 }
00606
00607 for (i=1;i<=ng.n-2;i++) {
00608
00609 ng.word_id[i] = (id__t *) rr_malloc(sizeof(id__t)*ng.table_sizes[i]);
00610 if (ng.four_byte_counts) {
00611 ng.count4[i] = (int *) rr_malloc(sizeof(int)*ng.table_sizes[i]);
00612 }
00613 else {
00614 ng.count[i] = (count_ind_t *)
00615 rr_malloc(sizeof(count_ind_t)*ng.table_sizes[i]);
00616 }
00617 if (ng.four_byte_alphas) {
00618 ng.bo_weight4[i] = (four_byte_t *)
00619 rr_malloc(sizeof(four_byte_t)*ng.table_sizes[i]);
00620 }
00621 else {
00622 ng.bo_weight[i] = (bo_weight_t *)
00623 rr_malloc(sizeof(bo_weight_t)*ng.table_sizes[i]);
00624 }
00625
00626 ng.ind[i] = (index__t *) rr_malloc(sizeof(index__t)*ng.table_sizes[i]);
00627
00628 mem_alloced = sizeof(count_ind_t) + sizeof(bo_weight_t) +
00629 sizeof(index__t) + sizeof(id__t);
00630
00631 if (ng.four_byte_alphas) {
00632 mem_alloced += 2;
00633 }
00634
00635 if (ng.four_byte_counts) {
00636 mem_alloced += 2;
00637 }
00638
00639 mem_alloced *= ng.table_sizes[i];
00640
00641 pc_message(verbosity,2,"Allocated %d bytes to table for %d-grams.\n",
00642 mem_alloced,i+1);
00643
00644 }
00645
00646 ng.word_id[ng.n-1] = (id__t *)
00647 rr_malloc(sizeof(id__t)*ng.table_sizes[ng.n-1]);
00648 if (ng.four_byte_counts) {
00649 ng.count4[ng.n-1] = (int *) rr_malloc(sizeof(int)*ng.table_sizes[ng.n-1]);
00650 pc_message(verbosity,2,"Allocated %d bytes to table for %d-grams.\n",
00651 (sizeof(int) +
00652 sizeof(id__t))*ng.table_sizes[ng.n-1],ng.n);
00653
00654 }
00655 else {
00656 ng.count[ng.n-1] = (count_ind_t *)
00657 rr_malloc(sizeof(count_ind_t)*ng.table_sizes[ng.n-1]);
00658 pc_message(verbosity,2,"Allocated %d bytes to table for %d-grams.\n",
00659 (sizeof(count_ind_t) +
00660 sizeof(id__t))*ng.table_sizes[ng.n-1],ng.n);
00661
00662 }
00663
00664
00665
00666
00667
00668
00669 ng.ptr_table = (int **) rr_malloc(sizeof(int *)*ng.n);
00670 ng.ptr_table_size = (unsigned short *)
00671 rr_calloc(ng.n,sizeof(unsigned short));
00672 for (i=0;i<=ng.n-1;i++) {
00673 ng.ptr_table[i] = (int *) rr_calloc(65535,sizeof(int));
00674 }
00675
00676
00677
00678 ng.alpha_array = (double *) rr_malloc(sizeof(double)*ng.out_of_range_alphas);
00679 ng.size_of_alpha_array = 0;
00680
00681
00682
00683 ng.freq_of_freq = (int **) rr_malloc(sizeof(int *)*ng.n);
00684
00685 switch(ng.discounting_method) {
00686 case LINEAR:
00687 for (i=0;i<=ng.n-1;i++) {
00688 ng.freq_of_freq[i] = (int *) rr_calloc(2,sizeof(int));
00689 }
00690 break;
00691 case GOOD_TURING:
00692 for (i=0;i<=ng.n-1;i++) {
00693 ng.freq_of_freq[i] = (int *) rr_calloc(ng.fof_size[i]+1,sizeof(int));
00694 }
00695 break;
00696 case ABSOLUTE:
00697 for (i=0;i<=ng.n-1;i++) {
00698 ng.freq_of_freq[i] = (int *) rr_calloc(3,sizeof(int));
00699 }
00700 ng.abs_disc_const = (double *) rr_malloc(sizeof(double)*ng.n);
00701
00702 break;
00703 case WITTEN_BELL:
00704 ng.freq_of_freq[0] = (int *) rr_calloc(1,sizeof(int));
00705 break;
00706
00707 }
00708
00709
00710
00711
00712
00713 pc_message(verbosity,2,"Processing id n-gram file.\n");
00714 pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");
00715
00716
00717
00718 current_ngram.id_array = (id__t *) rr_calloc(ng.n,sizeof(id__t));
00719 previous_ngram.id_array = (id__t *) rr_calloc(ng.n,sizeof(id__t));
00720 current_ngram.n = ng.n;
00721 previous_ngram.n = ng.n;
00722
00723 ng.num_kgrams = (int *) rr_calloc(ng.n,sizeof(int));
00724 ng_count = (count_t *) rr_calloc(ng.n,sizeof(count_t));
00725 nlines = 1;
00726 ng.n_unigrams = 0;
00727
00728
00729
00730 get_ngram(ng.id_gram_fp,¤t_ngram,is_ascii);
00731
00732 contains_unks = 0;
00733 for (i=0;i<=ng.n-1;i++) {
00734 if (current_ngram.id_array[i] == 0) {
00735 contains_unks = 1;
00736 }
00737 }
00738
00739 while (ng.vocab_type == CLOSED_VOCAB && contains_unks){
00740
00741 get_ngram(ng.id_gram_fp,¤t_ngram,is_ascii);
00742 contains_unks = 0;
00743 for (i=0;i<=ng.n-1;i++) {
00744 if (current_ngram.id_array[i] == 0) {
00745 contains_unks = 1;
00746 }
00747 }
00748 }
00749
00750 for (i=0;i<=ng.n-2;i++) {
00751 ng.ind[i][0] = new_index(0,ng.ptr_table[i],&(ng.ptr_table_size[i]),0);
00752 ng.word_id[i+1][0] = current_ngram.id_array[i+1];
00753 ng.num_kgrams[i+1]++;
00754 ng_count[i] = current_ngram.count;
00755 }
00756 ng_count[0] = current_ngram.count;
00757
00758 if (ng.discounting_method == GOOD_TURING &&
00759 current_ngram.count <= ng.fof_size[ng.n-1]) {
00760
00761 if (current_ngram.count <= 0) {
00762 quit(-1,"Error in idngram stream. This is most likely to be caused by trying to read\na gzipped file as if it were uncompressed. Ensure that all gzipped files have\na .gz extension. Other causes might be confusion over whether the file is in\nascii or binary format.\n");
00763 }
00764
00765 ng.freq_of_freq[ng.n-1][current_ngram.count]++;
00766 }
00767
00768 if (ng.discounting_method == LINEAR && current_ngram.count == 1) {
00769 ng.freq_of_freq[ng.n-1][1]++;
00770 }
00771
00772 if (ng.discounting_method == ABSOLUTE && current_ngram.count <= 2) {
00773
00774 if (current_ngram.count <= 0) {
00775 quit(-1,"Error in idngram stream. This is most likely to be caused by trying to read\na gzipped file as if it were uncompressed. Ensure that all gzipped files have\na .gz extension. Other causes might be confusion over whether the file is in\nascii or binary format.\n");
00776 }
00777
00778 ng.freq_of_freq[ng.n-1][current_ngram.count]++;
00779 }
00780
00781 store_count(ng.four_byte_counts,
00782 ng.count_table[ng.n-1],
00783 ng.count_table_size,
00784 ng.count[ng.n-1],
00785 ng.count4[ng.n-1],
00786 0,
00787 current_ngram.count);
00788
00789 if (current_ngram.count <= ng.cutoffs[ng.n-2]) {
00790 ng.num_kgrams[ng.n-1]--;
00791 }
00792 prev_id1 = current_ngram.id_array[0];
00793
00794 displayed_oov_warning = 0;
00795
00796 for (i=0;i<=ng.n-1;i++) {
00797 previous_ngram.id_array[i] = current_ngram.id_array[i];
00798 }
00799 previous_ngram.count = current_ngram.count;
00800
00801
00802 while (!rr_feof(ng.id_gram_fp)) {
00803
00804
00805 if (get_ngram(ng.id_gram_fp,¤t_ngram,is_ascii)) {
00806
00807 if (ng.vocab_type == CLOSED_VOCAB) {
00808 contains_unks = 0;
00809 for (i=0;i<=ng.n-1;i++) {
00810 if (current_ngram.id_array[i] == 0) {
00811 contains_unks = 1;
00812 }
00813 }
00814 }
00815
00816 if (!contains_unks || ng.vocab_type != CLOSED_VOCAB) {
00817
00818
00819
00820
00821
00822
00823
00824
00825 pos_of_novelty = ng.n;
00826
00827 for (i=0;i<=ng.n-1;i++) {
00828 if (current_ngram.id_array[i] > previous_ngram.id_array[i]) {
00829 pos_of_novelty = i;
00830 i=ng.n;
00831 }
00832 else {
00833 if (current_ngram.id_array[i] < previous_ngram.id_array[i]) {
00834 if (nlines < 5) {
00835 quit(-1,"Error : n-gram ordering problem - could be due to using wrong file format.\nCheck whether id n-gram file is in ascii or binary format.\n");
00836 }
00837 else {
00838 quit(-1,"Error : n-grams are not correctly ordered. Error occurred at ngram %d.\n",nlines);
00839 }
00840 }
00841 }
00842 }
00843
00844 if (pos_of_novelty == ng.n) {
00845 if (nlines > 3) {
00846 quit(-1,"Error - same n-gram appears twice in idngram stream.\n");
00847 }
00848 else {
00849 quit(-1,"Error in the idngram stream. It appears that the same n-gram occurs twice\n in the stream. Check that text2idngram exited successfully, and the \nformat (binary/ascii) of the idngram file.\n");
00850 }
00851 }
00852
00853 nlines++;
00854
00855 if (nlines % 20000 == 0) {
00856 if (nlines % 1000000 == 0) {
00857 pc_message(verbosity,2,".\n");
00858 }
00859 else {
00860 pc_message(verbosity,2,".");
00861 }
00862 }
00863
00864
00865
00866
00867
00868
00869 if (ng.n > 1) {
00870
00871 store_count(ng.four_byte_counts,
00872 ng.count_table[ng.n-1],
00873 ng.count_table_size,
00874 ng.count[ng.n-1],
00875 ng.count4[ng.n-1],
00876 ng.num_kgrams[ng.n-1],
00877 current_ngram.count);
00878
00879
00880 if (ng.discounting_method == GOOD_TURING &&
00881 current_ngram.count <= ng.fof_size[ng.n-1]) {
00882
00883 if (current_ngram.count <= 0) {
00884 quit(-1,"Error in idngram stream. This is most likely to be caused by trying to read\na gzipped file as if it were uncompressed. Ensure that all gzipped files have\na .gz extension. Other causes might be confusion over whether the file is in\nascii or binary format.\n");
00885 }
00886
00887 ng.freq_of_freq[ng.n-1][current_ngram.count]++;
00888 }
00889
00890 if (ng.discounting_method == LINEAR && current_ngram.count == 1) {
00891 ng.freq_of_freq[ng.n-1][1]++;
00892 }
00893
00894 if (ng.discounting_method == ABSOLUTE && current_ngram.count <= 2) {
00895
00896 if (current_ngram.count <= 0) {
00897 quit(-1,"Error in idngram stream. This is most likely to be caused by trying to read\na gzipped file as if it were uncompressed. Ensure that all gzipped files have\na .gz extension. Other causes might be confusion over whether the file is in\nascii or binary format.\n");
00898 }
00899
00900 ng.freq_of_freq[ng.n-1][current_ngram.count]++;
00901 }
00902
00903 ng.word_id[ng.n-1][ng.num_kgrams[ng.n-1]] =
00904 current_ngram.id_array[ng.n-1];
00905
00906 ng.num_kgrams[ng.n-1]++;
00907
00908
00909 if (ng.num_kgrams[ng.n-1] >= ng.table_sizes[ng.n-1]) {
00910 quit(-1,"\nMore than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng.table_sizes[ng.n-1],ng.n);
00911 }
00912
00913 }
00914
00915
00916 for (i=ng.n-2;i>=MAX(1,pos_of_novelty);i--) {
00917
00918 if (ng.discounting_method == GOOD_TURING &&
00919 ng_count[i] <= ng.fof_size[i]) {
00920 ng.freq_of_freq[i][ng_count[i]]++;
00921 }
00922
00923 if (ng.discounting_method == LINEAR && ng_count[i] == 1) {
00924 ng.freq_of_freq[i][1]++;
00925 }
00926
00927 if (ng.discounting_method == ABSOLUTE && ng_count[i] <= 2) {
00928 ng.freq_of_freq[i][ng_count[i]]++;
00929 }
00930
00931 if (ng_count[i] <= ng.cutoffs[i-1]) {
00932 ng.num_kgrams[i]--;
00933 }
00934 else {
00935 store_count(ng.four_byte_counts,
00936 ng.count_table[i],
00937 ng.count_table_size,
00938 ng.count[i],
00939 ng.count4[i],
00940 ng.num_kgrams[i]-1,
00941 ng_count[i]);
00942 }
00943 ng_count[i] = current_ngram.count;
00944 ng.word_id[i][ng.num_kgrams[i]] = current_ngram.id_array[i];
00945 ng.ind[i][ng.num_kgrams[i]] = new_index(ng.num_kgrams[i+1]-1,
00946 ng.ptr_table[i],
00947 &(ng.ptr_table_size[i]),
00948 ng.num_kgrams[i]);
00949
00950 ng.num_kgrams[i]++;
00951
00952 if (ng.num_kgrams[i] >= ng.table_sizes[i]) {
00953 quit(-1,"More than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng.table_sizes[i],i+1);
00954 }
00955
00956 }
00957
00958
00959
00960
00961
00962
00963
00964 for (i=0;i<=pos_of_novelty-1;i++) {
00965 ng_count[i] += current_ngram.count;
00966 }
00967
00968
00969
00970 if (pos_of_novelty == 0) {
00971 if (ng.n>1) {
00972
00973 for (i = prev_id1 + 1; i <= current_ngram.id_array[0]; i++) {
00974 ng.ind[0][i] = new_index(ng.num_kgrams[1]-1,
00975 ng.ptr_table[0],
00976 &(ng.ptr_table_size[0]),
00977 i);
00978 }
00979 prev_id1 = current_ngram.id_array[0];
00980
00981 }
00982
00983 if (ng.discounting_method == GOOD_TURING &&
00984 ng_count[0] <= ng.fof_size[0]) {
00985 ng.freq_of_freq[0][ng_count[0]]++;
00986 }
00987
00988 if (ng.discounting_method == LINEAR && ng_count[0] == 1) {
00989 ng.freq_of_freq[0][1]++;
00990 }
00991
00992 if (ng.discounting_method == ABSOLUTE && ng_count[0] <= 2) {
00993 ng.freq_of_freq[0][ng_count[0]]++;
00994 }
00995
00996 if (!ng.context_cue[previous_ngram.id_array[0]]) {
00997 ng.n_unigrams += ng_count[0];
00998
00999 store_count(ng.four_byte_counts,
01000 ng.count_table[0],
01001 ng.count_table_size,
01002 ng.count[0],
01003 ng.count4[0],
01004 previous_ngram.id_array[0],
01005 ng_count[0]);
01006
01007 }
01008
01009 store_count(ng.four_byte_counts,
01010 ng.count_table[0],
01011 ng.count_table_size,
01012 ng.marg_counts,
01013 ng.marg_counts4,
01014 previous_ngram.id_array[0],
01015 ng_count[0]);
01016
01017 ng_count[0] = current_ngram.count;
01018 }
01019
01020 if (current_ngram.count <= ng.cutoffs[ng.n-2]) {
01021 ng.num_kgrams[ng.n-1]--;
01022 }
01023
01024 for (i=0;i<=ng.n-1;i++) {
01025 previous_ngram.id_array[i] = current_ngram.id_array[i];
01026 }
01027 previous_ngram.count = current_ngram.count;
01028
01029
01030 }
01031 else {
01032 if (!displayed_oov_warning){
01033 pc_message(verbosity,2,"Warning : id n-gram stream contains OOV's (n-grams will be ignored).\n");
01034 displayed_oov_warning = 1;
01035 }
01036 }
01037 }
01038 }
01039
01040 rr_iclose(ng.id_gram_fp);
01041
01042 for (i=ng.n-2;i>=1;i--) {
01043 if (ng.discounting_method == GOOD_TURING &&
01044 ng_count[i] <= ng.fof_size[i]) {
01045 ng.freq_of_freq[i][ng_count[i]]++;
01046 }
01047
01048 if (ng.discounting_method == LINEAR && ng_count[i] == 1) {
01049 ng.freq_of_freq[i][1]++;
01050 }
01051
01052 if (ng.discounting_method == ABSOLUTE && ng_count[i] <= 2) {
01053 ng.freq_of_freq[i][ng_count[i]]++;
01054 }
01055
01056 if (ng_count[i] <= ng.cutoffs[i-1]) {
01057 ng.num_kgrams[i]--;
01058 }
01059 else {
01060
01061 store_count(ng.four_byte_counts,
01062 ng.count_table[i],
01063 ng.count_table_size,
01064 ng.count[i],
01065 ng.count4[i],
01066 ng.num_kgrams[i]-1,
01067 ng_count[i]);
01068
01069 }
01070 }
01071
01072 if (ng.discounting_method == GOOD_TURING && ng_count[0] <= ng.fof_size[0]) {
01073 ng.freq_of_freq[0][ng_count[0]]++;
01074 }
01075
01076
01077 if (ng.discounting_method == LINEAR && ng_count[0] == 1) {
01078 ng.freq_of_freq[0][1]++;
01079 }
01080
01081 if (ng.discounting_method == ABSOLUTE && ng_count[0] <= 2) {
01082 ng.freq_of_freq[0][ng_count[0]]++;
01083 }
01084
01085 if (!ng.context_cue[current_ngram.id_array[0]]) {
01086 ng.n_unigrams += ng_count[0];
01087
01088 store_count(ng.four_byte_counts,
01089 ng.count_table[0],
01090 ng.count_table_size,
01091 ng.count[0],
01092 ng.count4[0],
01093 current_ngram.id_array[0],
01094 ng_count[0]);
01095
01096 }
01097
01098 store_count(ng.four_byte_counts,
01099 ng.count_table[0],
01100 ng.count_table_size,
01101 ng.marg_counts,
01102 ng.marg_counts4,
01103 current_ngram.id_array[0],
01104 ng_count[0]);
01105
01106 if (ng.n>1) {
01107
01108 for (i=current_ngram.id_array[0]+1;i<=ng.vocab_size;i++) {
01109 ng.ind[0][i] = new_index(ng.num_kgrams[1],
01110 ng.ptr_table[0],
01111 &(ng.ptr_table_size[0]),
01112 current_ngram.id_array[0]);
01113 }
01114 }
01115
01116 pc_message(verbosity,2,"\n");
01117
01118
01119
01120 if (ng.min_unicount > 0) {
01121
01122 int nchanged;
01123
01124 nchanged = 0;
01125
01126 for (i=ng.first_id;i<=ng.vocab_size;i++) {
01127 if ((return_count(ng.four_byte_counts,
01128 ng.count_table[0],
01129 ng.count[0],
01130 ng.count4[0],
01131 i) < ng.min_unicount) && !ng.context_cue[i]) {
01132
01133 switch(ng.discounting_method) {
01134 case LINEAR:
01135 if (ng.count[0][i] <= 1) {
01136 ng.freq_of_freq[0][ng.count[0][i]]--;
01137 }
01138 break;
01139 case ABSOLUTE:
01140 if (ng.count[0][i] <= 2) {
01141 ng.freq_of_freq[0][ng.count[0][i]]--;
01142 }
01143 case GOOD_TURING:
01144 if (ng.count[0][i] <= ng.fof_size[0]) {
01145 ng.freq_of_freq[0][ng.count[0][i]]--;
01146 }
01147 break;
01148 case WITTEN_BELL:
01149 if (ng.count[0][i] == 0) {
01150 ng.freq_of_freq[0][ng.count[0][i]]--;
01151 }
01152 break;
01153 }
01154 ng.n_unigrams += (ng.min_unicount - ng.count[0][i]);
01155
01156 store_count(ng.four_byte_counts,
01157 ng.count_table[0],
01158 ng.count_table_size,
01159 ng.count[0],
01160 ng.count4[0],
01161 i,
01162 ng.min_unicount);
01163
01164 nchanged++;
01165 }
01166 }
01167
01168 if (nchanged > 0) {
01169 pc_message(verbosity,2,
01170 "Unigram counts of %d words were bumped up to %d.\n",
01171 nchanged,ng.min_unicount);
01172 }
01173
01174 }
01175
01176
01177
01178 ng.freq_of_freq[0][0] = 0;
01179
01180 for (i=ng.first_id;i<=ng.vocab_size;i++) {
01181 if (return_count(ng.four_byte_counts,
01182 ng.count_table[0],
01183 ng.count[0],
01184 ng.count4[0],
01185 i) == 0) {
01186 ng.freq_of_freq[0][0]++;
01187 }
01188 }
01189
01190
01191 if (ng.discounting_method == GOOD_TURING) {
01192 for (i=0;i<=ng.n-1;i++) {
01193 for (j=1;j<=ng.fof_size[i];j++) {
01194 pc_message(verbosity,3,"fof[%d][%d] = %d\n",i,j,ng.freq_of_freq[i][j]);
01195 }
01196 }
01197 }
01198
01199
01200
01201
01202
01203
01204 pc_message(verbosity,2,"Calculating discounted counts.\n");
01205
01206 switch(ng.discounting_method) {
01207 case GOOD_TURING:
01208
01209 ng.gt_disc_ratio = (disc_val_t **) rr_malloc(sizeof(disc_val_t *)*ng.n);
01210
01211 for (i=0;i<=ng.n-1;i++) {
01212 ng.gt_disc_ratio[i] = (disc_val_t *)
01213 rr_malloc(sizeof(disc_val_t)*ng.fof_size[i]);
01214 }
01215
01216 for (i=0;i<=ng.n-1;i++) {
01217 if (i==0) {
01218 compute_gt_discount(i+1,
01219 ng.freq_of_freq[0],
01220 ng.fof_size[0],
01221 &ng.disc_range[0],
01222 0,
01223 verbosity,
01224 &ng.gt_disc_ratio[0]);
01225 }
01226 else {
01227 compute_gt_discount(i+1,
01228 ng.freq_of_freq[i],
01229 ng.fof_size[i],
01230 &ng.disc_range[i],
01231 ng.cutoffs[i-1],
01232 verbosity,
01233 &ng.gt_disc_ratio[i]);
01234 }
01235 }
01236 break;
01237 case WITTEN_BELL:
01238 break;
01239 case LINEAR:
01240 ng.lin_disc_ratio = (disc_val_t *) rr_malloc(sizeof(disc_val_t)*ng.n);
01241 pc_message(verbosity,1,"Linear discounting ratios :\n");
01242 for (i=0;i<=ng.n-1;i++) {
01243 ng.lin_disc_ratio[i] = 1 - ( (float) ng.freq_of_freq[i][1]/
01244 (float) ng.n_unigrams);
01245 pc_message(verbosity,1,"%d-gram : %g\n",i+1,ng.lin_disc_ratio[i]);
01246 }
01247
01248 break;
01249 case ABSOLUTE:
01250 pc_message(verbosity,1,"Absolute discounting ratios :\n");
01251 for (i=0;i<=ng.n-1;i++) {
01252 ng.abs_disc_const[i] = ((float) ng.freq_of_freq[i][1] ) /
01253 ((float) ng.freq_of_freq[i][1] + (2*ng.freq_of_freq[i][2]) );
01254 pc_message(verbosity,1,"%d-gram : ",i+1);
01255 for (j=1;j<=5;j++) {
01256 pc_message(verbosity,1,"%g ",(j-ng.abs_disc_const[i])/j);
01257 }
01258 pc_message(verbosity,1," ... \n");
01259 }
01260 break;
01261 }
01262
01263
01264
01265
01266 compute_unigram(&ng,verbosity);
01267
01268
01269
01270
01271 if (ng.discounting_method == GOOD_TURING) {
01272 pc_message(verbosity,2,"Incrementing contexts...\n");
01273
01274 for (i=ng.n-1;i>=1;i--) {
01275
01276 increment_context(&ng,i,verbosity);
01277
01278 }
01279 }
01280
01281
01282
01283
01284 pc_message(verbosity,2,"Calculating back-off weights...\n");
01285
01286 for (i=1;i<=ng.n-1;i++) {
01287 compute_back_off(&ng,i,verbosity);
01288 }
01289
01290 if (!ng.four_byte_alphas) {
01291 pc_message(verbosity,3,"Number of out of range alphas = %d\n",
01292 ng.size_of_alpha_array);
01293 }
01294
01295
01296
01297 pc_message(verbosity,2,"Writing out language model...\n");
01298
01299 if (ng.write_arpa) {
01300
01301 write_arpa_lm(&ng,verbosity);
01302
01303 }
01304
01305 if (ng.write_bin) {
01306
01307 write_bin_lm(&ng,verbosity);
01308
01309 }
01310
01311 pc_message(verbosity,0,"idngram2lm : Done.\n");
01312
01313 exit(0);
01314
01315 }
01316
01317