Main Page   Compound List   File List   Compound Members   File Members  

idngram2lm.c

Go to the documentation of this file.
00001 
00002 /*=====================================================================
00003                 =======   COPYRIGHT NOTICE   =======
00004 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00005 Ronald Rosenfeld and Philip Clarkson.
00006 
00007 All rights reserved.
00008 
00009 This software is made available for research purposes only.  It may be
00010 redistributed freely for this purpose, in full or in part, provided
00011 that this entire copyright notice is included on any copies of this
00012 software and applications and derivations thereof.
00013 
00014 This software is provided on an "as is" basis, without warranty of any
00015 kind, either expressed or implied, as to any matter including, but not
00016 limited to warranty of fitness of purpose, or merchantability, or
00017 results obtained from use of this software.
00018 ======================================================================*/
00019 
00020 #include "toolkit.h"
00021 #include "ngram.h"
00022 #include "pc_libs/pc_general.h"
00023 #include "idngram2lm.h"
00024 #include "rr_libs/sih.h"
00025 #include "rr_libs/general.h"
00026 #include <stdio.h>
00027 #include <stdlib.h>
00028 #include <string.h>
00029 
00035 void main (int argc, char **argv) {
00036 
00037   int i,j;
00038   ng_t ng;
00039   int verbosity;
00040   int mem_alloc_method; /* Method used to decide how much memory to 
00041                            allocate for count tables */
00042   int buffer_size;
00043   flag is_ascii;
00044   flag ascii_flag;
00045   flag bin_flag;
00046   char current_cc[200];
00047   int current_cc_id;
00048   int test_cc_id;
00049   ngram current_ngram;
00050   ngram previous_ngram;
00051   int *ng_count; /* Array indicating the number of occurrences of 
00052                     the current 1-gram, 2-gram, ... ,n-gram */  
00053   int nlines;
00054   int pos_of_novelty;
00055   int prev_id1;
00056   flag contains_unks;
00057   int mem_alloced;
00058   char wlist_entry[1024];
00059   int end_size;
00060   int middle_size;
00061 
00062   flag displayed_oov_warning;
00063 
00064   flag context_set;
00065   flag oov_frac_set;
00066   flag disc_range_set;
00067 
00068   /*  ------------------  Process command line --------------------- */
00069 
00070   report_version(&argc,argv);
00071 
00072   if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {
00073     
00074     /* Display help message */
00075 
00076     fprintf(stderr,"idngram2lm : Convert an idngram file to a language model file.\n");
00077     fprintf(stderr,"Usage : \n");
00078     fprintf(stderr,"idngram2lm -idngram .idngram\n");
00079     fprintf(stderr,"           -vocab .vocab\n");
00080     fprintf(stderr,"           -arpa .arpa | -binary .binlm\n");
00081     fprintf(stderr,"         [ -context .ccs ]\n");
00082     fprintf(stderr,"         [ -calc_mem | -buffer 100 | -spec_num y ... z ]\n");
00083     fprintf(stderr,"         [ -vocab_type 1 ]\n");
00084     fprintf(stderr,"         [ -oov_fraction 0.5 ]\n");
00085     fprintf(stderr,"         [ -two_byte_bo_weights   \n              [ -min_bo_weight nnnnn] [ -max_bo_weight nnnnn] [ -out_of_range_bo_weights] ]\n");
00086     fprintf(stderr,"         [ -four_byte_counts ]\n");
00087     fprintf(stderr,"         [ -linear | -absolute | -good_turing | -witten_bell ]\n");
00088     fprintf(stderr,"         [ -disc_ranges 1 7 7 ]\n");
00089     fprintf(stderr,"         [ -cutoffs 0 ... 0 ]\n");
00090     fprintf(stderr,"         [ -min_unicount 0 ]\n");
00091     fprintf(stderr,"         [ -zeroton_fraction ]\n");
00092     fprintf(stderr,"         [ -ascii_input | -bin_input ]\n");
00093     fprintf(stderr,"         [ -n 3 ]  \n");
00094     fprintf(stderr,"         [ -verbosity %d ]\n",DEFAULT_VERBOSITY);
00095     exit(1);
00096   }
00097 
00098   verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);
00099   ng.n = pc_intarg(&argc, argv,"-n",DEFAULT_N);
00100 
00101   if (ng.n<1) {
00102     quit(-1,"Error: Value of n must be positive.\n");
00103   }
00104 
00105   ng.cutoffs = (cutoff_t *) pc_shortarrayarg(&argc, argv, "-cutoffs",ng.n-1,ng.n-1);
00106 
00107   if (ng.cutoffs == NULL) {
00108     ng.cutoffs = (cutoff_t *) rr_calloc(ng.n-1,sizeof(cutoff_t));
00109   }
00110 
00111   for (i=0;i<=ng.n-3;i++) {
00112     if (ng.cutoffs[i+1] < ng.cutoffs[i]) {
00113       quit(-1,"Error - cutoffs for (n+1)-gram must be greater than or equal to those for \nn-gram. You have %d-gram cutoff = %d > %d-gram cutoff = %d.\n",i+2,ng.cutoffs[i],i+3,ng.cutoffs[i+1]);
00114     }
00115   }
00116 
00117   mem_alloc_method = 0;
00118 
00119   if (pc_flagarg(&argc, argv,"-calc_mem")) {
00120     mem_alloc_method = TWO_PASSES;
00121   }
00122   
00123   buffer_size = pc_intarg(&argc, argv,"-buffer",-1);
00124 
00125   if (buffer_size != -1) {
00126     if (mem_alloc_method != 0) {
00127       quit(-1,"Assigned two contradictory methods of memory allocation.\n Use one of -calc_mem, -buffer, or -spec_num.\n");
00128     }
00129     mem_alloc_method = BUFFER;
00130   }
00131  
00132   ng.table_sizes = pc_intarrayarg(&argc, argv, "-spec_num",ng.n-1,ng.n);
00133 
00134   if (ng.table_sizes != NULL) {
00135 
00136     if (mem_alloc_method != 0) {
00137       quit(-1,"Assigned two contradictory methods of memory allocation.\n Use one of -calc_mem, -guess, or -spec_num.\n");
00138     }
00139     mem_alloc_method = SPECIFIED;
00140     for (i=ng.n-1;i>=1;i--) {
00141       ng.table_sizes[i] = ng.table_sizes[i-1];
00142     }
00143   }
00144 
00145   if (mem_alloc_method == 0) {
00146     mem_alloc_method = BUFFER;
00147     buffer_size = STD_MEM;
00148   }
00149   
00150   ng.min_unicount = pc_intarg(&argc, argv, "-min_unicount",0);
00151 
00152   ng.id_gram_filename = salloc(pc_stringarg(&argc, argv,"-idngram",""));
00153 
00154   if (!strcmp(ng.id_gram_filename,"")) {
00155     quit(-1,"Error: id ngram file not specified. Use the -idngram flag.\n");
00156   }
00157 
00158   if (!strcmp(ng.id_gram_filename,"-") && mem_alloc_method == TWO_PASSES) {
00159     quit(-1,"Error: If idngram is read from stdin, then cannot use -calc_mem option.\n");
00160   }
00161 
00162   is_ascii = 0;
00163 
00164   ascii_flag = pc_flagarg(&argc,argv,"-ascii_input");
00165   bin_flag = pc_flagarg(&argc,argv,"-bin_input");
00166 
00167   if (ascii_flag || 
00168       !strcmp(&ng.id_gram_filename[strlen(ng.id_gram_filename)-6],".ascii")) {
00169     is_ascii = 1;
00170   }
00171   else {
00172   }
00173   
00174   if (ascii_flag) {
00175     
00176     if (bin_flag) {
00177       quit(-1,"Error : Specify only one of -bin_input and -ascii_input\n");
00178     }
00179     
00180     if (!strcmp(&ng.id_gram_filename[strlen(ng.id_gram_filename)-4],".bin")) {
00181       quit(-1,"Error : -ascii_input flag specified, but input file has .bin extention.\n");
00182     }
00183     
00184   }
00185   
00186   if (bin_flag && 
00187       !strcmp(&ng.id_gram_filename[strlen(ng.id_gram_filename)-6],".ascii") ) {
00188     quit(-1,"Error : -bin_input flag specified, but input file has .ascii extention.\n");
00189   }
00190   
00191   ng.arpa_filename = salloc(pc_stringarg(&argc, argv,"-arpa",""));
00192   ng.bin_filename = salloc(pc_stringarg(&argc, argv,"-binary",""));
00193   
00194   ng.write_arpa = strcmp("",ng.arpa_filename);
00195   ng.write_bin = strcmp("",ng.bin_filename);
00196   
00197   if (!(ng.write_arpa || ng.write_bin)) {
00198     quit(-1,"Error : must specify either an arpa, or a binary output file.\n");
00199   }
00200   
00201   ng.count_table_size = DEFAULT_COUNT_TABLE_SIZE;
00202   
00203   ng.vocab_filename = salloc(pc_stringarg(&argc,argv,"-vocab",""));
00204   
00205 
00206 
00207   if (!strcmp("",ng.vocab_filename)) {
00208     quit(-1,"Error : vocabulary file not specified. Use the -vocab option.\n");
00209   }
00210 
00211 
00212   ng.context_cues_filename = salloc(pc_stringarg(&argc,argv,"-context",""));
00213 
00214   context_set = strcmp("", ng.context_cues_filename);
00215 
00216   ng.vocab_type = pc_intarg(&argc,argv,"-vocab_type",1);
00217   
00218   ng.oov_fraction = pc_doublearg(&argc, argv,"-oov_fraction",-1.0);
00219 
00220   if (ng.oov_fraction == -1.0) {
00221     oov_frac_set = 0;    
00222     ng.oov_fraction=DEFAULT_OOV_FRACTION;
00223   }
00224   else {
00225     oov_frac_set = 1;
00226     if (ng.vocab_type != 2) {
00227       pc_message(verbosity,1,"Warning : OOV fraction specified, but will not be used, since vocab type is not 2.\n");
00228     }
00229   }
00230 
00231   if (ng.vocab_type == 0) {
00232     ng.first_id = 1;
00233   }
00234   else {
00235     ng.first_id = 0;
00236   }
00237 
00238   /* Allow both "min_alpha" etc and "min_bo_weight" etc as valid
00239      syntax. The "bo_weight" form is preferred, but the "alpha" form is
00240      maintained as it was present in version 2.00 */
00241 
00242   ng.min_alpha = pc_doublearg(&argc,argv,"-min_alpha",DEFAULT_MIN_ALPHA);
00243   ng.max_alpha = pc_doublearg(&argc,argv,"-max_alpha",DEFAULT_MAX_ALPHA);
00244   ng.out_of_range_alphas = pc_intarg(&argc,argv,"-out_of_range_alphas",
00245                                      DEFAULT_OUT_OF_RANGE_ALPHAS);
00246 
00247   ng.min_alpha = pc_doublearg(&argc,argv,"-min_bo_weight",ng.min_alpha);
00248   ng.max_alpha = pc_doublearg(&argc,argv,"-max_bo_weight",ng.max_alpha);
00249   ng.out_of_range_alphas = pc_intarg(&argc,argv,"-out_of_range_bo_weights",
00250                                      ng.out_of_range_alphas);
00251 
00252 
00253   
00254   if (ng.min_alpha >= ng.max_alpha) {
00255     quit(-1,"Error : Minimum of alpha range must be less than the maximum.\n");
00256   }
00257 
00258   ng.discounting_method = 0;
00259   
00260   if (pc_flagarg(&argc, argv,"-linear")) {
00261     ng.discounting_method = LINEAR;
00262   }
00263 
00264   if (pc_flagarg(&argc,argv,"-absolute")) {
00265     if (ng.discounting_method != 0) {
00266       quit(-1,"Error : Assigned two contradictory discounting methods.\nSpecify one of -linear, -absolute, -good_turing or -witten_bell.\n");
00267     }
00268     ng.discounting_method = ABSOLUTE;
00269   }
00270 
00271   if (pc_flagarg(&argc,argv,"-witten_bell")) {
00272     if (ng.discounting_method != 0) {
00273       quit(-1,"Error : Assigned two contradictory discounting methods.\nSpecify one of -linear, -absolute, -good_turing or -witten_bell.\n");
00274     }
00275     ng.discounting_method = WITTEN_BELL;
00276   }
00277 
00278   if (pc_flagarg(&argc,argv,"-good_turing")) {
00279     if (ng.discounting_method != 0) {
00280       quit(-1,"Error : Assigned two contradictory discounting methods.\nSpecify one of -linear, -absolute, -good_turing or -witten_bell.\n");
00281     }
00282     ng.discounting_method = GOOD_TURING;
00283   }
00284 
00285   if (ng.discounting_method == 0) {
00286     ng.discounting_method = GOOD_TURING;
00287   }
00288 
00289   ng.disc_range = (unsigned short *) pc_shortarrayarg(&argc, argv, "-disc_ranges",ng.n,ng.n);
00290 
00291   disc_range_set = (ng.disc_range != NULL);
00292 
00293 
00294   if (ng.discounting_method == GOOD_TURING) {
00295     if (!disc_range_set) {
00296       ng.disc_range = (unsigned short *) rr_malloc(sizeof(unsigned short) * ng.n);
00297       ng.disc_range[0] = DEFAULT_DISC_RANGE_1;
00298       for (i=1;i<=ng.n-1;i++) {
00299         ng.disc_range[i] = DEFAULT_DISC_RANGE_REST;
00300       }
00301     }
00302     ng.fof_size = (unsigned short *) rr_malloc(sizeof(unsigned short) * ng.n);
00303     for (i=0;i<=ng.n-1;i++) {
00304       ng.fof_size[i] = ng.disc_range[i]+1;
00305     }
00306   }
00307   else {
00308     if (disc_range_set) {
00309       pc_message(verbosity,2,"Warning : discount ranges specified will be ignored, since they only apply\nto Good Turing discounting.\n");
00310     }
00311   }
00312 
00313   ng.four_byte_alphas = !(pc_flagarg(&argc, argv, "-two_byte_alphas") || 
00314                           pc_flagarg(&argc, argv, "-two_byte_bo_weights"));
00315 
00316   ng.four_byte_counts = pc_flagarg(&argc, argv, "-four_byte_counts");
00317 
00318   ng.zeroton_fraction = pc_doublearg(&argc,argv,"-zeroton_fraction",1.0);
00319 
00320   /* Report parameters */
00321 
00322   pc_message(verbosity,2,"  n : %d\n",ng.n);
00323   pc_message(verbosity,2,"  Input file : %s",ng.id_gram_filename);
00324   if (is_ascii) {
00325     pc_message(verbosity,2,"     (ascii format)\n");
00326   }
00327   else {
00328     pc_message(verbosity,2,"     (binary format)\n");
00329   }
00330   pc_message(verbosity,2,"  Output files :\n");
00331   if (ng.write_arpa) {
00332     pc_message(verbosity,2,"     ARPA format   : %s\n",ng.arpa_filename);
00333   }
00334   if (ng.write_bin) {
00335     pc_message(verbosity,2,"     Binary format : %s\n",ng.bin_filename);
00336   }
00337 
00338   pc_message(verbosity,2,"  Vocabulary file : %s\n",ng.vocab_filename);
00339   if (context_set) {
00340     pc_message(verbosity,2,"  Context cues file : %s\n",ng.context_cues_filename);
00341   }
00342   pc_message(verbosity,2,"  Cutoffs :\n     ");
00343   for (i=0;i<=ng.n-2;i++) {
00344     pc_message(verbosity,2,"%d-gram : %d     ",i+2,ng.cutoffs[i]);
00345   }
00346   pc_message(verbosity,2,"\n");
00347 
00348   switch (ng.vocab_type) {
00349   case CLOSED_VOCAB:
00350     pc_message(verbosity,2,"  Vocabulary type : Closed\n");
00351     break;
00352   case OPEN_VOCAB_1:
00353     pc_message(verbosity,2,"  Vocabulary type : Open - type 1\n");
00354     break;
00355   case OPEN_VOCAB_2:
00356     pc_message(verbosity,2,"  Vocabulary type : Open - type 2\n");
00357     pc_message(verbosity,2,"     OOV fraction = %g\n",ng.oov_fraction);
00358     break;
00359   }
00360   pc_message(verbosity,2,"  Minimum unigram count : %d\n",ng.min_unicount);
00361   pc_message(verbosity,2,"  Zeroton fraction : %g\n",ng.zeroton_fraction);
00362   if (ng.four_byte_counts) { 
00363     pc_message(verbosity,2,"  Counts will be stored in four bytes.\n");
00364   }
00365   else {
00366     pc_message(verbosity,2,"  Counts will be stored in two bytes.\n");
00367     pc_message(verbosity,2,"  Count table size : %d\n",ng.count_table_size);
00368   }
00369   pc_message(verbosity,2,"  Discounting method : ");
00370   switch (ng.discounting_method) {
00371   case GOOD_TURING:
00372     pc_message(verbosity,2,"Good-Turing\n");
00373     pc_message(verbosity,2,"     Discounting ranges :\n        ");
00374     for (i=0;i<=ng.n-1;i++) {
00375       pc_message(verbosity,2,"%d-gram : %d     ",i+1,ng.disc_range[i]);
00376     }
00377     pc_message(verbosity,2,"\n");
00378     break;
00379   case ABSOLUTE:
00380     pc_message(verbosity,2,"Absolute\n");
00381     break;
00382   case LINEAR:
00383     pc_message(verbosity,2,"Linear\n");
00384     break;
00385   case WITTEN_BELL:
00386     pc_message(verbosity,2,"Witten-Bell\n");
00387     break;
00388   }
00389   pc_message(verbosity,2,"  Memory allocation for tree structure : \n");
00390   switch(mem_alloc_method) {
00391   case TWO_PASSES:
00392     pc_message(verbosity,2,"     Perform a preliminary pass over the id n-gram file to determine \n     the amount of memory to allocate\n");
00393     break;
00394   case BUFFER:
00395     pc_message(verbosity,2,"     Allocate %d MB of memory, shared equally between all n-gram tables.\n",buffer_size);
00396     break;
00397   case SPECIFIED:
00398     pc_message(verbosity,2,"     Memory requirement specified.\n          ");
00399     for (i=0;i<=ng.n-2;i++) {
00400       pc_message(verbosity,2,"%d-gram : %d     ",i+2,ng.table_sizes[i+1]);
00401     }
00402     pc_message(verbosity,2,"\n");
00403     break;
00404   }
00405   pc_message(verbosity,2,"  Back-off weight storage : \n");
00406   if (ng.four_byte_alphas) {
00407     pc_message(verbosity,2,"     Back-off weights will be stored in four bytes.\n");
00408   }
00409   else {
00410     pc_message(verbosity,2,"     Back-off weights will be stored in two bytes.\n");
00411     pc_message(verbosity,2,"        Minimum back-off weight : %g\n",ng.min_alpha);
00412     pc_message(verbosity,2,"        Maximum back-off weight : %g\n",ng.max_alpha);
00413     pc_message(verbosity,2,"        Maximum number of out of range back-off weights : %d\n",ng.out_of_range_alphas);
00414   }
00415 
00416   pc_report_unk_args(&argc,argv,verbosity);
00417 
00418 
00419   /* Attempt to open all the files that we will need for input and
00420      output. It is better to do it here than to spend a few hours of
00421      CPU processing id-gram counts, only to find that the output path
00422      is invalid. */
00423 
00424   ng.id_gram_fp = rr_iopen(ng.id_gram_filename);
00425 
00426   /* Vocab is read by Roni's function which does the file opening for
00427      us, so no need to do it here. Don't need to worry about time
00428      being lost if file doesn't exist, since vocab is first thing to
00429      be read anyway. */
00430 
00431   if (context_set) {
00432     ng.context_cues_fp = rr_iopen(ng.context_cues_filename);
00433   }
00434 
00435   if (ng.write_arpa) {
00436     ng.arpa_fp = rr_oopen(ng.arpa_filename);
00437   }
00438 
00439   if (ng.write_bin) {
00440     ng.bin_fp = rr_oopen(ng.bin_filename);
00441   }
00442 
00443 
00444   /* --------------- Read in the vocabulary -------------- */
00445 
00446   pc_message(verbosity,2,"Reading vocabulary.\n");
00447 
00448   ng.vocab_ht = sih_create(1000,0.5,2.0,1);
00449 
00450   read_voc(ng.vocab_filename,verbosity,ng.vocab_ht,&ng.vocab,&(ng.vocab_size));
00451   
00452   /* Determine which of the vocabulary words are context cues */
00453 
00454   ng.no_of_ccs = 0;
00455   ng.context_cue = (flag *) rr_calloc(ng.vocab_size+1,sizeof(flag));
00456 
00457   if (context_set) {
00458 
00459     while (fgets (wlist_entry, sizeof (wlist_entry),ng.context_cues_fp)) {
00460       if (strncmp(wlist_entry,"##",2)==0) continue;
00461       sscanf (wlist_entry, "%s ",current_cc);
00462       if (strncmp(wlist_entry,"#",1)==0) {
00463         fprintf(stderr,"\n\n===========================================================\n");
00464         fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
00465         fprintf(stderr,     ">>> %s <<<\n",wlist_entry);
00466         fprintf(stderr,     "         '%s' will be included in the context cues list\n",current_cc);
00467         fprintf(stderr,     "         (comments must start with '##')\n");
00468         fprintf(stderr,"===========================================================\n\n");
00469       }
00470       
00471 
00472       if (sih_lookup(ng.vocab_ht,current_cc,&current_cc_id) == 0) {
00473         pc_message(verbosity,1,"Warning : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
00474       }
00475       else {
00476         ng.context_cue[(unsigned short) current_cc_id] = 1;
00477         pc_message(verbosity,2,"Context cue word : %s id = %d\n",current_cc,current_cc_id);
00478         ng.no_of_ccs++;
00479       }
00480     }
00481     rr_iclose(ng.context_cues_fp);
00482   }
00483 
00484   if ((sih_lookup(ng.vocab_ht,"<s>",&test_cc_id) != 0)) {
00485     if (ng.context_cue[(unsigned short) test_cc_id] == 0) {
00486       fprintf(stderr,"WARNING: <s> appears as a vocabulary item, but is not labelled as a\ncontext cue.\n");
00487     }
00488   }
00489 
00490   if ((sih_lookup(ng.vocab_ht,"<p>",&test_cc_id) != 0)) {
00491     if (ng.context_cue[(unsigned short) test_cc_id] == 0) {
00492       fprintf(stderr,"WARNING: <p> appears as a vocabulary item, but is not labelled as a\ncontext cue.\n");
00493     }
00494   }
00495 
00496   if ((sih_lookup(ng.vocab_ht,"<art>",&test_cc_id) != 0)) {
00497     if (ng.context_cue[(unsigned short) test_cc_id] == 0) {
00498       fprintf(stderr,"WARNING: <art> appears as a vocabulary item, but is not labelled as a\ncontext cue.\n");
00499     }
00500   }
00501        
00502                      
00503   /* Allocate space for the table_size array */
00504 
00505   if (ng.n>1) {
00506 
00507     switch(mem_alloc_method) {
00508 
00509     case TWO_PASSES: 
00510       ng.table_sizes = (table_size_t *) rr_calloc(ng.n,sizeof(table_size_t));
00511       pc_message(verbosity,2,"Calculating memory requirement.\n");
00512       calc_mem_req(&ng,is_ascii);
00513       break;
00514     case BUFFER:
00515       ng.table_sizes = (table_size_t *) rr_malloc(ng.n*sizeof(table_size_t));
00516       middle_size = sizeof(count_ind_t) + sizeof(bo_weight_t) + 
00517         sizeof(index__t) + sizeof(id__t);
00518       end_size = sizeof(count_ind_t) + sizeof(id__t);
00519       if (ng.four_byte_alphas) {
00520         middle_size += 2;
00521       }
00522       if (ng.four_byte_counts) {
00523         middle_size += 2;
00524         end_size += 2;
00525       }
00526    
00527 
00528       guess_mem(buffer_size,
00529                 middle_size,
00530                 end_size,
00531                 ng.n,
00532                 ng.table_sizes,
00533                 verbosity);
00534       break;
00535     case SPECIFIED:
00536       break;
00537 
00538     }
00539   
00540   }
00541   else {
00542 
00543     ng.table_sizes = (table_size_t *) rr_calloc(1,sizeof(table_size_t));
00544 
00545   }
00546 
00547 
00548 
00549   ng.table_sizes[0] = ng.vocab_size+1;
00550 
00551   /* ----------- Allocate memory for tree structure -------------- */
00552 
00553   ng.count = (count_ind_t **) rr_malloc(sizeof(count_ind_t *)*ng.n);
00554   ng.count4 = (int **) rr_malloc(sizeof(int *)*ng.n);
00555 
00556 
00557   ng.count_table = (count_t **) rr_malloc(sizeof(count_t *)*ng.n);
00558 
00559   if (!ng.four_byte_counts) {
00560     for (i=0;i<=ng.n-1;i++) {
00561       ng.count_table[i] = (count_t *) rr_calloc(ng.count_table_size,
00562                                                 sizeof(count_t));
00563     }
00564     ng.marg_counts = (count_ind_t *) rr_malloc(sizeof(count_ind_t)*
00565                                                ng.table_sizes[0]);
00566   }
00567   else {
00568     ng.marg_counts4 = (int *) rr_malloc(sizeof(int)*ng.table_sizes[0]);
00569   }
00570 
00571   ng.word_id = (id__t **) rr_malloc(sizeof(id__t *)*ng.n);
00572   if (ng.four_byte_alphas) {
00573     ng.bo_weight4 = (four_byte_t **) rr_malloc(sizeof(four_byte_t *)*ng.n);
00574   }
00575   else {
00576     ng.bo_weight = (bo_weight_t **) rr_malloc(sizeof(bo_weight_t *)*ng.n);
00577   }
00578 
00579   ng.ind = (index__t **)  rr_malloc(sizeof(index__t *)*ng.n);
00580 
00581   /* First table */
00582 
00583   if (ng.four_byte_counts) {
00584     ng.count4[0] = (int *) rr_calloc(ng.table_sizes[0],sizeof(int));
00585   }
00586   else {
00587     ng.count[0] = (count_ind_t *) rr_calloc(ng.table_sizes[0],
00588                                             sizeof(count_ind_t));
00589   }
00590   ng.uni_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)*
00591                                            ng.table_sizes[0]);
00592   ng.uni_log_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)*
00593                                                ng.table_sizes[0]);
00594   if (ng.four_byte_alphas) {
00595     ng.bo_weight4[0] = (four_byte_t *) rr_malloc(sizeof(four_byte_t)*
00596                                                 ng.table_sizes[0]);
00597   }
00598   else {
00599     ng.bo_weight[0] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t)*
00600                                                 ng.table_sizes[0]);
00601   }
00602 
00603   if (ng.n >=2 ) {
00604     ng.ind[0] = (index__t *) rr_calloc(ng.table_sizes[0],sizeof(index__t));
00605   }
00606 
00607   for (i=1;i<=ng.n-2;i++) {
00608     
00609     ng.word_id[i] = (id__t *) rr_malloc(sizeof(id__t)*ng.table_sizes[i]);
00610     if (ng.four_byte_counts) {
00611       ng.count4[i] = (int *) rr_malloc(sizeof(int)*ng.table_sizes[i]);
00612     }
00613     else {
00614       ng.count[i] = (count_ind_t *) 
00615         rr_malloc(sizeof(count_ind_t)*ng.table_sizes[i]);
00616     }
00617     if (ng.four_byte_alphas) {
00618       ng.bo_weight4[i] = (four_byte_t *) 
00619         rr_malloc(sizeof(four_byte_t)*ng.table_sizes[i]);
00620     }
00621     else {
00622       ng.bo_weight[i] = (bo_weight_t *) 
00623         rr_malloc(sizeof(bo_weight_t)*ng.table_sizes[i]);
00624     }
00625     
00626     ng.ind[i] = (index__t *) rr_malloc(sizeof(index__t)*ng.table_sizes[i]);
00627 
00628     mem_alloced = sizeof(count_ind_t) + sizeof(bo_weight_t) + 
00629                 sizeof(index__t) + sizeof(id__t);
00630     
00631     if (ng.four_byte_alphas) {
00632       mem_alloced += 2;
00633     }
00634 
00635     if (ng.four_byte_counts) {
00636       mem_alloced += 2;
00637     }
00638    
00639     mem_alloced *= ng.table_sizes[i];
00640     
00641     pc_message(verbosity,2,"Allocated %d bytes to table for %d-grams.\n",
00642                mem_alloced,i+1);
00643     
00644   }
00645 
00646   ng.word_id[ng.n-1] = (id__t *) 
00647     rr_malloc(sizeof(id__t)*ng.table_sizes[ng.n-1]);
00648   if (ng.four_byte_counts) {
00649     ng.count4[ng.n-1] = (int *) rr_malloc(sizeof(int)*ng.table_sizes[ng.n-1]);
00650     pc_message(verbosity,2,"Allocated %d bytes to table for %d-grams.\n",
00651                (sizeof(int) + 
00652                 sizeof(id__t))*ng.table_sizes[ng.n-1],ng.n);
00653     
00654   }
00655   else {
00656     ng.count[ng.n-1] = (count_ind_t *) 
00657       rr_malloc(sizeof(count_ind_t)*ng.table_sizes[ng.n-1]);
00658     pc_message(verbosity,2,"Allocated %d bytes to table for %d-grams.\n",
00659                (sizeof(count_ind_t) + 
00660                 sizeof(id__t))*ng.table_sizes[ng.n-1],ng.n);
00661 
00662   }
00663 
00664 
00665 
00666   
00667   /* Allocate memory for table for first-byte of indices */
00668 
00669   ng.ptr_table = (int **) rr_malloc(sizeof(int *)*ng.n);
00670   ng.ptr_table_size = (unsigned short *) 
00671     rr_calloc(ng.n,sizeof(unsigned short));
00672   for (i=0;i<=ng.n-1;i++) {
00673     ng.ptr_table[i] = (int *) rr_calloc(65535,sizeof(int));
00674   }
00675 
00676   /* Allocate memory for alpha array */
00677 
00678   ng.alpha_array = (double *) rr_malloc(sizeof(double)*ng.out_of_range_alphas);
00679   ng.size_of_alpha_array = 0;
00680 
00681   /* Allocate memory for frequency of frequency information */
00682 
00683   ng.freq_of_freq = (int **) rr_malloc(sizeof(int *)*ng.n);
00684 
00685   switch(ng.discounting_method) {
00686   case LINEAR:
00687     for (i=0;i<=ng.n-1;i++) {
00688       ng.freq_of_freq[i] = (int *) rr_calloc(2,sizeof(int));
00689     }
00690     break;
00691   case GOOD_TURING:
00692     for (i=0;i<=ng.n-1;i++) {
00693       ng.freq_of_freq[i] = (int *) rr_calloc(ng.fof_size[i]+1,sizeof(int));
00694     }
00695     break;
00696   case ABSOLUTE:
00697     for (i=0;i<=ng.n-1;i++) {
00698       ng.freq_of_freq[i] = (int *) rr_calloc(3,sizeof(int));
00699     }
00700     ng.abs_disc_const = (double *) rr_malloc(sizeof(double)*ng.n);
00701 
00702     break;
00703   case WITTEN_BELL:
00704     ng.freq_of_freq[0] = (int *) rr_calloc(1,sizeof(int));
00705     break;
00706 
00707   }
00708 
00709 
00710   /* Read n-grams into the tree */
00711 
00712 
00713   pc_message(verbosity,2,"Processing id n-gram file.\n");
00714   pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");
00715 
00716   /* Allocate space for ngrams id arrays */
00717 
00718   current_ngram.id_array = (id__t *) rr_calloc(ng.n,sizeof(id__t));
00719   previous_ngram.id_array = (id__t *) rr_calloc(ng.n,sizeof(id__t));
00720   current_ngram.n = ng.n;
00721   previous_ngram.n = ng.n;
00722   
00723   ng.num_kgrams = (int *) rr_calloc(ng.n,sizeof(int));
00724   ng_count = (count_t *) rr_calloc(ng.n,sizeof(count_t));
00725   nlines = 1;
00726   ng.n_unigrams = 0;
00727   
00728   /* Process first n-gram */
00729   
00730   get_ngram(ng.id_gram_fp,&current_ngram,is_ascii);
00731 
00732   contains_unks = 0;
00733   for (i=0;i<=ng.n-1;i++) {
00734     if (current_ngram.id_array[i] == 0) {
00735       contains_unks = 1;
00736     }
00737   }
00738 
00739   while (ng.vocab_type == CLOSED_VOCAB && contains_unks){
00740 
00741     get_ngram(ng.id_gram_fp,&current_ngram,is_ascii);
00742     contains_unks = 0;
00743     for (i=0;i<=ng.n-1;i++) {
00744       if (current_ngram.id_array[i] == 0) {
00745         contains_unks = 1;
00746       }
00747     }
00748   }
00749 
00750   for (i=0;i<=ng.n-2;i++) {
00751     ng.ind[i][0] = new_index(0,ng.ptr_table[i],&(ng.ptr_table_size[i]),0);
00752     ng.word_id[i+1][0] = current_ngram.id_array[i+1];
00753     ng.num_kgrams[i+1]++;
00754     ng_count[i] = current_ngram.count;
00755   }
00756   ng_count[0] = current_ngram.count;
00757   
00758   if (ng.discounting_method == GOOD_TURING && 
00759       current_ngram.count <= ng.fof_size[ng.n-1]) {
00760     
00761     if (current_ngram.count <= 0) {
00762       quit(-1,"Error in idngram stream. This is most likely to be caused by trying to read\na gzipped file as if it were uncompressed. Ensure that all gzipped files have\na .gz extension. Other causes might be confusion over whether the file is in\nascii or binary format.\n");
00763     }
00764 
00765     ng.freq_of_freq[ng.n-1][current_ngram.count]++;
00766   }
00767   
00768   if (ng.discounting_method == LINEAR && current_ngram.count == 1) {
00769     ng.freq_of_freq[ng.n-1][1]++;
00770   }
00771           
00772   if (ng.discounting_method == ABSOLUTE && current_ngram.count <= 2) {
00773 
00774     if (current_ngram.count <= 0) {
00775       quit(-1,"Error in idngram stream. This is most likely to be caused by trying to read\na gzipped file as if it were uncompressed. Ensure that all gzipped files have\na .gz extension. Other causes might be confusion over whether the file is in\nascii or binary format.\n");
00776     }
00777 
00778     ng.freq_of_freq[ng.n-1][current_ngram.count]++;
00779   }
00780 
00781     store_count(ng.four_byte_counts,
00782                 ng.count_table[ng.n-1],
00783                 ng.count_table_size,
00784                 ng.count[ng.n-1],
00785                 ng.count4[ng.n-1],
00786                 0,
00787                 current_ngram.count); 
00788   
00789   if (current_ngram.count <= ng.cutoffs[ng.n-2]) {
00790     ng.num_kgrams[ng.n-1]--;
00791   }
00792   prev_id1 = current_ngram.id_array[0];
00793     
00794   displayed_oov_warning = 0;
00795 
00796   for (i=0;i<=ng.n-1;i++) {
00797     previous_ngram.id_array[i] = current_ngram.id_array[i];
00798   }
00799   previous_ngram.count = current_ngram.count;
00800 
00801 
00802   while (!rr_feof(ng.id_gram_fp)) {
00803 
00804 
00805     if (get_ngram(ng.id_gram_fp,&current_ngram,is_ascii)) {
00806     
00807       if (ng.vocab_type == CLOSED_VOCAB) {
00808         contains_unks = 0;
00809         for (i=0;i<=ng.n-1;i++) {
00810           if (current_ngram.id_array[i] == 0) {
00811             contains_unks = 1;
00812           }
00813         }
00814       }
00815     
00816       if (!contains_unks || ng.vocab_type != CLOSED_VOCAB) {
00817 
00818 
00819         
00820 
00821   
00822         /* Test for where this ngram differs from last - do we have an
00823            out-of-order ngram? */
00824       
00825         pos_of_novelty = ng.n;
00826 
00827         for (i=0;i<=ng.n-1;i++) {
00828           if (current_ngram.id_array[i] > previous_ngram.id_array[i]) {
00829             pos_of_novelty = i;
00830             i=ng.n;
00831           }
00832           else {
00833             if (current_ngram.id_array[i] < previous_ngram.id_array[i]) {
00834               if (nlines < 5) { /* Error occurred early - file format? */
00835                 quit(-1,"Error : n-gram ordering problem - could be due to using wrong file format.\nCheck whether id n-gram file is in ascii or binary format.\n");
00836               }
00837               else {
00838                 quit(-1,"Error : n-grams are not correctly ordered. Error occurred at ngram %d.\n",nlines);
00839               }
00840             }
00841           }
00842         }
00843 
00844         if (pos_of_novelty == ng.n) {
00845           if (nlines > 3) {
00846             quit(-1,"Error - same n-gram appears twice in idngram stream.\n");
00847           }
00848           else {
00849             quit(-1,"Error in the idngram stream. It appears that the same n-gram occurs twice\n in the stream. Check that text2idngram exited successfully, and the \nformat (binary/ascii) of the idngram file.\n");
00850           }
00851         }
00852     
00853         nlines++;
00854     
00855         if (nlines % 20000 == 0) {
00856           if (nlines % 1000000 == 0) {
00857             pc_message(verbosity,2,".\n");
00858           }
00859           else {
00860             pc_message(verbosity,2,".");
00861           }
00862         }
00863     
00864         /* Add new n-gram as soon as it is encountered */
00865     
00866         /* If all of the positions 2,3,...,n of the n-gram are context
00867            cues then ignore the n-gram. */
00868     
00869         if (ng.n > 1) {
00870       
00871           store_count(ng.four_byte_counts,
00872                       ng.count_table[ng.n-1],
00873                       ng.count_table_size,
00874                       ng.count[ng.n-1],
00875                       ng.count4[ng.n-1],
00876                       ng.num_kgrams[ng.n-1],
00877                       current_ngram.count);
00878 
00879           
00880           if (ng.discounting_method == GOOD_TURING && 
00881               current_ngram.count <= ng.fof_size[ng.n-1]) {
00882 
00883             if (current_ngram.count <= 0) {
00884               quit(-1,"Error in idngram stream. This is most likely to be caused by trying to read\na gzipped file as if it were uncompressed. Ensure that all gzipped files have\na .gz extension. Other causes might be confusion over whether the file is in\nascii or binary format.\n");
00885             }
00886             
00887             ng.freq_of_freq[ng.n-1][current_ngram.count]++;
00888           }
00889           
00890           if (ng.discounting_method == LINEAR && current_ngram.count == 1) {
00891             ng.freq_of_freq[ng.n-1][1]++;
00892           }
00893           
00894           if (ng.discounting_method == ABSOLUTE && current_ngram.count <= 2) {
00895 
00896             if (current_ngram.count <= 0) {
00897               quit(-1,"Error in idngram stream. This is most likely to be caused by trying to read\na gzipped file as if it were uncompressed. Ensure that all gzipped files have\na .gz extension. Other causes might be confusion over whether the file is in\nascii or binary format.\n");
00898             }
00899             
00900             ng.freq_of_freq[ng.n-1][current_ngram.count]++;
00901           }
00902           
00903           ng.word_id[ng.n-1][ng.num_kgrams[ng.n-1]] = 
00904             current_ngram.id_array[ng.n-1];
00905           
00906           ng.num_kgrams[ng.n-1]++;
00907           
00908           
00909           if (ng.num_kgrams[ng.n-1] >= ng.table_sizes[ng.n-1]) {
00910             quit(-1,"\nMore than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng.table_sizes[ng.n-1],ng.n);
00911           }
00912 
00913         }
00914         /* Deal with new 2,3,...,(n-1)-grams */
00915       
00916         for (i=ng.n-2;i>=MAX(1,pos_of_novelty);i--) {
00917 
00918           if (ng.discounting_method == GOOD_TURING && 
00919               ng_count[i] <= ng.fof_size[i]) {
00920             ng.freq_of_freq[i][ng_count[i]]++;
00921           }
00922 
00923           if (ng.discounting_method == LINEAR && ng_count[i] == 1) {
00924             ng.freq_of_freq[i][1]++;
00925           }
00926 
00927           if (ng.discounting_method == ABSOLUTE && ng_count[i] <= 2) {
00928             ng.freq_of_freq[i][ng_count[i]]++;
00929           }
00930           
00931           if (ng_count[i] <= ng.cutoffs[i-1]) {
00932             ng.num_kgrams[i]--;
00933           }
00934           else {
00935             store_count(ng.four_byte_counts,
00936                         ng.count_table[i],
00937                         ng.count_table_size,
00938                         ng.count[i],
00939                         ng.count4[i],
00940                         ng.num_kgrams[i]-1,
00941                         ng_count[i]);
00942           }
00943           ng_count[i] = current_ngram.count;
00944           ng.word_id[i][ng.num_kgrams[i]] = current_ngram.id_array[i];
00945           ng.ind[i][ng.num_kgrams[i]] = new_index(ng.num_kgrams[i+1]-1,
00946                                                   ng.ptr_table[i],
00947                                                   &(ng.ptr_table_size[i]),
00948                                                   ng.num_kgrams[i]);
00949 
00950           ng.num_kgrams[i]++;
00951         
00952           if (ng.num_kgrams[i] >= ng.table_sizes[i]) {
00953             quit(-1,"More than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng.table_sizes[i],i+1);
00954           }
00955   
00956         }
00957 
00958         /* this was original place - messes up for bigram models */
00959 
00960         /*      if (current_ngram.count <= ng.cutoffs[ng.n-2]) {
00961                 ng.num_kgrams[ng.n-1]--;
00962                 } */
00963       
00964         for (i=0;i<=pos_of_novelty-1;i++) {
00965           ng_count[i] += current_ngram.count;
00966         }
00967       
00968         /* Deal with new 1-grams */
00969       
00970         if (pos_of_novelty == 0) {
00971           if (ng.n>1) {
00972 
00973             for (i = prev_id1 + 1; i <= current_ngram.id_array[0]; i++) {
00974               ng.ind[0][i] = new_index(ng.num_kgrams[1]-1,
00975                                        ng.ptr_table[0],
00976                                        &(ng.ptr_table_size[0]),
00977                                        i);
00978             }
00979             prev_id1 = current_ngram.id_array[0];
00980 
00981           }
00982 
00983           if (ng.discounting_method == GOOD_TURING && 
00984               ng_count[0] <= ng.fof_size[0]) {
00985             ng.freq_of_freq[0][ng_count[0]]++;
00986           }
00987 
00988           if (ng.discounting_method == LINEAR && ng_count[0] == 1) {
00989             ng.freq_of_freq[0][1]++;
00990           }
00991 
00992           if (ng.discounting_method == ABSOLUTE && ng_count[0] <= 2) {
00993             ng.freq_of_freq[0][ng_count[0]]++;
00994           }
00995 
00996           if (!ng.context_cue[previous_ngram.id_array[0]]) {
00997             ng.n_unigrams += ng_count[0];
00998 
00999             store_count(ng.four_byte_counts,
01000                         ng.count_table[0],
01001                         ng.count_table_size,
01002                         ng.count[0],
01003                         ng.count4[0],
01004                         previous_ngram.id_array[0],
01005                         ng_count[0]); 
01006 
01007           }
01008 
01009           store_count(ng.four_byte_counts,
01010                       ng.count_table[0],
01011                       ng.count_table_size,
01012                       ng.marg_counts,
01013                       ng.marg_counts4,
01014                       previous_ngram.id_array[0],
01015                       ng_count[0]);
01016                       
01017           ng_count[0] = current_ngram.count;
01018         }
01019 
01020         if (current_ngram.count <= ng.cutoffs[ng.n-2]) {
01021           ng.num_kgrams[ng.n-1]--;
01022         }
01023 
01024         for (i=0;i<=ng.n-1;i++) {
01025           previous_ngram.id_array[i] = current_ngram.id_array[i];
01026         }
01027         previous_ngram.count = current_ngram.count;
01028         
01029 
01030       }
01031       else {
01032         if (!displayed_oov_warning){
01033           pc_message(verbosity,2,"Warning : id n-gram stream contains OOV's (n-grams will be ignored).\n");
01034           displayed_oov_warning = 1;
01035         }
01036       }
01037     }
01038   }
01039 
01040   rr_iclose(ng.id_gram_fp);
01041 
01042   for (i=ng.n-2;i>=1;i--) {
01043     if (ng.discounting_method == GOOD_TURING && 
01044         ng_count[i] <= ng.fof_size[i]) {
01045       ng.freq_of_freq[i][ng_count[i]]++;
01046     }
01047     
01048     if (ng.discounting_method == LINEAR && ng_count[i] == 1) {
01049       ng.freq_of_freq[i][1]++;
01050     }
01051 
01052     if (ng.discounting_method == ABSOLUTE && ng_count[i] <= 2) {
01053       ng.freq_of_freq[i][ng_count[i]]++;
01054     }
01055 
01056     if (ng_count[i] <= ng.cutoffs[i-1]) {
01057       ng.num_kgrams[i]--;
01058     }
01059     else {
01060 
01061       store_count(ng.four_byte_counts,
01062                   ng.count_table[i],
01063                   ng.count_table_size,
01064                   ng.count[i],
01065                   ng.count4[i],
01066                   ng.num_kgrams[i]-1,
01067                   ng_count[i]);
01068 
01069     }
01070   }
01071   
01072   if (ng.discounting_method == GOOD_TURING && ng_count[0] <= ng.fof_size[0]) {
01073     ng.freq_of_freq[0][ng_count[0]]++;
01074   }
01075 
01076 
01077   if (ng.discounting_method == LINEAR && ng_count[0] == 1) {
01078     ng.freq_of_freq[0][1]++;
01079   }
01080 
01081   if (ng.discounting_method == ABSOLUTE && ng_count[0] <= 2) {
01082     ng.freq_of_freq[0][ng_count[0]]++;
01083   }
01084 
01085   if (!ng.context_cue[current_ngram.id_array[0]]) {
01086     ng.n_unigrams += ng_count[0];
01087 
01088     store_count(ng.four_byte_counts,
01089                 ng.count_table[0],
01090                 ng.count_table_size,
01091                 ng.count[0],
01092                 ng.count4[0],
01093                 current_ngram.id_array[0],
01094                 ng_count[0]);
01095     
01096   }
01097 
01098   store_count(ng.four_byte_counts,
01099               ng.count_table[0],
01100               ng.count_table_size,
01101               ng.marg_counts,
01102               ng.marg_counts4,
01103               current_ngram.id_array[0],
01104               ng_count[0]);
01105 
01106   if (ng.n>1) {
01107 
01108     for (i=current_ngram.id_array[0]+1;i<=ng.vocab_size;i++) {
01109       ng.ind[0][i] = new_index(ng.num_kgrams[1],
01110                                ng.ptr_table[0],
01111                                &(ng.ptr_table_size[0]),
01112                                current_ngram.id_array[0]);
01113     }
01114   }
01115 
01116   pc_message(verbosity,2,"\n");
01117 
01118   /* Impose a minimum unigram count, if required */
01119 
01120   if (ng.min_unicount > 0) {
01121 
01122     int nchanged;
01123     
01124     nchanged = 0;
01125 
01126     for (i=ng.first_id;i<=ng.vocab_size;i++) {
01127       if ((return_count(ng.four_byte_counts,
01128                         ng.count_table[0],
01129                         ng.count[0],
01130                         ng.count4[0],
01131                         i) < ng.min_unicount) && !ng.context_cue[i]) {
01132 
01133         switch(ng.discounting_method) {
01134         case LINEAR:
01135           if (ng.count[0][i] <= 1) {
01136             ng.freq_of_freq[0][ng.count[0][i]]--;
01137           }
01138           break;
01139         case ABSOLUTE:
01140           if (ng.count[0][i] <= 2) {
01141             ng.freq_of_freq[0][ng.count[0][i]]--;
01142           }
01143         case GOOD_TURING:
01144           if (ng.count[0][i] <= ng.fof_size[0]) {
01145             ng.freq_of_freq[0][ng.count[0][i]]--;
01146           }
01147           break;
01148         case WITTEN_BELL:
01149           if (ng.count[0][i] == 0) {
01150             ng.freq_of_freq[0][ng.count[0][i]]--;
01151           }
01152           break;
01153         }
01154         ng.n_unigrams += (ng.min_unicount - ng.count[0][i]);
01155 
01156         store_count(ng.four_byte_counts,
01157                     ng.count_table[0],
01158                     ng.count_table_size,
01159                     ng.count[0],
01160                     ng.count4[0],
01161                     i,
01162                     ng.min_unicount);
01163 
01164         nchanged++;
01165       }
01166     }
01167 
01168     if (nchanged > 0) {
01169       pc_message(verbosity,2,
01170                  "Unigram counts of %d words were bumped up to %d.\n",
01171                  nchanged,ng.min_unicount);
01172     }
01173 
01174   }
01175 
01176   /* Count zeroton information for unigrams */
01177 
01178   ng.freq_of_freq[0][0] = 0;
01179   
01180   for (i=ng.first_id;i<=ng.vocab_size;i++) {
01181     if (return_count(ng.four_byte_counts,
01182                      ng.count_table[0],
01183                      ng.count[0],
01184                      ng.count4[0],
01185                      i) == 0) {
01186       ng.freq_of_freq[0][0]++;
01187     }
01188   }
01189   
01190 
01191   if (ng.discounting_method == GOOD_TURING) {
01192     for (i=0;i<=ng.n-1;i++) {
01193       for (j=1;j<=ng.fof_size[i];j++) {
01194         pc_message(verbosity,3,"fof[%d][%d] = %d\n",i,j,ng.freq_of_freq[i][j]);
01195       }
01196     }
01197   }
01198 
01199   
01200 
01201 
01202   /* Calculate discounted counts */
01203 
01204   pc_message(verbosity,2,"Calculating discounted counts.\n");
01205 
01206   switch(ng.discounting_method) {
01207   case GOOD_TURING:
01208 
01209     ng.gt_disc_ratio = (disc_val_t **) rr_malloc(sizeof(disc_val_t *)*ng.n);
01210     
01211     for (i=0;i<=ng.n-1;i++) {
01212       ng.gt_disc_ratio[i] = (disc_val_t *) 
01213         rr_malloc(sizeof(disc_val_t)*ng.fof_size[i]);
01214     }
01215     
01216     for (i=0;i<=ng.n-1;i++) {
01217       if (i==0) {
01218         compute_gt_discount(i+1,
01219                             ng.freq_of_freq[0],
01220                             ng.fof_size[0],
01221                             &ng.disc_range[0],
01222                             0,
01223                             verbosity,
01224                             &ng.gt_disc_ratio[0]);
01225       }
01226       else {
01227         compute_gt_discount(i+1,
01228                             ng.freq_of_freq[i],
01229                             ng.fof_size[i],
01230                             &ng.disc_range[i],
01231                             ng.cutoffs[i-1],
01232                             verbosity,
01233                             &ng.gt_disc_ratio[i]);
01234       }
01235     }
01236     break;
01237   case WITTEN_BELL:
01238     break;
01239   case LINEAR:
01240     ng.lin_disc_ratio = (disc_val_t *) rr_malloc(sizeof(disc_val_t)*ng.n);
01241     pc_message(verbosity,1,"Linear discounting ratios :\n");
01242     for (i=0;i<=ng.n-1;i++) {
01243       ng.lin_disc_ratio[i] = 1 - ( (float) ng.freq_of_freq[i][1]/
01244                                    (float) ng.n_unigrams);
01245       pc_message(verbosity,1,"%d-gram : %g\n",i+1,ng.lin_disc_ratio[i]);
01246     }
01247 
01248     break;
01249   case ABSOLUTE:
01250     pc_message(verbosity,1,"Absolute discounting ratios :\n");
01251     for (i=0;i<=ng.n-1;i++) {
01252       ng.abs_disc_const[i] = ((float) ng.freq_of_freq[i][1] ) /
01253         ((float) ng.freq_of_freq[i][1] + (2*ng.freq_of_freq[i][2]) );
01254       pc_message(verbosity,1,"%d-gram : ",i+1);
01255       for (j=1;j<=5;j++) {
01256         pc_message(verbosity,1,"%g ",(j-ng.abs_disc_const[i])/j);
01257       }
01258       pc_message(verbosity,1," ... \n");
01259     }
01260     break;
01261   }
01262      
01263      
01264   /* Smooth unigram distribution, to give some mass to zerotons */
01265      
01266   compute_unigram(&ng,verbosity);
01267 
01268   /* Increment Contexts if using Good-Turing discounting. No need otherwise,
01269      since all values are discounted anyway. */
01270 
01271   if (ng.discounting_method == GOOD_TURING) {
01272     pc_message(verbosity,2,"Incrementing contexts...\n");  
01273 
01274     for (i=ng.n-1;i>=1;i--) {
01275       
01276       increment_context(&ng,i,verbosity);
01277       
01278     }
01279   }
01280 
01281 
01282   /* Calculate back-off weights */
01283 
01284   pc_message(verbosity,2,"Calculating back-off weights...\n");
01285 
01286   for (i=1;i<=ng.n-1;i++) {
01287     compute_back_off(&ng,i,verbosity);
01288   }
01289 
01290   if (!ng.four_byte_alphas) {
01291     pc_message(verbosity,3,"Number of out of range alphas = %d\n",
01292                ng.size_of_alpha_array);
01293   }
01294 
01295   /* Write out LM */
01296 
01297   pc_message(verbosity,2,"Writing out language model...\n");
01298 
01299   if (ng.write_arpa) {
01300 
01301     write_arpa_lm(&ng,verbosity);
01302 
01303   }
01304 
01305   if (ng.write_bin) {
01306     
01307     write_bin_lm(&ng,verbosity);
01308 
01309   }
01310 
01311   pc_message(verbosity,0,"idngram2lm : Done.\n");
01312 
01313   exit(0);
01314     
01315 }
01316 
01317               

Generated on Tue Dec 21 13:54:45 2004 by doxygen 1.2.18