Main Page   Compound List   File List   Compound Members   File Members  

bo_ng_prob.c

Go to the documentation of this file.
00001 /*=====================================================================
00002                 =======   COPYRIGHT NOTICE   =======
00003 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00004 Ronald Rosenfeld and Philip Clarkson.
00005 
00006 All rights reserved.
00007 
00008 This software is made available for research purposes only.  It may be
00009 redistributed freely for this purpose, in full or in part, provided
00010 that this entire copyright notice is included on any copies of this
00011 software and applications and derivations thereof.
00012 
00013 This software is provided on an "as is" basis, without warranty of any
00014 kind, either expressed or implied, as to any matter including, but not
00015 limited to warranty of fitness of purpose, or merchantability, or
00016 results obtained from use of this software.
00017 ======================================================================*/
00018 
00019 
00025 #include "pc_libs/pc_general.h"
00026 #include "rr_libs/general.h"
00027 #include "ngram.h"
00028 #include "idngram2lm.h"
00029 #include <math.h>
00030 #include <stdlib.h>
00031 #include <stdio.h>
00032 
00033 void bo_ng_prob(int context_length,
00034                 id__t *sought_ngram,
00035                 ng_t *ng,
00036                 int verbosity,
00037                 double *p_prob,
00038                 int *bo_case) {
00039 
00040   flag found;
00041   flag found_ngram;
00042   flag found_context;
00043   flag still_found;
00044 
00045   int length_exists;
00046   int ng_begin;
00047   int ng_end;
00048   int ng_middle;
00049   int ncount;
00050   int contcount;
00051   int *ng_index;
00052 
00053   int i;
00054   
00055   int temp_case;
00056 
00057   double alpha;
00058   double prob;
00059   double discounted_ncount;
00060 
00061   /* Initialise variables (unnecessary, but gives warning-free compilation */
00062 
00063   ncount = 0;
00064   contcount = 0;
00065   alpha = 0.0;
00066   discounted_ncount = 0.0;
00067 
00068   ng_index = (int *) rr_malloc((context_length+1)*sizeof(int));
00069 
00070   if (context_length == 0) {
00071 
00072     *p_prob = ng->uni_probs[sought_ngram[0]];
00073     if (*p_prob<= 0.0 || *p_prob >= 1.0) {
00074       pc_message(verbosity,1,"Warning : P( %d ) == %g\n", 
00075                  sought_ngram[0], *p_prob);
00076     }
00077   }
00078 
00079   else {
00080 
00081     found_ngram = 0;
00082     found_context = 0;
00083     ncount = -1;
00084 
00085     /* Find the appropriate (context-length+1)-gram */
00086 
00087     length_exists = 0;
00088     still_found = 1;
00089 
00090     while (still_found && (length_exists < (context_length+1))) {
00091       
00092       found = 0;
00093 
00094       /* Look for (length_exists+1)-gram */
00095 
00096       if (length_exists == 0) {
00097         if (return_count(ng->four_byte_counts,
00098                          ng->count_table[0],
00099                          ng->marg_counts,
00100                          ng->marg_counts4,
00101                          sought_ngram[0]) > 0) {
00102           found = 1;
00103           ng_index[0] = sought_ngram[0];
00104         }
00105       }
00106       else {
00107 
00108         /* Binary search for right ngram */
00109         
00110         ng_begin = 
00111           get_full_index(ng->ind[length_exists-1][ng_index[length_exists-1]],
00112                          ng->ptr_table[length_exists-1],
00113                          ng->ptr_table_size[length_exists-1],
00114                          ng_index[length_exists-1]);
00115 
00116         if (length_exists == 1) {
00117           if (ng_index[0] < ng->vocab_size) {
00118             ng_end = 
00119               get_full_index(ng->ind[length_exists-1][ng_index[length_exists-1]+1],
00120                              ng->ptr_table[length_exists-1],
00121                              ng->ptr_table_size[length_exists-1],
00122                              ng_index[length_exists-1]+1)-1;
00123           }
00124           else {
00125             ng_end = ng->num_kgrams[1];
00126           }
00127         }
00128         else {
00129           if (ng_index[length_exists-1] < ng->num_kgrams[length_exists-1]-1) {
00130             ng_end = 
00131               get_full_index(ng->ind[length_exists-1][ng_index[length_exists-1]+1],
00132                              ng->ptr_table[length_exists-1],
00133                              ng->ptr_table_size[length_exists-1],
00134                              ng_index[length_exists-1]+1)-1;
00135           }
00136           else {
00137             ng_end = ng->num_kgrams[length_exists];
00138           }
00139         }
00140 
00141         while (ng_begin <= ng_end) {
00142           ng_middle = ng_begin + ((ng_end - ng_begin) >> 1);
00143           if (sought_ngram[length_exists] < 
00144               ng->word_id[length_exists][ng_middle]) {
00145             ng_end = ng_middle - 1;
00146           }
00147           else {
00148             if (sought_ngram[length_exists] > 
00149                 ng->word_id[length_exists][ng_middle]) {
00150               ng_begin = ng_middle + 1;
00151             }
00152             else {
00153               found = 1;
00154               ng_index[length_exists] = ng_middle;
00155               break;
00156             }
00157           }
00158         }
00159 
00160       }
00161 
00162       if (found) {
00163         length_exists++;
00164       }
00165       else {
00166         still_found = 0;
00167       }
00168 
00169     }
00170 
00171     if (length_exists == (context_length+1)) {
00172       found_ngram = 1;
00173 
00174       ncount = return_count(ng->four_byte_counts,
00175                             ng->count_table[context_length],
00176                             ng->count[context_length],
00177                             ng->count4[context_length],
00178                             ng_index[context_length]);
00179     }
00180 
00181     if (length_exists >= context_length) {
00182       found_context = 1;
00183       if (context_length == 1) {
00184         contcount = return_count(ng->four_byte_counts,
00185                                  ng->count_table[0],
00186                                  ng->marg_counts,
00187                                  ng->marg_counts4,
00188                                  ng_index[0]);
00189       }
00190       else {
00191         contcount = return_count(ng->four_byte_counts,
00192                                  ng->count_table[context_length-1],
00193                                  ng->count[context_length-1],
00194                                  ng->count4[context_length-1],
00195                                  ng_index[context_length-1]);
00196       }
00197     }
00198     if (found_context) {
00199       if (ng->four_byte_alphas) {
00200         alpha = ng->bo_weight4[context_length-1][ng_index[context_length-1]];
00201       }
00202       else {
00203         alpha = double_alpha(ng->bo_weight[context_length-1][ng_index[context_length-1]],
00204                              ng->alpha_array,
00205                              ng->size_of_alpha_array,
00206                              65535 - ng->out_of_range_alphas,
00207                              ng->min_alpha,
00208                              ng->max_alpha);
00209       }
00210     }
00211 
00212     /* If it occurred then return appropriate prob */
00213 
00214     if (found_ngram) {
00215       switch (ng->discounting_method) {
00216       case GOOD_TURING:
00217         if (ncount <= ng->disc_range[context_length]) {
00218           discounted_ncount = 
00219             ng->gt_disc_ratio[context_length][ncount]*ncount;
00220         }
00221         else {
00222           discounted_ncount = ncount;
00223         }         
00224         break;
00225       case LINEAR:
00226         discounted_ncount = ng->lin_disc_ratio[context_length]*ncount;
00227         break;
00228       case ABSOLUTE:
00229         discounted_ncount = ncount - ng->abs_disc_const[context_length];
00230         break;
00231       case WITTEN_BELL:
00232         discounted_ncount = ( (double) ((double) contcount*ncount))/
00233           ( (double) contcount+ num_of_types(context_length-1,
00234                                              ng_index[context_length-1],ng));
00235         break;
00236       }
00237 
00238       prob = discounted_ncount / (double) contcount;
00239       temp_case = 0;
00240 
00241       if (prob <= 0.0 || prob >= 1.0) {
00242         pc_message(verbosity,1,"Warning : P(%d) = %g (%g / %d)\n",
00243                    sought_ngram[0],prob, discounted_ncount,contcount);
00244         pc_message(verbosity,1,"ncount = %d\n",ncount);
00245       }
00246 
00247     }
00248     
00249     else {
00250 
00251       bo_ng_prob(context_length-1,
00252                  &sought_ngram[1],
00253                  ng,
00254                  verbosity,
00255                  &prob,
00256                  bo_case);
00257 
00258       temp_case = 2;
00259 
00260       if (found_context) {
00261         prob*=alpha;
00262         temp_case=1;
00263       }
00264       
00265       
00266 
00267     }
00268     
00269 
00270     *p_prob = prob;
00271     *bo_case += temp_case * (1 << (2*(context_length-1)));
00272     
00273   
00274 
00275   }
00276 
00277   if (*p_prob > 1.0) {
00278     fprintf(stderr,"Error : P( %d | ",sought_ngram[context_length]);
00279     for (i=0;i<=context_length-1;i++) {
00280       fprintf(stderr,"%d ",sought_ngram[i]);
00281     }
00282     fprintf(stderr,") = %g\n",*p_prob);
00283     exit(1);
00284   }
00285   free(ng_index);
00286 
00287 }

Generated on Tue Dec 21 13:54:44 2004 by doxygen1.2.18