Main Page   Compound List   File List   Compound Members   File Members  

compute_back_off.c

Go to the documentation of this file.
00001 
00002 /*=====================================================================
00003                 =======   COPYRIGHT NOTICE   =======
00004 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00005 Ronald Rosenfeld and Philip Clarkson.
00006 
00007 All rights reserved.
00008 
00009 This software is made available for research purposes only.  It may be
00010 redistributed freely for this purpose, in full or in part, provided
00011 that this entire copyright notice is included on any copies of this
00012 software and applications and derivations thereof.
00013 
00014 This software is provided on an "as is" basis, without warranty of any
00015 kind, either expressed or implied, as to any matter including, but not
00016 limited to warranty of fitness of purpose, or merchantability, or
00017 results obtained from use of this software.
00018 ======================================================================*/
00019 
00027 #include "ngram.h"
00028 #include "idngram2lm.h"
00029 #include "pc_libs/pc_general.h"
00030 #include <stdlib.h>
00031 
00032 void compute_back_off(ng_t *ng,int n, int verbosity) {
00033 
00034   int *current_pos;
00035   int *end_pos;
00036   id__t *sought_ngram;
00037   int current_table;
00038   int ng_count;
00039   int i;
00040   double sum_cond_prob;
00041   double sum_bo_prob;
00042   double discounted_ngcount;
00043   double cond_prob;
00044   double bo_prob;
00045   double discount_mass;
00046   double leftout_bo_prob;
00047   double alpha;
00048 
00049   int bo_case;
00050 
00051   sum_cond_prob = 0.0;
00052   sum_bo_prob = 0.0;
00053 
00054   /* For the sake of warning-free compilation... */
00055 
00056   discounted_ngcount = 0.0;
00057   
00058   current_pos = (int *)rr_calloc(n+1,sizeof(int));
00059   sought_ngram = (id__t *) rr_calloc(n+1,sizeof(id__t));
00060   end_pos = (int *)rr_calloc(n+1,sizeof(int)); 
00061   
00062   /* Process the tree so that we get all the n-grams out in the right
00063      order. */
00064   
00065   for (current_pos[0]=ng->first_id;
00066        current_pos[0]<=ng->vocab_size;
00067        current_pos[0]++) {
00068     
00069     if (return_count(ng->four_byte_counts,
00070                      ng->count_table[0],
00071                      ng->marg_counts,
00072                      ng->marg_counts4,
00073                      current_pos[0]) > 0) {
00074 
00075       current_table = 1;
00076       
00077       if (current_pos[0] == ng->vocab_size) {
00078         end_pos[1] = ng->num_kgrams[1]-1;
00079       }
00080       else {
00081         end_pos[1] = get_full_index(ng->ind[0][current_pos[0]+1],
00082                                     ng->ptr_table[0],
00083                                     ng->ptr_table_size[0],
00084                                     current_pos[0]+1)-1;
00085       }
00086 
00087       while (current_table > 0) {
00088 
00089         if (current_table == n) {
00090 
00091           if (current_pos[n] <= end_pos[n]){
00092 
00093             ng_count = return_count(ng->four_byte_counts,
00094                                     ng->count_table[n],
00095                                     ng->count[n],
00096                                     ng->count4[n],
00097                                     current_pos[n]);
00098 
00099             switch (ng->discounting_method) {
00100             case GOOD_TURING:
00101               if (ng_count <= ng->disc_range[n]) {
00102                 discounted_ngcount = ng->gt_disc_ratio[n][ng_count] * ng_count;
00103               }
00104               else {
00105                 discounted_ngcount = ng_count;
00106               }
00107               break;
00108             case LINEAR:
00109               discounted_ngcount = ng->lin_disc_ratio[n] * ng_count;
00110               break;
00111             case ABSOLUTE:
00112               discounted_ngcount = ng_count - ng->abs_disc_const[n];
00113               break;
00114             case WITTEN_BELL:
00115               if (n==1) {
00116 
00117                 discounted_ngcount = ((double) 
00118                                       return_count(ng->four_byte_counts,
00119                                                    ng->count_table[0],
00120                                                    ng->marg_counts,
00121                                                    ng->marg_counts4,
00122                                                    current_pos[0]) * ng_count)
00123                   / (return_count(ng->four_byte_counts,
00124                                   ng->count_table[0],
00125                                   ng->marg_counts,
00126                                   ng->marg_counts4,
00127                                   current_pos[0]) + 
00128                      num_of_types(0,current_pos[0],ng));
00129               }
00130               else {
00131                 
00132                 discounted_ngcount = ((double) 
00133                                       return_count(ng->four_byte_counts,
00134                                                    ng->count_table[n-1],
00135                                                    ng->count[n-1],
00136                                                    ng->count4[n-1],
00137                                                    current_pos[n-1])* ng_count)
00138                   / (return_count(ng->four_byte_counts,
00139                                   ng->count_table[n-1],
00140                                   ng->count[n-1],
00141                                   ng->count4[n-1],
00142                                   current_pos[n-1]) + 
00143                      num_of_types(n-1,current_pos[n-1],ng));
00144 
00145               }   
00146               
00147               break;
00148             }
00149 
00150             if (n==1) {
00151               cond_prob = ((double) discounted_ngcount / 
00152                            return_count(ng->four_byte_counts,
00153                                         ng->count_table[0],
00154                                         ng->marg_counts,
00155                                         ng->marg_counts4,
00156                                         current_pos[0]));
00157             }
00158             else {
00159               cond_prob = ((double) discounted_ngcount /  
00160                            return_count(ng->four_byte_counts,
00161                                         ng->count_table[n-1],
00162                                         ng->count[n-1],
00163                                         ng->count4[n-1],
00164                                         current_pos[n-1]));
00165 
00166             }
00167             sum_cond_prob += cond_prob;
00168 
00169             /* Fill up sought ngram array with correct stuff */
00170 
00171             for (i=1;i<=n;i++) {
00172               sought_ngram[i-1] = ng->word_id[i][current_pos[i]];
00173             }
00174 
00175 
00176             bo_ng_prob(n-1,sought_ngram,ng,verbosity,&bo_prob,&bo_case);
00177             sum_bo_prob += bo_prob;
00178             current_pos[n]++;                   
00179                                                
00180           }
00181           else {
00182 
00183             discount_mass = 1.0 - sum_cond_prob;
00184 
00185             if (discount_mass < 1e-10) {
00186               discount_mass = 0.0;
00187               pc_message(verbosity,2,"Warning : Back off weight for %s(id %d) ",
00188                          ng->vocab[current_pos[0]],current_pos[0]);
00189               for (i=1;i<=n-1;i++) {
00190                 pc_message(verbosity,2,"%s(id %d) ",ng->vocab[ng->word_id[i][current_pos[i]]],ng->word_id[i][current_pos[i]]);
00191               }
00192               pc_message(verbosity,2,
00193                          "is set to 0 (sum of probs = %f).\nMay cause problems with zero probabilities.\n",sum_cond_prob);
00194             }
00195 
00196             leftout_bo_prob = 1.0 - sum_bo_prob;
00197             if (leftout_bo_prob < 1e-10) {
00198               leftout_bo_prob = 0.0;
00199             }
00200 
00201             if (leftout_bo_prob > 0.0) {
00202               alpha = discount_mass / leftout_bo_prob;
00203             }
00204             else {
00205               alpha = 0.0;      /* Will not be used. Should happen very rarely. */
00206               pc_message(verbosity,2,"Warning : Back off weight for %s(id %d) ",
00207                          ng->vocab[current_pos[0]],current_pos[0]);
00208               for (i=1;i<=n-1;i++) {
00209                 pc_message(verbosity,2,"%s(id %d) ",ng->vocab[ng->word_id[i][current_pos[i]]],ng->word_id[i][current_pos[i]]);
00210               }
00211               pc_message(verbosity,2,
00212                          "is set to 0.\nMay cause problems with zero probabilities.\n");
00213 
00214             }
00215           
00216             if (ng->four_byte_alphas) {
00217               ng->bo_weight4[n-1][current_pos[n-1]] = alpha;
00218             }
00219             else {
00220               ng->bo_weight[n-1][current_pos[n-1]] = 
00221                 short_alpha(alpha,
00222                             ng->alpha_array,
00223                             &(ng->size_of_alpha_array),
00224                             65535 - ng->out_of_range_alphas,
00225                             ng->min_alpha,
00226                             ng->max_alpha);
00227             }
00228           
00229             /* Finished current (n-1)-gram */
00230 
00231             sum_cond_prob = 0.0;
00232             sum_bo_prob = 0.0;
00233             current_table--;
00234             if (current_table > 0) {
00235               current_pos[current_table]++;
00236             }
00237           }
00238         }
00239         else {
00240 
00241           if (current_pos[current_table] <= end_pos[current_table]) {
00242             current_table++;
00243             if (current_pos[current_table-1] == ng->num_kgrams[current_table-1]-1) {
00244               end_pos[current_table] = ng->num_kgrams[current_table]-1;
00245             }
00246             else {
00247               end_pos[current_table] = get_full_index(ng->ind[current_table-1][current_pos[current_table-1]+1],ng->ptr_table[current_table-1],ng->ptr_table_size[current_table-1],current_pos[current_table-1]+1)-1;
00248             }
00249           }
00250           else {
00251             current_table--;
00252             if (current_table > 0) {
00253               current_pos[current_table]++;
00254             }
00255           }
00256         }
00257       }
00258     }
00259 
00260     /* Now deal with zeroton unigrams */
00261 
00262     else {
00263       if (n == 1) {
00264         if (ng->four_byte_alphas) {
00265           ng->bo_weight4[0][current_pos[0]] = 1.0;
00266         }
00267         else {
00268           ng->bo_weight[0][current_pos[0]] = 
00269             short_alpha(1.0,
00270                         ng->alpha_array,
00271                         &(ng->size_of_alpha_array),
00272                         65535 - ng->out_of_range_alphas,
00273                         ng->min_alpha,
00274                         ng->max_alpha);
00275         }
00276       }
00277     }
00278   }
00279   free(end_pos);
00280   free(current_pos);
00281   free(sought_ngram);
00282   
00283 }
00284 
00285 
00286 
00287 
00288 
00289 
00290 

Generated on Tue Dec 21 13:54:44 2004 by doxygen1.2.18