00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00027 #include "ngram.h"
00028 #include "idngram2lm.h"
00029 #include "pc_libs/pc_general.h"
00030 #include <stdlib.h>
00031
00032 void compute_back_off(ng_t *ng,int n, int verbosity) {
00033
00034 int *current_pos;
00035 int *end_pos;
00036 id__t *sought_ngram;
00037 int current_table;
00038 int ng_count;
00039 int i;
00040 double sum_cond_prob;
00041 double sum_bo_prob;
00042 double discounted_ngcount;
00043 double cond_prob;
00044 double bo_prob;
00045 double discount_mass;
00046 double leftout_bo_prob;
00047 double alpha;
00048
00049 int bo_case;
00050
00051 sum_cond_prob = 0.0;
00052 sum_bo_prob = 0.0;
00053
00054
00055
00056 discounted_ngcount = 0.0;
00057
00058 current_pos = (int *)rr_calloc(n+1,sizeof(int));
00059 sought_ngram = (id__t *) rr_calloc(n+1,sizeof(id__t));
00060 end_pos = (int *)rr_calloc(n+1,sizeof(int));
00061
00062
00063
00064
00065 for (current_pos[0]=ng->first_id;
00066 current_pos[0]<=ng->vocab_size;
00067 current_pos[0]++) {
00068
00069 if (return_count(ng->four_byte_counts,
00070 ng->count_table[0],
00071 ng->marg_counts,
00072 ng->marg_counts4,
00073 current_pos[0]) > 0) {
00074
00075 current_table = 1;
00076
00077 if (current_pos[0] == ng->vocab_size) {
00078 end_pos[1] = ng->num_kgrams[1]-1;
00079 }
00080 else {
00081 end_pos[1] = get_full_index(ng->ind[0][current_pos[0]+1],
00082 ng->ptr_table[0],
00083 ng->ptr_table_size[0],
00084 current_pos[0]+1)-1;
00085 }
00086
00087 while (current_table > 0) {
00088
00089 if (current_table == n) {
00090
00091 if (current_pos[n] <= end_pos[n]){
00092
00093 ng_count = return_count(ng->four_byte_counts,
00094 ng->count_table[n],
00095 ng->count[n],
00096 ng->count4[n],
00097 current_pos[n]);
00098
00099 switch (ng->discounting_method) {
00100 case GOOD_TURING:
00101 if (ng_count <= ng->disc_range[n]) {
00102 discounted_ngcount = ng->gt_disc_ratio[n][ng_count] * ng_count;
00103 }
00104 else {
00105 discounted_ngcount = ng_count;
00106 }
00107 break;
00108 case LINEAR:
00109 discounted_ngcount = ng->lin_disc_ratio[n] * ng_count;
00110 break;
00111 case ABSOLUTE:
00112 discounted_ngcount = ng_count - ng->abs_disc_const[n];
00113 break;
00114 case WITTEN_BELL:
00115 if (n==1) {
00116
00117 discounted_ngcount = ((double)
00118 return_count(ng->four_byte_counts,
00119 ng->count_table[0],
00120 ng->marg_counts,
00121 ng->marg_counts4,
00122 current_pos[0]) * ng_count)
00123 / (return_count(ng->four_byte_counts,
00124 ng->count_table[0],
00125 ng->marg_counts,
00126 ng->marg_counts4,
00127 current_pos[0]) +
00128 num_of_types(0,current_pos[0],ng));
00129 }
00130 else {
00131
00132 discounted_ngcount = ((double)
00133 return_count(ng->four_byte_counts,
00134 ng->count_table[n-1],
00135 ng->count[n-1],
00136 ng->count4[n-1],
00137 current_pos[n-1])* ng_count)
00138 / (return_count(ng->four_byte_counts,
00139 ng->count_table[n-1],
00140 ng->count[n-1],
00141 ng->count4[n-1],
00142 current_pos[n-1]) +
00143 num_of_types(n-1,current_pos[n-1],ng));
00144
00145 }
00146
00147 break;
00148 }
00149
00150 if (n==1) {
00151 cond_prob = ((double) discounted_ngcount /
00152 return_count(ng->four_byte_counts,
00153 ng->count_table[0],
00154 ng->marg_counts,
00155 ng->marg_counts4,
00156 current_pos[0]));
00157 }
00158 else {
00159 cond_prob = ((double) discounted_ngcount /
00160 return_count(ng->four_byte_counts,
00161 ng->count_table[n-1],
00162 ng->count[n-1],
00163 ng->count4[n-1],
00164 current_pos[n-1]));
00165
00166 }
00167 sum_cond_prob += cond_prob;
00168
00169
00170
00171 for (i=1;i<=n;i++) {
00172 sought_ngram[i-1] = ng->word_id[i][current_pos[i]];
00173 }
00174
00175
00176 bo_ng_prob(n-1,sought_ngram,ng,verbosity,&bo_prob,&bo_case);
00177 sum_bo_prob += bo_prob;
00178 current_pos[n]++;
00179
00180 }
00181 else {
00182
00183 discount_mass = 1.0 - sum_cond_prob;
00184
00185 if (discount_mass < 1e-10) {
00186 discount_mass = 0.0;
00187 pc_message(verbosity,2,"Warning : Back off weight for %s(id %d) ",
00188 ng->vocab[current_pos[0]],current_pos[0]);
00189 for (i=1;i<=n-1;i++) {
00190 pc_message(verbosity,2,"%s(id %d) ",ng->vocab[ng->word_id[i][current_pos[i]]],ng->word_id[i][current_pos[i]]);
00191 }
00192 pc_message(verbosity,2,
00193 "is set to 0 (sum of probs = %f).\nMay cause problems with zero probabilities.\n",sum_cond_prob);
00194 }
00195
00196 leftout_bo_prob = 1.0 - sum_bo_prob;
00197 if (leftout_bo_prob < 1e-10) {
00198 leftout_bo_prob = 0.0;
00199 }
00200
00201 if (leftout_bo_prob > 0.0) {
00202 alpha = discount_mass / leftout_bo_prob;
00203 }
00204 else {
00205 alpha = 0.0;
00206 pc_message(verbosity,2,"Warning : Back off weight for %s(id %d) ",
00207 ng->vocab[current_pos[0]],current_pos[0]);
00208 for (i=1;i<=n-1;i++) {
00209 pc_message(verbosity,2,"%s(id %d) ",ng->vocab[ng->word_id[i][current_pos[i]]],ng->word_id[i][current_pos[i]]);
00210 }
00211 pc_message(verbosity,2,
00212 "is set to 0.\nMay cause problems with zero probabilities.\n");
00213
00214 }
00215
00216 if (ng->four_byte_alphas) {
00217 ng->bo_weight4[n-1][current_pos[n-1]] = alpha;
00218 }
00219 else {
00220 ng->bo_weight[n-1][current_pos[n-1]] =
00221 short_alpha(alpha,
00222 ng->alpha_array,
00223 &(ng->size_of_alpha_array),
00224 65535 - ng->out_of_range_alphas,
00225 ng->min_alpha,
00226 ng->max_alpha);
00227 }
00228
00229
00230
00231 sum_cond_prob = 0.0;
00232 sum_bo_prob = 0.0;
00233 current_table--;
00234 if (current_table > 0) {
00235 current_pos[current_table]++;
00236 }
00237 }
00238 }
00239 else {
00240
00241 if (current_pos[current_table] <= end_pos[current_table]) {
00242 current_table++;
00243 if (current_pos[current_table-1] == ng->num_kgrams[current_table-1]-1) {
00244 end_pos[current_table] = ng->num_kgrams[current_table]-1;
00245 }
00246 else {
00247 end_pos[current_table] = get_full_index(ng->ind[current_table-1][current_pos[current_table-1]+1],ng->ptr_table[current_table-1],ng->ptr_table_size[current_table-1],current_pos[current_table-1]+1)-1;
00248 }
00249 }
00250 else {
00251 current_table--;
00252 if (current_table > 0) {
00253 current_pos[current_table]++;
00254 }
00255 }
00256 }
00257 }
00258 }
00259
00260
00261
00262 else {
00263 if (n == 1) {
00264 if (ng->four_byte_alphas) {
00265 ng->bo_weight4[0][current_pos[0]] = 1.0;
00266 }
00267 else {
00268 ng->bo_weight[0][current_pos[0]] =
00269 short_alpha(1.0,
00270 ng->alpha_array,
00271 &(ng->size_of_alpha_array),
00272 65535 - ng->out_of_range_alphas,
00273 ng->min_alpha,
00274 ng->max_alpha);
00275 }
00276 }
00277 }
00278 }
00279 free(end_pos);
00280 free(current_pos);
00281 free(sought_ngram);
00282
00283 }
00284
00285
00286
00287
00288
00289
00290