00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00025 #include "pc_libs/pc_general.h"
00026 #include "rr_libs/general.h"
00027 #include "ngram.h"
00028 #include "idngram2lm.h"
00029 #include <math.h>
00030 #include <stdlib.h>
00031 #include <stdio.h>
00032
00033 void bo_ng_prob(int context_length,
00034 id__t *sought_ngram,
00035 ng_t *ng,
00036 int verbosity,
00037 double *p_prob,
00038 int *bo_case) {
00039
00040 flag found;
00041 flag found_ngram;
00042 flag found_context;
00043 flag still_found;
00044
00045 int length_exists;
00046 int ng_begin;
00047 int ng_end;
00048 int ng_middle;
00049 int ncount;
00050 int contcount;
00051 int *ng_index;
00052
00053 int i;
00054
00055 int temp_case;
00056
00057 double alpha;
00058 double prob;
00059 double discounted_ncount;
00060
00061
00062
00063 ncount = 0;
00064 contcount = 0;
00065 alpha = 0.0;
00066 discounted_ncount = 0.0;
00067
00068 ng_index = (int *) rr_malloc((context_length+1)*sizeof(int));
00069
00070 if (context_length == 0) {
00071
00072 *p_prob = ng->uni_probs[sought_ngram[0]];
00073 if (*p_prob<= 0.0 || *p_prob >= 1.0) {
00074 pc_message(verbosity,1,"Warning : P( %d ) == %g\n",
00075 sought_ngram[0], *p_prob);
00076 }
00077 }
00078
00079 else {
00080
00081 found_ngram = 0;
00082 found_context = 0;
00083 ncount = -1;
00084
00085
00086
00087 length_exists = 0;
00088 still_found = 1;
00089
00090 while (still_found && (length_exists < (context_length+1))) {
00091
00092 found = 0;
00093
00094
00095
00096 if (length_exists == 0) {
00097 if (return_count(ng->four_byte_counts,
00098 ng->count_table[0],
00099 ng->marg_counts,
00100 ng->marg_counts4,
00101 sought_ngram[0]) > 0) {
00102 found = 1;
00103 ng_index[0] = sought_ngram[0];
00104 }
00105 }
00106 else {
00107
00108
00109
00110 ng_begin =
00111 get_full_index(ng->ind[length_exists-1][ng_index[length_exists-1]],
00112 ng->ptr_table[length_exists-1],
00113 ng->ptr_table_size[length_exists-1],
00114 ng_index[length_exists-1]);
00115
00116 if (length_exists == 1) {
00117 if (ng_index[0] < ng->vocab_size) {
00118 ng_end =
00119 get_full_index(ng->ind[length_exists-1][ng_index[length_exists-1]+1],
00120 ng->ptr_table[length_exists-1],
00121 ng->ptr_table_size[length_exists-1],
00122 ng_index[length_exists-1]+1)-1;
00123 }
00124 else {
00125 ng_end = ng->num_kgrams[1];
00126 }
00127 }
00128 else {
00129 if (ng_index[length_exists-1] < ng->num_kgrams[length_exists-1]-1) {
00130 ng_end =
00131 get_full_index(ng->ind[length_exists-1][ng_index[length_exists-1]+1],
00132 ng->ptr_table[length_exists-1],
00133 ng->ptr_table_size[length_exists-1],
00134 ng_index[length_exists-1]+1)-1;
00135 }
00136 else {
00137 ng_end = ng->num_kgrams[length_exists];
00138 }
00139 }
00140
00141 while (ng_begin <= ng_end) {
00142 ng_middle = ng_begin + ((ng_end - ng_begin) >> 1);
00143 if (sought_ngram[length_exists] <
00144 ng->word_id[length_exists][ng_middle]) {
00145 ng_end = ng_middle - 1;
00146 }
00147 else {
00148 if (sought_ngram[length_exists] >
00149 ng->word_id[length_exists][ng_middle]) {
00150 ng_begin = ng_middle + 1;
00151 }
00152 else {
00153 found = 1;
00154 ng_index[length_exists] = ng_middle;
00155 break;
00156 }
00157 }
00158 }
00159
00160 }
00161
00162 if (found) {
00163 length_exists++;
00164 }
00165 else {
00166 still_found = 0;
00167 }
00168
00169 }
00170
00171 if (length_exists == (context_length+1)) {
00172 found_ngram = 1;
00173
00174 ncount = return_count(ng->four_byte_counts,
00175 ng->count_table[context_length],
00176 ng->count[context_length],
00177 ng->count4[context_length],
00178 ng_index[context_length]);
00179 }
00180
00181 if (length_exists >= context_length) {
00182 found_context = 1;
00183 if (context_length == 1) {
00184 contcount = return_count(ng->four_byte_counts,
00185 ng->count_table[0],
00186 ng->marg_counts,
00187 ng->marg_counts4,
00188 ng_index[0]);
00189 }
00190 else {
00191 contcount = return_count(ng->four_byte_counts,
00192 ng->count_table[context_length-1],
00193 ng->count[context_length-1],
00194 ng->count4[context_length-1],
00195 ng_index[context_length-1]);
00196 }
00197 }
00198 if (found_context) {
00199 if (ng->four_byte_alphas) {
00200 alpha = ng->bo_weight4[context_length-1][ng_index[context_length-1]];
00201 }
00202 else {
00203 alpha = double_alpha(ng->bo_weight[context_length-1][ng_index[context_length-1]],
00204 ng->alpha_array,
00205 ng->size_of_alpha_array,
00206 65535 - ng->out_of_range_alphas,
00207 ng->min_alpha,
00208 ng->max_alpha);
00209 }
00210 }
00211
00212
00213
00214 if (found_ngram) {
00215 switch (ng->discounting_method) {
00216 case GOOD_TURING:
00217 if (ncount <= ng->disc_range[context_length]) {
00218 discounted_ncount =
00219 ng->gt_disc_ratio[context_length][ncount]*ncount;
00220 }
00221 else {
00222 discounted_ncount = ncount;
00223 }
00224 break;
00225 case LINEAR:
00226 discounted_ncount = ng->lin_disc_ratio[context_length]*ncount;
00227 break;
00228 case ABSOLUTE:
00229 discounted_ncount = ncount - ng->abs_disc_const[context_length];
00230 break;
00231 case WITTEN_BELL:
00232 discounted_ncount = ( (double) ((double) contcount*ncount))/
00233 ( (double) contcount+ num_of_types(context_length-1,
00234 ng_index[context_length-1],ng));
00235 break;
00236 }
00237
00238 prob = discounted_ncount / (double) contcount;
00239 temp_case = 0;
00240
00241 if (prob <= 0.0 || prob >= 1.0) {
00242 pc_message(verbosity,1,"Warning : P(%d) = %g (%g / %d)\n",
00243 sought_ngram[0],prob, discounted_ncount,contcount);
00244 pc_message(verbosity,1,"ncount = %d\n",ncount);
00245 }
00246
00247 }
00248
00249 else {
00250
00251 bo_ng_prob(context_length-1,
00252 &sought_ngram[1],
00253 ng,
00254 verbosity,
00255 &prob,
00256 bo_case);
00257
00258 temp_case = 2;
00259
00260 if (found_context) {
00261 prob*=alpha;
00262 temp_case=1;
00263 }
00264
00265
00266
00267 }
00268
00269
00270 *p_prob = prob;
00271 *bo_case += temp_case * (1 << (2*(context_length-1)));
00272
00273
00274
00275 }
00276
00277 if (*p_prob > 1.0) {
00278 fprintf(stderr,"Error : P( %d | ",sought_ngram[context_length]);
00279 for (i=0;i<=context_length-1;i++) {
00280 fprintf(stderr,"%d ",sought_ngram[i]);
00281 }
00282 fprintf(stderr,") = %g\n",*p_prob);
00283 exit(1);
00284 }
00285 free(ng_index);
00286
00287 }