00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
#include "evallm.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
00024
00028 void compute_perplexity(ng_t *ng,
00029 arpa_lm_t *arpa_ng,
00030 char *text_stream_filename,
00031 char *probs_stream_filename,
00032 char *annotation_filename,
00033 char *oov_filename,
00034 char *fb_list_filename,
00035 flag backoff_from_unk_inc,
00036 flag backoff_from_unk_exc,
00037 flag backoff_from_ccs_inc,
00038 flag backoff_from_ccs_exc,
00039 flag arpa_lm,
00040 flag include_unks,
00041 double log_base) {
00042
00043 fb_info *fb_list;
00044 FILE *temp_fp;
00045 FILE *text_stream_fp;
00046 FILE *probs_stream_fp;
00047 FILE *annotation_fp;
00048 FILE *oov_fp;
00049 flag out_probs;
00050 flag annotate;
00051 flag out_oovs;
00052 flag found_unk_wrongly;
00053 double prob;
00054 double sum_log_prob;
00055 int total_words;
00056 int excluded_unks;
00057 int excluded_ccs;
00058 char current_word[1000];
00059 char **prev_words;
00060 int current_id;
00061 id__t short_current_id;
00062 id__t *context;
00063 int context_length;
00064 int i;
00065 int bo_case;
00066 int actual_context_length;
00067 int *ngrams_hit;
00068 int n;
00069
00070
00071
00072 probs_stream_fp = NULL;
00073 annotation_fp = NULL;
00074 oov_fp = NULL;
00075
00076 short_current_id = 0;
00077
00078 found_unk_wrongly = 0;
00079
00080 annotate = 0;
00081
00082 bo_case = 0;
00083
00084 if (arpa_lm) {
00085 n = arpa_ng->n;
00086 fb_list = gen_fb_list(arpa_ng->vocab_ht,
00087 arpa_ng->vocab_size,
00088 arpa_ng->vocab,
00089 arpa_ng->context_cue,
00090 backoff_from_unk_inc,
00091 backoff_from_unk_exc,
00092 backoff_from_ccs_inc,
00093 backoff_from_ccs_exc,
00094 fb_list_filename);
00095 }
00096 else {
00097 n = ng->n;
00098 fb_list = gen_fb_list(ng->vocab_ht,
00099 ng->vocab_size,
00100 ng->vocab,
00101 ng->context_cue,
00102 backoff_from_unk_inc,
00103 backoff_from_unk_exc,
00104 backoff_from_ccs_inc,
00105 backoff_from_ccs_exc,
00106 fb_list_filename);
00107 }
00108
00109 ngrams_hit = (int *) rr_calloc(n,sizeof(int));
00110 prev_words = (char **) rr_malloc(sizeof(char *)*n);
00111 for (i=0;i<=n-1;i++) {
00112 prev_words[i] = (char *) rr_malloc(sizeof(char)*1000);
00113 }
00114
00115
00116
00117
00118
00119
00120 if (!strcmp(text_stream_filename,"")) {
00121 printf("Error : Must specify a text file. Use the -text switch.\n");
00122 return;
00123 }
00124
00125 if (!rr_fexists(text_stream_filename) && strcmp(text_stream_filename,"-")) {
00126 printf("Error : Can't open file %s for reading.\n",text_stream_filename);
00127 return;
00128 }
00129
00130 out_probs = strcmp(probs_stream_filename,"");
00131 annotate = strcmp(annotation_filename,"");
00132 out_oovs = strcmp(oov_filename,"");
00133
00134 printf("Computing perplexity of the language model with respect\n");
00135 printf(" to the text %s\n",text_stream_filename);
00136 if (out_probs) {
00137 printf("Probability stream will be written to file %s\n",
00138 probs_stream_filename);
00139 }
00140 if (annotate) {
00141 printf("Annotation will be written to file %s\n",
00142 annotation_filename);
00143 }
00144 if (out_oovs) {
00145 printf("Out of vocabulary words will be written to file %s\n",
00146 oov_filename);
00147 }
00148
00149 if (backoff_from_unk_inc) {
00150 printf("Will force inclusive back-off from OOVs.\n");
00151 }
00152
00153 if (backoff_from_unk_exc) {
00154 printf("Will force exclusive back-off from OOVs.\n");
00155 }
00156
00157 if (backoff_from_ccs_inc) {
00158 printf("Will force inclusive back-off from context cues.\n");
00159 }
00160
00161 if (backoff_from_ccs_exc) {
00162 printf("Will force exclusive back-off from context cues.\n");
00163 }
00164
00165 if (strcmp(fb_list_filename,"")) {
00166 printf("Will force back-off according to the contents of %s\n",
00167 fb_list_filename);
00168 }
00169
00170 if (include_unks) {
00171 printf("Perplexity calculation will include OOVs.\n");
00172 }
00173
00174
00175
00176
00177 if (out_probs && strcmp(probs_stream_filename,"-")) {
00178 if ((temp_fp = fopen(probs_stream_filename,"w")) == NULL) {
00179 printf("Error : Can't open file %s for writing.\n",probs_stream_filename);
00180 return;
00181 }
00182 fclose(temp_fp);
00183 }
00184
00185 if (annotate && strcmp(annotation_filename,"-")) {
00186 if ((temp_fp = fopen(annotation_filename,"w")) == NULL) {
00187 printf("Error : Can't open file %s for writing.\n",annotation_filename);
00188 return;
00189 }
00190 fclose(temp_fp);
00191 }
00192
00193 if (out_oovs && strcmp(oov_filename,"-")) {
00194 if ((temp_fp = fopen(oov_filename,"w")) == NULL) {
00195 printf("Error : Can't open file %s for writing.\n",oov_filename);
00196 return;
00197 }
00198 fclose(temp_fp);
00199 }
00200
00201 text_stream_fp = rr_iopen(text_stream_filename);
00202 if (out_probs) {
00203 probs_stream_fp = rr_oopen(probs_stream_filename);
00204 }
00205
00206 if (annotate) {
00207 annotation_fp = rr_oopen(annotation_filename);
00208 }
00209
00210 if (out_oovs) {
00211 oov_fp = rr_oopen(oov_filename);
00212 }
00213
00214 context = (id__t *) rr_malloc(sizeof(id__t)*(n-1));
00215
00216 sum_log_prob = 0.0;
00217 total_words = 0;
00218 excluded_unks = 0;
00219 excluded_ccs = 0;
00220
00221 while (!rr_feof(text_stream_fp)) {
00222
00223 if (total_words > 0) {
00224 if (total_words < n) {
00225 strcpy(prev_words[total_words-1],current_word);
00226 }
00227 else {
00228 for (i=0;i<=n-3;i++) {
00229 strcpy(prev_words[i],prev_words[i+1]);
00230 }
00231 if (n>1) {
00232 strcpy(prev_words[n-2],current_word);
00233 }
00234 }
00235 }
00236
00237 if (total_words < (n-1)) {
00238 context_length = total_words;
00239 }
00240 else {
00241 context_length = n-1;
00242 }
00243
00244
00245
00246 if (total_words > (n-1)) {
00247
00248 for (i=0;i<=context_length-2;i++) {
00249 context[i] = context[i+1];
00250 }
00251
00252 }
00253
00254 if (context_length != 0){
00255 context[context_length-1] = short_current_id;
00256 }
00257
00258 if (fscanf(text_stream_fp,"%s",current_word) != 1) {
00259 if (!rr_feof(text_stream_fp)) {
00260 printf("Error reading text file.\n");
00261 return;
00262 }
00263 }
00264
00265 if (!rr_feof(text_stream_fp)) {
00266
00267 if (arpa_lm) {
00268 sih_lookup(arpa_ng->vocab_ht,current_word,¤t_id);
00269 if (arpa_ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
00270 found_unk_wrongly = 1;
00271 printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word);
00272 }
00273 if (current_id > arpa_ng->vocab_size) {
00274 quit(-1,"Error : returned value from sih_lookup (%d) is too high.\n",
00275 context[i]);
00276 }
00277 else {
00278 short_current_id = current_id;
00279 }
00280 }
00281 else {
00282 sih_lookup(ng->vocab_ht,current_word,¤t_id);
00283 if (ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
00284 found_unk_wrongly = 1;
00285 printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word);
00286 }
00287 if (current_id > ng->vocab_size) {
00288 quit(-1,"Error : returned value from sih_lookup (%d) is too high.\n",context[i]);
00289 }
00290 else {
00291 short_current_id = current_id;
00292 }
00293 }
00294
00295 if (!found_unk_wrongly) {
00296
00297 if (current_id == 0 && out_oovs) {
00298 fprintf(oov_fp,"%s\n",current_word);
00299 }
00300
00301 if ((arpa_lm && (!(arpa_ng->context_cue[current_id])))
00302 || ((!arpa_lm) && (!(ng->context_cue[current_id])))) {
00303
00304 if (include_unks || current_id != 0) {
00305
00306 prob = calc_prob_of(short_current_id,
00307 context,
00308 context_length,
00309 ng,
00310 arpa_ng,
00311 fb_list,
00312 &bo_case,
00313 &actual_context_length,
00314 arpa_lm);
00315
00316
00317 if (prob<= 0.0 || prob > 1.0) {
00318 fprintf(stderr,"Warning : ");
00319 if (short_current_id == 0){
00320 fprintf(stderr,"P( <UNK> | ");
00321 }
00322 else {
00323 fprintf(stderr,"P( %s | ",current_word);
00324 }
00325
00326 for (i=0;i<=actual_context_length-1;i++) {
00327 if (context[i+context_length-actual_context_length] == 0) {
00328 fprintf(stderr,"<UNK> ");
00329 }
00330 else {
00331 fprintf(stderr,"%s ",prev_words[i]);
00332 }
00333 }
00334 fprintf(stderr,") = %g logprob = %g \n ",prob,log(prob)/log(log_base));
00335 fprintf(stderr,"bo_case == 0x%dx, actual_context_length == %d\n",
00336 bo_case, actual_context_length);
00337 }
00338
00339 if (annotate) {
00340 if (short_current_id == 0){
00341 fprintf(annotation_fp,"P( <UNK> | ");
00342 }
00343 else {
00344 fprintf(annotation_fp,"P( %s | ",current_word);
00345 }
00346
00347 for (i=0;i<=actual_context_length-1;i++) {
00348 if (context[i+context_length-actual_context_length] == 0) {
00349 fprintf(annotation_fp,"<UNK> ");
00350 }
00351 else {
00352 if (arpa_lm) {
00353 fprintf(annotation_fp,"%s ",arpa_ng->vocab[context[i+context_length-actual_context_length]]);
00354 }
00355 else {
00356 fprintf(annotation_fp,"%s ",ng->vocab[context[i+context_length-actual_context_length]]);
00357 }
00358 }
00359 }
00360 fprintf(annotation_fp,") = %g logprob = %f bo_case = ",prob,log(prob)/log(log_base));
00361 decode_bo_case(bo_case,actual_context_length,annotation_fp);
00362 }
00363
00364
00365
00366
00367
00368 for (i=actual_context_length-1;i>=0;i--) {
00369 int four_raise_i = 1<<(2*i);
00370
00371
00372
00373
00374
00375
00376
00377
00378 if ((bo_case == 0) || ((bo_case / four_raise_i) == 0)) {
00379 ngrams_hit[i+1]++;
00380 i = -2;
00381 }
00382 else {
00383 bo_case -= ((bo_case / four_raise_i) * four_raise_i);
00384 }
00385 }
00386
00387 if (i != -3) {
00388 ngrams_hit[0]++;
00389 }
00390
00391 if (out_probs) {
00392 fprintf(probs_stream_fp,"%g\n",prob);
00393 }
00394
00395 sum_log_prob += log10(prob);
00396
00397 }
00398
00399 if (current_id == 0 && !include_unks) {
00400 excluded_unks++;
00401 }
00402
00403
00404 }
00405 else {
00406 if (((!arpa_lm) && ng->context_cue[current_id]) ||
00407 (arpa_lm && arpa_ng->context_cue[current_id])) {
00408 excluded_ccs++;
00409 }
00410 }
00411 total_words++;
00412 }
00413 }
00414
00415 }
00416 if (!found_unk_wrongly) {
00417
00418
00419 printf("Perplexity = %.2f, Entropy = %.2f bits\n",
00420 exp(-sum_log_prob/(total_words-excluded_ccs-excluded_unks) *
00421 log(10.0)),
00422 (-sum_log_prob/(total_words-excluded_ccs-excluded_unks) *
00423 log(10.0) / log(2.0)));
00424
00425
00426 printf("Computation based on %d words.\n",
00427 total_words-excluded_ccs-excluded_unks);
00428 for(i=n;i>=1;i--) {
00429 printf("Number of %d-grams hit = %d (%.2f%%)\n",i,ngrams_hit[i-1],
00430 (float) 100*ngrams_hit[i-1]/(total_words-excluded_ccs-excluded_unks) );
00431 }
00432 printf("%d OOVs (%.2f%%) and %d context cues were removed from the calculation.\n",
00433 excluded_unks,
00434 (float) 100*excluded_unks/(total_words-excluded_ccs),excluded_ccs);
00435
00436 }
00437
00438 rr_iclose(text_stream_fp);
00439
00440 if (out_probs) {
00441 rr_oclose(probs_stream_fp);
00442 }
00443 if (annotate) {
00444 rr_oclose(annotation_fp);
00445 }
00446 if (out_oovs) {
00447 rr_oclose(oov_fp);
00448 }
00449
00450 free (fb_list);
00451 free (context);
00452 free (ngrams_hit);
00453 }