Main Page   Compound List   File List   Compound Members   File Members  

perplexity.c

Go to the documentation of this file.
00001 
00002 /*=====================================================================
00003                 =======   COPYRIGHT NOTICE   =======
00004 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00005 Ronald Rosenfeld and Philip Clarkson.
00006 
00007 All rights reserved.
00008 
00009 This software is made available for research purposes only.  It may be
00010 redistributed freely for this purpose, in full or in part, provided
00011 that this entire copyright notice is included on any copies of this
00012 software and applications and derivations thereof.
00013 
00014 This software is provided on an "as is" basis, without warranty of any
00015 kind, either expressed or implied, as to any matter including, but not
00016 limited to warranty of fitness of purpose, or merchantability, or
00017 results obtained from use of this software.
00018 ======================================================================*/
00019 
00020 #include "evallm.h"
00021 #include <math.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 
00028 void compute_perplexity(ng_t *ng,
00029                         arpa_lm_t *arpa_ng,
00030                         char *text_stream_filename,
00031                         char *probs_stream_filename,
00032                         char *annotation_filename,
00033                         char *oov_filename,
00034                         char *fb_list_filename,
00035                         flag backoff_from_unk_inc,
00036                         flag backoff_from_unk_exc,
00037                         flag backoff_from_ccs_inc,
00038                         flag backoff_from_ccs_exc,
00039                         flag arpa_lm,
00040                         flag include_unks,
00041                         double log_base) {
00042 
00043   fb_info *fb_list;
00044   FILE *temp_fp;
00045   FILE *text_stream_fp;
00046   FILE *probs_stream_fp;
00047   FILE *annotation_fp;
00048   FILE *oov_fp;
00049   flag out_probs;
00050   flag annotate;
00051   flag out_oovs;
00052   flag found_unk_wrongly;
00053   double prob;
00054   double sum_log_prob;
00055   int total_words;
00056   int excluded_unks;
00057   int excluded_ccs;
00058   char current_word[1000];  /* Hope that's big enough */
00059   char **prev_words;
00060   int current_id;
00061   id__t short_current_id;
00062   id__t *context;
00063   int context_length;
00064   int i;
00065   int bo_case;
00066   int actual_context_length;
00067   int *ngrams_hit;
00068   int n;
00069 
00070   /* Initialise file pointers to prevent warnings from the compiler. */
00071 
00072   probs_stream_fp = NULL;
00073   annotation_fp = NULL;
00074   oov_fp = NULL;
00075 
00076   short_current_id = 0;
00077 
00078   found_unk_wrongly = 0;
00079 
00080   annotate = 0;
00081 
00082   bo_case = 0;
00083 
00084   if (arpa_lm) {
00085     n = arpa_ng->n;
00086     fb_list = gen_fb_list(arpa_ng->vocab_ht,
00087                           arpa_ng->vocab_size,
00088                           arpa_ng->vocab,
00089                           arpa_ng->context_cue,
00090                           backoff_from_unk_inc,
00091                           backoff_from_unk_exc,
00092                           backoff_from_ccs_inc,
00093                           backoff_from_ccs_exc,
00094                           fb_list_filename);
00095   }
00096   else {
00097     n = ng->n;
00098     fb_list = gen_fb_list(ng->vocab_ht,
00099                           ng->vocab_size,
00100                           ng->vocab,
00101                           ng->context_cue,
00102                           backoff_from_unk_inc,
00103                           backoff_from_unk_exc,
00104                           backoff_from_ccs_inc,
00105                           backoff_from_ccs_exc,
00106                           fb_list_filename);
00107   }
00108   
00109   ngrams_hit = (int *) rr_calloc(n,sizeof(int));
00110   prev_words = (char **) rr_malloc(sizeof(char *)*n);
00111   for (i=0;i<=n-1;i++) {
00112     prev_words[i] = (char *) rr_malloc(sizeof(char)*1000);
00113   }
00114   
00115   /* Check that text_stream_filename and probs_stream_filename (if
00116      specified) are valid. Note that the checks employed by the
00117      standard rr_fopen tools are not suitable here, since we don't
00118      want the program to terminate if the paths are not found. */
00119 
00120   if (!strcmp(text_stream_filename,"")) {
00121     printf("Error : Must specify a text file. Use the -text switch.\n");
00122     return;
00123   }
00124 
00125   if (!rr_fexists(text_stream_filename) && strcmp(text_stream_filename,"-")) {
00126     printf("Error : Can't open file %s for reading.\n",text_stream_filename);
00127     return;
00128   }
00129 
00130   out_probs = strcmp(probs_stream_filename,"");
00131   annotate = strcmp(annotation_filename,"");
00132   out_oovs = strcmp(oov_filename,"");
00133 
00134   printf("Computing perplexity of the language model with respect\n");
00135   printf("   to the text %s\n",text_stream_filename);
00136   if (out_probs) {
00137     printf("Probability stream will be written to file %s\n",
00138             probs_stream_filename);
00139   }
00140   if (annotate) {
00141     printf("Annotation will be written to file %s\n",
00142             annotation_filename);
00143   }
00144   if (out_oovs) {
00145     printf("Out of vocabulary words will be written to file %s\n",
00146             oov_filename);
00147   }
00148 
00149   if (backoff_from_unk_inc) {
00150     printf("Will force inclusive back-off from OOVs.\n");
00151   }
00152 
00153   if (backoff_from_unk_exc) {
00154     printf("Will force exclusive back-off from OOVs.\n");
00155   }
00156 
00157   if (backoff_from_ccs_inc) {
00158     printf("Will force inclusive back-off from context cues.\n");
00159   }
00160 
00161   if (backoff_from_ccs_exc) {
00162     printf("Will force exclusive back-off from context cues.\n");
00163   }
00164 
00165   if (strcmp(fb_list_filename,"")) {
00166     printf("Will force back-off according to the contents of %s\n",
00167             fb_list_filename);
00168   }
00169 
00170   if (include_unks) {
00171     printf("Perplexity calculation will include OOVs.\n");
00172   }
00173 
00174   /* Check for existance of files, as rr functions will quit, which isn't
00175      what we want */
00176 
00177   if (out_probs && strcmp(probs_stream_filename,"-")) {
00178     if ((temp_fp = fopen(probs_stream_filename,"w")) == NULL) {
00179       printf("Error : Can't open file %s for writing.\n",probs_stream_filename);
00180       return;
00181     }
00182     fclose(temp_fp);
00183   }
00184 
00185   if (annotate && strcmp(annotation_filename,"-")) {
00186     if ((temp_fp = fopen(annotation_filename,"w")) == NULL) {
00187       printf("Error : Can't open file %s for writing.\n",annotation_filename);
00188       return;
00189     }
00190     fclose(temp_fp);
00191   }
00192     
00193   if (out_oovs && strcmp(oov_filename,"-")) {
00194     if ((temp_fp = fopen(oov_filename,"w")) == NULL) {
00195       printf("Error : Can't open file %s for writing.\n",oov_filename);
00196       return;
00197     }
00198     fclose(temp_fp);
00199   }
00200 
00201   text_stream_fp = rr_iopen(text_stream_filename);
00202   if (out_probs) {
00203     probs_stream_fp = rr_oopen(probs_stream_filename);
00204   }
00205 
00206   if (annotate) {
00207     annotation_fp = rr_oopen(annotation_filename);
00208   }
00209 
00210   if (out_oovs) {
00211     oov_fp = rr_oopen(oov_filename);
00212   }
00213 
00214   context = (id__t *) rr_malloc(sizeof(id__t)*(n-1));
00215 
00216   sum_log_prob = 0.0;
00217   total_words = 0;
00218   excluded_unks = 0;
00219   excluded_ccs = 0;
00220 
00221   while (!rr_feof(text_stream_fp)) {
00222 
00223     if (total_words > 0) {
00224       if (total_words < n) {
00225         strcpy(prev_words[total_words-1],current_word);
00226       }
00227       else {
00228         for (i=0;i<=n-3;i++) {
00229           strcpy(prev_words[i],prev_words[i+1]);
00230         }
00231         if (n>1) {
00232           strcpy(prev_words[n-2],current_word);
00233         }
00234       }
00235     }
00236 
00237     if (total_words < (n-1)) {
00238       context_length = total_words;
00239     }
00240     else {
00241       context_length = n-1;
00242     }
00243 
00244     /* Fill context with right stuff */
00245 
00246     if (total_words > (n-1)) {
00247 
00248       for (i=0;i<=context_length-2;i++) {
00249         context[i] = context[i+1];
00250       }
00251       
00252     }
00253 
00254     if (context_length != 0){
00255       context[context_length-1] = short_current_id;
00256     }
00257 
00258     if (fscanf(text_stream_fp,"%s",current_word) != 1) {
00259       if (!rr_feof(text_stream_fp)) {
00260         printf("Error reading text file.\n");
00261         return;
00262       }
00263     }
00264 
00265     if (!rr_feof(text_stream_fp)) {
00266 
00267       if (arpa_lm) {
00268         sih_lookup(arpa_ng->vocab_ht,current_word,&current_id);
00269         if (arpa_ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
00270           found_unk_wrongly = 1;
00271           printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word);
00272         }
00273         if (current_id > arpa_ng->vocab_size) {
00274           quit(-1,"Error : returned value from sih_lookup (%d) is too high.\n",
00275                context[i]); 
00276         }
00277         else {
00278           short_current_id = current_id;
00279         }
00280       }
00281       else {
00282         sih_lookup(ng->vocab_ht,current_word,&current_id);
00283         if (ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
00284           found_unk_wrongly = 1;
00285           printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word);
00286         }
00287         if (current_id > ng->vocab_size) {
00288           quit(-1,"Error : returned value from sih_lookup (%d) is too high.\n",context[i]); 
00289         }
00290         else {
00291           short_current_id = current_id;
00292         }
00293       }
00294     
00295       if (!found_unk_wrongly) {
00296 
00297         if (current_id == 0 && out_oovs) {
00298           fprintf(oov_fp,"%s\n",current_word);
00299         }
00300 
00301         if ((arpa_lm && (!(arpa_ng->context_cue[current_id])))
00302             || ((!arpa_lm) && (!(ng->context_cue[current_id])))) {
00303 
00304           if (include_unks || current_id != 0) {
00305 
00306             prob = calc_prob_of(short_current_id,
00307                                 context,
00308                                 context_length,
00309                                 ng,
00310                                 arpa_ng,
00311                                 fb_list,
00312                                 &bo_case,
00313                                 &actual_context_length,
00314                                 arpa_lm);
00315 
00316 
00317             if (prob<= 0.0 || prob > 1.0) {
00318               fprintf(stderr,"Warning : ");
00319               if (short_current_id == 0){
00320                 fprintf(stderr,"P( <UNK> | ");
00321               }
00322               else {
00323                 fprintf(stderr,"P( %s | ",current_word);
00324               }
00325           
00326               for (i=0;i<=actual_context_length-1;i++) {
00327                 if (context[i+context_length-actual_context_length] == 0) {
00328                   fprintf(stderr,"<UNK> ");
00329                 }
00330                 else {
00331                   fprintf(stderr,"%s ",prev_words[i]);
00332                 }
00333               }
00334               fprintf(stderr,") = %g logprob = %g \n ",prob,log(prob)/log(log_base));
00335               fprintf(stderr,"bo_case == 0x%dx, actual_context_length == %d\n",
00336                       bo_case, actual_context_length);
00337             }
00338           
00339             if (annotate) {
00340               if (short_current_id == 0){
00341                 fprintf(annotation_fp,"P( <UNK> | ");
00342               }
00343               else {
00344                 fprintf(annotation_fp,"P( %s | ",current_word);
00345               }
00346           
00347               for (i=0;i<=actual_context_length-1;i++) {
00348                 if (context[i+context_length-actual_context_length] == 0) {
00349                   fprintf(annotation_fp,"<UNK> ");
00350                 }
00351                 else {
00352                   if (arpa_lm) {
00353                     fprintf(annotation_fp,"%s ",arpa_ng->vocab[context[i+context_length-actual_context_length]]);
00354                   }
00355                   else {
00356                     fprintf(annotation_fp,"%s ",ng->vocab[context[i+context_length-actual_context_length]]);
00357                   }
00358                 }
00359               }
00360               fprintf(annotation_fp,") = %g logprob = %f bo_case = ",prob,log(prob)/log(log_base));
00361               decode_bo_case(bo_case,actual_context_length,annotation_fp);
00362             }
00363 
00364             /* Calculate level to which we backed off */
00365 
00366 
00367   
00368             for (i=actual_context_length-1;i>=0;i--) {
00369               int four_raise_i = 1<<(2*i);  /* PWP */
00370  
00371               /*
00372                * PWP: This was "if ((bo_case / (int) pow(3,i)) == 0)"
00373                * but was getting a divide-by-zero error on an Alpha
00374                * (it isn't clear to me why it should ever have done so)
00375                * Anyway, it is much faster to do in base-4.
00376                */
00377 
00378               if ((bo_case == 0) || ((bo_case / four_raise_i) == 0)) {
00379                 ngrams_hit[i+1]++;
00380                 i = -2;
00381               }
00382               else {
00383                 bo_case -= ((bo_case / four_raise_i) * four_raise_i);
00384               }
00385             }
00386   
00387             if (i != -3) { 
00388               ngrams_hit[0]++;
00389             }
00390 
00391             if (out_probs) {
00392               fprintf(probs_stream_fp,"%g\n",prob);
00393             }
00394       
00395             sum_log_prob += log10(prob);
00396                           
00397           }
00398 
00399           if (current_id == 0 && !include_unks) {
00400             excluded_unks++;
00401           }
00402 
00403 
00404         }       
00405         else {
00406           if (((!arpa_lm) && ng->context_cue[current_id]) || 
00407               (arpa_lm && arpa_ng->context_cue[current_id])) {
00408             excluded_ccs++;
00409           }
00410         }
00411         total_words++;
00412       }
00413     }
00414 
00415   }
00416   if (!found_unk_wrongly) {
00417 
00418      /*  pow(x,y) = e**(y  ln(x)) */
00419      printf("Perplexity = %.2f, Entropy = %.2f bits\n", 
00420             exp(-sum_log_prob/(total_words-excluded_ccs-excluded_unks) * 
00421                 log(10.0)),
00422             (-sum_log_prob/(total_words-excluded_ccs-excluded_unks) * 
00423              log(10.0) / log(2.0)));
00424 
00425     
00426     printf("Computation based on %d words.\n",
00427            total_words-excluded_ccs-excluded_unks);
00428     for(i=n;i>=1;i--) {
00429       printf("Number of %d-grams hit = %d  (%.2f%%)\n",i,ngrams_hit[i-1],
00430              (float) 100*ngrams_hit[i-1]/(total_words-excluded_ccs-excluded_unks) );
00431     }
00432     printf("%d OOVs (%.2f%%) and %d context cues were removed from the calculation.\n",
00433            excluded_unks,
00434            (float) 100*excluded_unks/(total_words-excluded_ccs),excluded_ccs);
00435     
00436   }
00437 
00438   rr_iclose(text_stream_fp);
00439 
00440   if (out_probs) {
00441     rr_oclose(probs_stream_fp);
00442   }
00443   if (annotate) {
00444     rr_oclose(annotation_fp);
00445   }
00446   if (out_oovs) {
00447     rr_oclose(oov_fp);
00448   }
00449 
00450   free (fb_list);
00451   free (context);
00452   free (ngrams_hit);
00453 }

Generated on Tue Dec 21 13:54:45 2004 by doxygen 1.2.18