#include "evallm.h"

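/* Print a human-readable summary of a binary-format n-gram language
   model to stderr: model order, vocabulary size and type, any context
   cues, back-off weight storage width, per-order n-gram counts, and
   the parameters of the discounting method used. */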
void display_stats(ng_t *ng) {

  int i;
  int j;

  fprintf(stderr,"This is a %d-gram language model, based on a vocabulary of %d words,\n",ng->n,ng->vocab_size);
  fprintf(stderr," which begins \"%s\", \"%s\", \"%s\"...\n",ng->vocab[1],ng->vocab[2],ng->vocab[3]);

  if (ng->no_of_ccs == 1) {
    fprintf(stderr,"There is 1 context cue.\n");
  }
  else {
    fprintf(stderr,"There are %d context cues.\n",ng->no_of_ccs);
  }

  /* List the cues themselves, but only if there are fewer than ten. */
  if (ng->no_of_ccs > 0 && ng->no_of_ccs < 10) {
    if (ng->no_of_ccs == 1) {
      fprintf(stderr,"This is : ");
    }
    else {
      fprintf(stderr,"These are : ");
    }
    for (i=ng->first_id;i<=ng->vocab_size;i++) {
      if (ng->context_cue[i]) {
        fprintf(stderr,"\"%s\" ",ng->vocab[i]);
      }
    }
    fprintf(stderr,"\n");
  }

  if (ng->vocab_type == CLOSED_VOCAB) {
    fprintf(stderr,"This is a CLOSED-vocabulary model\n");
    fprintf(stderr," (OOVs were eliminated from the training data and are forbidden in the test data)\n");
  }
  else if (ng->vocab_type == OPEN_VOCAB_1) {
    fprintf(stderr,"This is an OPEN-vocabulary model (type 1)\n");
    fprintf(stderr," (OOVs were mapped to UNK, which is treated as any other vocabulary word)\n");
  }
  else if (ng->vocab_type == OPEN_VOCAB_2) {
    fprintf(stderr,"This is an OPEN-vocabulary model (type 2)\n");
    fprintf(stderr," (%.2f of the unigram discount mass was allocated to OOVs)\n",ng->oov_fraction);
  }

  if (ng->four_byte_alphas) {
    fprintf(stderr,"The back-off weights are stored in four bytes.\n");
  }
  else {
    fprintf(stderr,"The back-off weights are stored in two bytes.\n");
  }

  for (i=2;i<=ng->n;i++) {
    fprintf(stderr,"The %d-gram component was based on %d %d-grams.\n",i,ng->num_kgrams[i-1],i);
  }

  switch (ng->discounting_method) {
  case GOOD_TURING:
    fprintf(stderr,"Good-Turing discounting was applied.\n");
    for (i=1;i<=ng->n;i++) {
      fprintf(stderr,"%d-gram frequency of frequency : ",i);
      for (j=1;j<ng->fof_size[i-1];j++) {
        fprintf(stderr,"%d ",ng->freq_of_freq[i-1][j]);
      }
      fprintf(stderr,"\n");
    }
    for (i=1;i<=ng->n;i++) {
      fprintf(stderr,"%d-gram discounting ratios : ",i);
      for (j=1;j<=ng->disc_range[i-1];j++) {
        fprintf(stderr,"%.2f ",ng->gt_disc_ratio[i-1][j]);
      }
      fprintf(stderr,"\n");
    }
    break;
  case LINEAR:
    fprintf(stderr,"Linear discounting was applied.\n");
    for (i=1;i<=ng->n;i++) {
      fprintf(stderr,"%d-gram discounting ratio : %g\n",i,ng->lin_disc_ratio[i-1]);
    }
    break;
  case ABSOLUTE:
    fprintf(stderr,"Absolute discounting was applied.\n");
    for (i=1;i<=ng->n;i++) {
      fprintf(stderr,"%d-gram discounting constant : %g\n",i,ng->abs_disc_const[i-1]);
    }
    break;
  case WITTEN_BELL:
    fprintf(stderr,"Witten-Bell discounting was applied.\n");
    break;
  }

}

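/* Print the corresponding summary for an ARPA-format language model:
   model order, vocabulary size and type, any context cues, and
   per-order n-gram counts. */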
void display_arpa_stats(arpa_lm_t *arpa_ng) {

  int i;

  fprintf(stderr,"This is a %d-gram language model, based on a vocabulary of %d words,\n",arpa_ng->n,arpa_ng->vocab_size);
  fprintf(stderr," which begins \"%s\", \"%s\", \"%s\"...\n",
          arpa_ng->vocab[1],arpa_ng->vocab[2],arpa_ng->vocab[3]);

  if (arpa_ng->no_of_ccs == 1) {
    fprintf(stderr,"There is 1 context cue.\n");
  }
  else {
    fprintf(stderr,"There are %d context cues.\n",arpa_ng->no_of_ccs);
  }

  /* List the cues themselves, but only if there are fewer than ten. */
  if (arpa_ng->no_of_ccs > 0 && arpa_ng->no_of_ccs < 10) {
    if (arpa_ng->no_of_ccs == 1) {
      fprintf(stderr,"This is : ");
    }
    else {
      fprintf(stderr,"These are : ");
    }
    for (i=arpa_ng->first_id;i<=arpa_ng->vocab_size;i++) {
      if (arpa_ng->context_cue[i]) {
        fprintf(stderr,"\"%s\" ",arpa_ng->vocab[i]);
      }
    }
    fprintf(stderr,"\n");
  }

  if (arpa_ng->vocab_type == CLOSED_VOCAB) {
    fprintf(stderr,"This is a CLOSED-vocabulary model\n");
    fprintf(stderr," (OOVs were eliminated from the training data and are forbidden in the test data)\n");
  }
  else if (arpa_ng->vocab_type == OPEN_VOCAB_1) {
    fprintf(stderr,"This is an OPEN-vocabulary model (type 1)\n");
    fprintf(stderr," (OOVs were mapped to UNK, which is treated as any other vocabulary word)\n");
  }
  else if (arpa_ng->vocab_type == OPEN_VOCAB_2) {
    fprintf(stderr,"This is an OPEN-vocabulary model (type 2)\n");
  }

  for (i=2;i<=arpa_ng->n;i++) {
    fprintf(stderr,"The %d-gram component was based on %d %d-grams.\n",i,
            arpa_ng->num_kgrams[i-1],i);
  }

}