#include "evallm.h"

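/* Print a human-readable summary of a binary-format n-gram language
   model to stderr: model order, vocabulary size and type, any context
   cues, back-off weight storage width, per-order n-gram counts, and
   the parameters of the discounting method used. */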
void display_stats(ng_t *ng) {

  int i;
  int j;

  fprintf(stderr,"This is a %d-gram language model, based on a vocabulary of %d words,\n",ng->n,ng->vocab_size);
  fprintf(stderr," which begins \"%s\", \"%s\", \"%s\"...\n",ng->vocab[1],ng->vocab[2],ng->vocab[3]);

  if (ng->no_of_ccs == 1) {
    fprintf(stderr,"There is 1 context cue.\n");
  }
  else {
    fprintf(stderr,"There are %d context cues.\n",ng->no_of_ccs);
  }

  /* List the cues themselves, but only if there are fewer than ten. */
  if (ng->no_of_ccs > 0 && ng->no_of_ccs < 10) {
    if (ng->no_of_ccs == 1) {
      fprintf(stderr,"This is : ");
    }
    else {
      fprintf(stderr,"These are : ");
    }
    for (i=ng->first_id;i<=ng->vocab_size;i++) {
      if (ng->context_cue[i]) {
        fprintf(stderr,"\"%s\" ",ng->vocab[i]);
      }
    }
    fprintf(stderr,"\n");
  }

  if (ng->vocab_type == CLOSED_VOCAB) {
    fprintf(stderr,"This is a CLOSED-vocabulary model\n");
    fprintf(stderr," (OOVs were eliminated from the training data and are forbidden in the test data)\n");
  }
  else if (ng->vocab_type == OPEN_VOCAB_1) {
    fprintf(stderr,"This is an OPEN-vocabulary model (type 1)\n");
    fprintf(stderr," (OOVs were mapped to UNK, which is treated as any other vocabulary word)\n");
  }
  else if (ng->vocab_type == OPEN_VOCAB_2) {
    fprintf(stderr,"This is an OPEN-vocabulary model (type 2)\n");
    fprintf(stderr," (%.2f of the unigram discount mass was allocated to OOVs)\n",ng->oov_fraction);
  }

  if (ng->four_byte_alphas) {
    fprintf(stderr,"The back-off weights are stored in four bytes.\n");
  }
  else {
    fprintf(stderr,"The back-off weights are stored in two bytes.\n");
  }

  for (i=2;i<=ng->n;i++) {
    fprintf(stderr,"The %d-gram component was based on %d %d-grams.\n",i,ng->num_kgrams[i-1],i);
  }

  switch (ng->discounting_method) {
  case GOOD_TURING:
    fprintf(stderr,"Good-Turing discounting was applied.\n");
    for (i=1;i<=ng->n;i++) {
      fprintf(stderr,"%d-gram frequency of frequency : ",i);
      for (j=1;j<ng->fof_size[i-1];j++) {
        fprintf(stderr,"%d ",ng->freq_of_freq[i-1][j]);
      }
      fprintf(stderr,"\n");
    }
    for (i=1;i<=ng->n;i++) {
      fprintf(stderr,"%d-gram discounting ratios : ",i);
      for (j=1;j<=ng->disc_range[i-1];j++) {
        fprintf(stderr,"%.2f ",ng->gt_disc_ratio[i-1][j]);
      }
      fprintf(stderr,"\n");
    }
    break;
  case LINEAR:
    fprintf(stderr,"Linear discounting was applied.\n");
    for (i=1;i<=ng->n;i++) {
      fprintf(stderr,"%d-gram discounting ratio : %g\n",i,ng->lin_disc_ratio[i-1]);
    }
    break;
  case ABSOLUTE:
    fprintf(stderr,"Absolute discounting was applied.\n");
    for (i=1;i<=ng->n;i++) {
      fprintf(stderr,"%d-gram discounting constant : %g\n",i,ng->abs_disc_const[i-1]);
    }
    break;
  case WITTEN_BELL:
    fprintf(stderr,"Witten-Bell discounting was applied.\n");
    break;
  }

}

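/* Print the corresponding summary for an ARPA-format language model:
   model order, vocabulary size and type, any context cues, and
   per-order n-gram counts. */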
void display_arpa_stats(arpa_lm_t *arpa_ng) {

  int i;

  fprintf(stderr,"This is a %d-gram language model, based on a vocabulary of %d words,\n",arpa_ng->n,arpa_ng->vocab_size);
  fprintf(stderr," which begins \"%s\", \"%s\", \"%s\"...\n",
          arpa_ng->vocab[1],arpa_ng->vocab[2],arpa_ng->vocab[3]);

  if (arpa_ng->no_of_ccs == 1) {
    fprintf(stderr,"There is 1 context cue.\n");
  }
  else {
    fprintf(stderr,"There are %d context cues.\n",arpa_ng->no_of_ccs);
  }

  /* List the cues themselves, but only if there are fewer than ten. */
  if (arpa_ng->no_of_ccs > 0 && arpa_ng->no_of_ccs < 10) {
    if (arpa_ng->no_of_ccs == 1) {
      fprintf(stderr,"This is : ");
    }
    else {
      fprintf(stderr,"These are : ");
    }
    for (i=arpa_ng->first_id;i<=arpa_ng->vocab_size;i++) {
      if (arpa_ng->context_cue[i]) {
        fprintf(stderr,"\"%s\" ",arpa_ng->vocab[i]);
      }
    }
    fprintf(stderr,"\n");
  }

  if (arpa_ng->vocab_type == CLOSED_VOCAB) {
    fprintf(stderr,"This is a CLOSED-vocabulary model\n");
    fprintf(stderr," (OOVs were eliminated from the training data and are forbidden in the test data)\n");
  }
  else if (arpa_ng->vocab_type == OPEN_VOCAB_1) {
    fprintf(stderr,"This is an OPEN-vocabulary model (type 1)\n");
    fprintf(stderr," (OOVs were mapped to UNK, which is treated as any other vocabulary word)\n");
  }
  else if (arpa_ng->vocab_type == OPEN_VOCAB_2) {
    fprintf(stderr,"This is an OPEN-vocabulary model (type 2)\n");
  }

  for (i=2;i<=arpa_ng->n;i++) {
    fprintf(stderr,"The %d-gram component was based on %d %d-grams.\n",i,
            arpa_ng->num_kgrams[i-1],i);
  }

}