00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include "evallm.h"
00021 #include <stdlib.h>
00022
00026 void validate(ng_t *ng,
00027 arpa_lm_t *arpa_ng,
00028 char **words,
00029 flag backoff_from_unk_inc,
00030 flag backoff_from_unk_exc,
00031 flag backoff_from_ccs_inc,
00032 flag backoff_from_ccs_exc,
00033 flag arpa_lm,
00034 char *fb_list_filename) {
00035
00036
00037 int *context;
00038 id__t *short_context;
00039 int dummy1;
00040 int dummy2;
00041 int i;
00042 fb_info *fb_list;
00043 double prob_so_far;
00044 flag found_unk_wrongly;
00045 int n;
00046
00047 if (arpa_lm) {
00048 n = arpa_ng->n;
00049 }
00050 else {
00051 n = ng->n;
00052 }
00053
00054 if (arpa_lm) {
00055 fb_list = gen_fb_list(arpa_ng->vocab_ht,
00056 arpa_ng->vocab_size,
00057 arpa_ng->vocab,
00058 arpa_ng->context_cue,
00059 backoff_from_unk_inc,
00060 backoff_from_unk_exc,
00061 backoff_from_ccs_inc,
00062 backoff_from_ccs_exc,
00063 fb_list_filename);
00064 }
00065 else {
00066 fb_list = gen_fb_list(ng->vocab_ht,
00067 ng->vocab_size,
00068 ng->vocab,
00069 ng->context_cue,
00070 backoff_from_unk_inc,
00071 backoff_from_unk_exc,
00072 backoff_from_ccs_inc,
00073 backoff_from_ccs_exc,
00074 fb_list_filename);
00075 }
00076
00077 context = (int *) rr_malloc(sizeof(int)*(n-1));
00078 short_context = (id__t *) rr_malloc(sizeof(id__t)*(n-1));
00079
00080 found_unk_wrongly = 0;
00081
00082 for (i=0;i<=n-2;i++) {
00083 if (arpa_lm) {
00084 if (sih_lookup(arpa_ng->vocab_ht,words[i],&context[i]) == 0) {
00085 if (arpa_ng->vocab_type == CLOSED_VOCAB) {
00086 fprintf(stderr,"Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",words[i]);
00087 found_unk_wrongly = 1;
00088 }
00089 else {
00090 fprintf(stderr,"Warning : %s is an unknown word.\n",words[i]);
00091 }
00092 }
00093 if (context[i] > 65535) {
00094 quit(-1,"Error : returned value from sih_lookup is too high.\n");
00095 }
00096 else {
00097 short_context[i] = context[i];
00098 }
00099 }
00100 else {
00101 if (sih_lookup(ng->vocab_ht,words[i],&context[i]) == 0) {
00102 if (ng->vocab_type == CLOSED_VOCAB) {
00103 fprintf(stderr,"Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",words[i]);
00104 found_unk_wrongly = 1;
00105 }
00106 else {
00107 fprintf(stderr,"Warning : %s is an unknown word.\n",words[i]);
00108 }
00109 }
00110 if (context[i] > 65535) {
00111 quit(-1,"Error : returned value from sih_lookup is too high.\n");
00112 }
00113 else {
00114 short_context[i] = context[i];
00115 }
00116 }
00117 }
00118
00119
00120
00121
00122
00123 if (!found_unk_wrongly) {
00124
00125 prob_so_far = 0.0;
00126
00127 if (arpa_lm) {
00128 for (i=arpa_ng->first_id;i<=arpa_ng->vocab_size;i++) {
00129 prob_so_far += calc_prob_of(i,
00130 short_context,
00131 n-1,
00132 ng,
00133 arpa_ng,
00134 fb_list,
00135 &dummy1,
00136 &dummy2,
00137 arpa_lm);
00138 }
00139
00140 }
00141 else {
00142 for (i=ng->first_id;i<=ng->vocab_size;i++) {
00143 prob_so_far += calc_prob_of(i,
00144 short_context,
00145 n-1,
00146 ng,
00147 arpa_ng,
00148 fb_list,
00149 &dummy1,
00150 &dummy2,
00151 arpa_lm);
00152 }
00153 }
00154
00155 printf("Sum of P( * | ");
00156 for (i=0;i<=n-2;i++) {
00157 printf("%s ",words[i]);
00158 }
00159 printf(") = %f\n",prob_so_far);
00160
00161 }
00162
00163 free(context);
00164 free(fb_list);
00165
00166 }