00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _SIMPLEKLDOCMODEL_HPP
00014 #define _SIMPLEKLDOCMODEL_HPP
00015
00016 #include "DocumentRep.hpp"
00017 #include "Index.hpp"
00018 #include "UnigramLM.hpp"
00019
00020
/// Enumerations, parameter bundles and default values used to configure
/// the simple KL-divergence document/query models.
namespace SimpleKLParameter {

  /// Identifies a document smoothing method.
  enum SmoothMethod {
    JELINEKMERCER = 0,
    DIRICHLETPRIOR = 1,
    ABSOLUTEDISCOUNT = 2
  };

  /// Identifies how the smoothed estimate is combined with the reference
  /// model; the doc-model classes in this header branch on this value.
  enum SmoothStrategy {
    INTERPOLATE = 0,
    BACKOFF = 1
  };

  /// Identifies a query-model update method.
  enum QueryUpdateMethod {
    MIXTURE = 0,
    DIVMIN = 1,
    MARKOVCHAIN = 2
  };

  /// Bundle of document-smoothing settings.
  /// NOTE(review): nothing in this header reads these fields; the per-field
  /// meanings below are inferred from the names -- confirm against callers.
  struct DocSmoothParam {
    /// selected smoothing method
    SmoothMethod smthMethod;
    /// selected smoothing strategy
    SmoothStrategy smthStrategy;
    /// presumably the discount for ABSOLUTEDISCOUNT
    double ADDelta;
    /// presumably the collection-model weight for JELINEKMERCER
    double JMLambda;
    /// presumably the prior word count for DIRICHLETPRIOR
    double DirPrior;
  };

  // Default document-smoothing values.  These are non-const statics in a
  // header, so every translation unit gets its own modifiable copy.
  static SmoothMethod defaultSmoothMethod = DIRICHLETPRIOR;
  static SmoothStrategy defaultSmoothStrategy = INTERPOLATE;
  static double defaultADDelta = 0.7;
  static double defaultJMLambda = 0.5;
  static double defaultDirPrior = 1000;

  /// Bundle of query-model update settings.
  /// NOTE(review): nothing in this header reads these fields; "fb" presumably
  /// abbreviates "feedback" -- confirm field semantics against callers.
  struct QueryModelParam {
    /// selected query update method
    QueryUpdateMethod fbMethod;
    /// coefficient (semantics defined by the consumer)
    double fbCoeff;
    /// term count (semantics defined by the consumer)
    int fbTermCount;
    /// probability threshold (semantics defined by the consumer)
    double fbPrTh;
    /// probability-sum threshold (semantics defined by the consumer)
    double fbPrSumTh;
    /// mixture noise (semantics defined by the consumer)
    double fbMixtureNoise;
    /// presumably an iteration limit for an EM procedure
    int emIterations;
  };

  // Default query-model values (same per-translation-unit caveat as above).
  static QueryUpdateMethod defaultFBMethod = MIXTURE;
  static double defaultFBCoeff = 0.5;
  static int defaultFBTermCount = 50;
  static double defaultFBPrTh = 0.001;
  static double defaultFBPrSumTh = 1;
  static double defaultFBMixNoise = 0.5;
  static int defaultEMIterations = 50;

}
00074
00075
00077
00090 class SimpleKLDocModel : public DocumentRep {
00091 public:
00092 SimpleKLDocModel(int docID, UnigramLM &collectLM) : DocumentRep(docID), refLM(collectLM) {}
00093 ~SimpleKLDocModel() {};
00094
00096 virtual double termWeight(int termID, DocInfo *info) {
00097 return (seenProb(info->termCount(), termID)/(unseenCoeff()* refLM.prob(termID)));
00098 }
00099
00101 virtual double scoreConstant() {
00102 return unseenCoeff();
00103 }
00104
00106 virtual double unseenCoeff()=0;
00108
00109 protected:
00110 UnigramLM &refLM;
00111 };
00112
00113
00114
00116
00124 class JelinekMercerDocModel : public SimpleKLDocModel {
00125 public:
00126 JelinekMercerDocModel(int docID,
00127 Index *referenceIndex,
00128 UnigramLM &collectLM,
00129 double *docProbMass,
00130 double collectLMWeight,
00131 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00132 SimpleKLDocModel(docID, collectLM),
00133 refIndex(referenceIndex),
00134 docPrMass(docProbMass),
00135 lambda(collectLMWeight),
00136 strategy(smthStrategy) {
00137 };
00138
00139 virtual ~JelinekMercerDocModel() {};
00140
00141 virtual double unseenCoeff() {
00142 if (strategy == SimpleKLParameter::INTERPOLATE) {
00143 return lambda;
00144 } else if (strategy==SimpleKLParameter::BACKOFF) {
00145 return lambda/(1-docPrMass[id]);
00146 } else {
00147 throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00148 }
00149 }
00150 virtual double seenProb(double termFreq, int termID) {
00151 if (strategy == SimpleKLParameter::INTERPOLATE) {
00152 return ((1-lambda)*termFreq/(double)refIndex->docLength(id)+
00153 lambda*refLM.prob(termID));
00154 } else if (strategy == SimpleKLParameter::BACKOFF) {
00155 return ((1-lambda)*termFreq/(double)refIndex->docLength(id));
00156 } else {
00157 throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00158 }
00159 }
00160 private:
00161 Index *refIndex;
00162 double *docPrMass;
00163 double lambda;
00164 SimpleKLParameter::SmoothStrategy strategy;
00165 };
00166
00168
00173 class DirichletPriorDocModel : public SimpleKLDocModel {
00174 public:
00175 DirichletPriorDocModel(int docID,
00176 Index *referenceIndex,
00177 UnigramLM &collectLM,
00178 double *docProbMass,
00179 double priorWordCount,
00180 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00181 SimpleKLDocModel(docID, collectLM),
00182 refIndex(referenceIndex),
00183 docPrMass(docProbMass),
00184 mu(priorWordCount),
00185 strategy(smthStrategy) {
00186 };
00187
00188 virtual ~DirichletPriorDocModel() {};
00189
00190 virtual double unseenCoeff() {
00191
00192 if (strategy == SimpleKLParameter::INTERPOLATE) {
00193 return mu/(mu+refIndex->docLength(id));
00194 } else if (strategy==SimpleKLParameter::BACKOFF) {
00195 return (mu/((mu+refIndex->docLength(id))*
00196 (1-docPrMass[id])));
00197 } else {
00198 throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00199 }
00200 }
00201
00202 virtual double seenProb(double termFreq, int termID) {
00203 if (strategy == SimpleKLParameter::INTERPOLATE) {
00204 return (termFreq+mu*refLM.prob(termID))/
00205 (double)(refIndex->docLength(id)+mu);
00206 } else if (strategy == SimpleKLParameter::BACKOFF) {
00207 return (termFreq/
00208 (double)(refIndex->docLength(id)+mu));
00209 } else {
00210 throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00211 }
00212 }
00213 private:
00214 Index *refIndex;
00215 double *docPrMass;
00216 double mu;
00217 SimpleKLParameter::SmoothStrategy strategy;
00218 };
00219
00221
00228 class AbsoluteDiscountDocModel : public SimpleKLDocModel {
00229 public:
00230 AbsoluteDiscountDocModel(int docID,
00231 Index *referenceIndex,
00232 UnigramLM &collectLM,
00233 double *docProbMass,
00234 int *uniqueTermCount,
00235 double discount,
00236 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00237 SimpleKLDocModel(docID, collectLM),
00238 refIndex(referenceIndex),
00239 docPrMass(docProbMass),
00240 uniqDocLen(uniqueTermCount),
00241 delta(discount),
00242 strategy(smthStrategy) {
00243 };
00244
00245 virtual ~AbsoluteDiscountDocModel() {};
00246
00247 virtual double unseenCoeff() {
00248
00249 if (strategy == SimpleKLParameter::INTERPOLATE) {
00250 return (delta*uniqDocLen[id]/(double)refIndex->docLength(id));
00251 } else if (strategy==SimpleKLParameter::BACKOFF) {
00252 return (delta*uniqDocLen[id]/
00253 (refIndex->docLength(id)*(1-docPrMass[id])));
00254 } else {
00255 throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00256 }
00257 }
00258 virtual double seenProb(double termFreq, int termID) {
00259 if (strategy == SimpleKLParameter::INTERPOLATE) {
00260 return ((termFreq-delta)/(double)refIndex->docLength(id)+
00261 delta*uniqDocLen[id]*refLM.prob(termID)/
00262 (double)refIndex->docLength(id));
00263 } else if (strategy == SimpleKLParameter::BACKOFF) {
00264 return ((termFreq-delta)/(double)refIndex->docLength(id));
00265 } else {
00266 throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00267 }
00268 }
00269 private:
00270 Index *refIndex;
00271 double *collectPr;
00272 double *docPrMass;
00273 int *uniqDocLen;
00274 double delta;
00275 SimpleKLParameter::SmoothStrategy strategy;
00276 };
00277
00278
00279 #endif
00280
00281
00282
00283
00284