00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef _SIMPLEKLDOCMODEL_HPP
00012 #define _SIMPLEKLDOCMODEL_HPP
00013
00014 #include "DocumentRep.hpp"
00015 #include "Index.hpp"
00016 #include "UnigramLM.hpp"
00017
00018
/// Enumerations, parameter bundles and default values for the simple
/// KL-divergence retrieval model.
///
/// The defaults below are declared `static` at namespace scope in a header,
/// so each translation unit that includes this file gets its own private,
/// modifiable copy of every default.
namespace SimpleKLParameter {

  /// Identifiers for the document-model smoothing methods.
  enum SmoothMethod {
    JELINEKMERCER    = 0,
    DIRICHLETPRIOR   = 1,
    ABSOLUTEDISCOUNT = 2,
    TWOSTAGE         = 3
  };

  /// Identifiers for the smoothing strategy (interpolation vs. back-off).
  enum SmoothStrategy {
    INTERPOLATE = 0,
    BACKOFF     = 1
  };

  /// Identifiers for the query-model update methods.
  enum QueryUpdateMethod {
    MIXTURE     = 0,
    DIVMIN      = 1,
    MARKOVCHAIN = 2,
    RM1         = 3,
    RM2         = 4
  };

  /// Identifiers for the adjusted-score variants; note the values start at 1.
  enum adjustedScoreMethods {
    QUERYLIKELIHOOD = 1,
    CROSSENTROPY    = 2,
    NEGATIVEKLD     = 3
  };

  /// Bundle of document-smoothing settings.
  /// NOTE(review): ADDelta / JMLambda / DirPrior presumably parameterize the
  /// absolute-discount, Jelinek-Mercer and Dirichlet-prior methods
  /// respectively (judging by their names) -- confirm against the code that
  /// fills in this struct.
  struct DocSmoothParam {
    SmoothMethod   smthMethod;    ///< selected smoothing method
    SmoothStrategy smthStrategy;  ///< selected smoothing strategy
    double         ADDelta;
    double         JMLambda;
    double         DirPrior;
  };

  // Default values for document smoothing.
  static SmoothMethod   defaultSmoothMethod   = DIRICHLETPRIOR;
  static SmoothStrategy defaultSmoothStrategy = INTERPOLATE;
  static double         defaultADDelta        = 0.7;
  static double         defaultJMLambda       = 0.5;
  static double         defaultDirPrior       = 1000;

  /// Bundle of query-model settings (score adjustment and feedback).
  /// NOTE(review): the `fb*` fields presumably configure feedback-based query
  /// updating -- confirm against the code that consumes this struct.
  struct QueryModelParam {
    adjustedScoreMethods adjScoreMethod;  ///< selected adjusted-score variant
    double               qryNoise;

    QueryUpdateMethod    fbMethod;        ///< selected query update method
    double               fbCoeff;
    int                  fbTermCount;
    double               fbPrTh;
    double               fbPrSumTh;
    double               fbMixtureNoise;
    int                  emIterations;
  };

  // Default values for the query model.
  static QueryUpdateMethod defaultFBMethod     = MIXTURE;
  static double            defaultFBCoeff      = 0.5;
  static int               defaultFBTermCount  = 50;
  static double            defaultFBPrTh       = 0.001;
  static double            defaultFBPrSumTh    = 1;
  static double            defaultFBMixNoise   = 0.5;
  static int               defaultEMIterations = 50;
  static double            defaultQryNoise     = 0;

}  // namespace SimpleKLParameter
00080
00081
00083
00096 class SimpleKLDocModel : public DocumentRep {
00097 public:
00098 SimpleKLDocModel(DOCID_T docID, const UnigramLM &collectLM, int dl = 1,
00099 const double *prMass = NULL,
00100 SimpleKLParameter::SmoothStrategy strat = SimpleKLParameter::INTERPOLATE) :
00101 DocumentRep(docID, dl),
00102 refLM(collectLM), docPrMass(prMass), strategy(strat) {
00103 };
00104
00105 ~SimpleKLDocModel() {};
00106
00108 virtual double termWeight(TERMID_T termID, const DocInfo *info) const {
00109 double sp = seenProb(info->termCount(), termID);
00110 double usp = unseenCoeff();
00111 double ref = refLM.prob(termID);
00112 double score = sp/(usp*ref);
00113
00114
00115
00116
00117 return score;
00118 }
00119
00121 virtual double scoreConstant() const {
00122 return unseenCoeff();
00123 }
00124
00126 virtual double unseenCoeff() const =0;
00128
00129
00130 protected:
00131 const UnigramLM &refLM;
00132 const double *docPrMass;
00133 SimpleKLParameter::SmoothStrategy strategy;
00134 };
00135
00136
00137
00139
00147 class JelinekMercerDocModel : public SimpleKLDocModel {
00148 public:
00149 JelinekMercerDocModel(DOCID_T docID,
00150 int dl,
00151 const UnigramLM &collectLM,
00152 const double *docProbMass,
00153 double collectLMWeight,
00154 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00155 SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00156 lambda(collectLMWeight) {
00157 };
00158
00159 virtual ~JelinekMercerDocModel() {};
00160
00161 virtual double unseenCoeff() const {
00162 if (strategy == SimpleKLParameter::INTERPOLATE) {
00163 return lambda;
00164 } else if (strategy==SimpleKLParameter::BACKOFF) {
00165 return lambda/(1-docPrMass[id]);
00166 } else {
00167 throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00168 }
00169 }
00170 virtual double seenProb(double termFreq, TERMID_T termID) const {
00171 if (strategy == SimpleKLParameter::INTERPOLATE) {
00172 return ((1-lambda)*termFreq/(double)docLength +
00173 lambda*refLM.prob(termID));
00174 } else if (strategy == SimpleKLParameter::BACKOFF) {
00175 return ((1-lambda)*termFreq/(double)docLength);
00176 } else {
00177 throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00178 }
00179 }
00180 private:
00181 double lambda;
00182 };
00183
00185
00190 class DirichletPriorDocModel : public SimpleKLDocModel {
00191 public:
00192 DirichletPriorDocModel(DOCID_T docID,
00193 int dl,
00194 const UnigramLM &collectLM,
00195 const double *docProbMass,
00196 double priorWordCount,
00197 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00198 SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00199 mu(priorWordCount) {
00200 };
00201
00202 virtual ~DirichletPriorDocModel() {};
00203
00204 virtual double unseenCoeff() const {
00205
00206 if (strategy == SimpleKLParameter::INTERPOLATE) {
00207 return mu/(mu+docLength);
00208 } else if (strategy==SimpleKLParameter::BACKOFF) {
00209 return (mu/((mu+docLength)*(1-docPrMass[id])));
00210 } else {
00211 throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00212 }
00213 }
00214
00215 virtual double seenProb(double termFreq, TERMID_T termID) const {
00216 if (strategy == SimpleKLParameter::INTERPOLATE) {
00217 return (termFreq+mu*refLM.prob(termID))/
00218 (double)(docLength+mu);
00219 } else if (strategy == SimpleKLParameter::BACKOFF) {
00220 return (termFreq/(double)(docLength+mu));
00221 } else {
00222 throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00223 }
00224 }
00225 private:
00226 double mu;
00227 };
00228
00230
00237 class AbsoluteDiscountDocModel : public SimpleKLDocModel {
00238 public:
00239 AbsoluteDiscountDocModel(DOCID_T docID,
00240 int dl,
00241 const UnigramLM &collectLM,
00242 const double *docProbMass,
00243 COUNT_T *uniqueTermCount,
00244 double discount,
00245 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00246 SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00247 uniqDocLen(uniqueTermCount),
00248 delta(discount) {
00249 };
00250
00251 virtual ~AbsoluteDiscountDocModel() {};
00252
00253 virtual double unseenCoeff() const {
00254
00255 if (strategy == SimpleKLParameter::INTERPOLATE) {
00256 return (delta*uniqDocLen[id]/(double)docLength);
00257 } else if (strategy==SimpleKLParameter::BACKOFF) {
00258 return (delta*uniqDocLen[id]/(docLength*(1-docPrMass[id])));
00259 } else {
00260 throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00261 }
00262 }
00263 virtual double seenProb(double termFreq, TERMID_T termID) const {
00264 if (strategy == SimpleKLParameter::INTERPOLATE) {
00265 return ((termFreq-delta)/(double)docLength+
00266 delta*uniqDocLen[id]*refLM.prob(termID)/(double)docLength);
00267 } else if (strategy == SimpleKLParameter::BACKOFF) {
00268 return ((termFreq-delta)/(double)docLength);
00269 } else {
00270 throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00271 }
00272 }
00273 private:
00274 double *collectPr;
00275 COUNT_T *uniqDocLen;
00276 double delta;
00277 };
00278
00279
00281
00282
00283 class TwoStageDocModel : public SimpleKLDocModel {
00284 public:
00285 TwoStageDocModel(DOCID_T docID,
00286 int dl,
00287 const UnigramLM &collectLM,
00288 const double *docProbMass,
00289 double firstStageMu,
00290 double secondStageLambda,
00291 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00292 SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00293 mu(firstStageMu),
00294 lambda(secondStageLambda) {
00295 };
00296
00297 virtual ~TwoStageDocModel() {};
00298
00299 virtual double unseenCoeff() const {
00300
00301 if (strategy == SimpleKLParameter::INTERPOLATE) {
00302 return (mu+lambda*docLength)/(mu+docLength);
00303 } else if (strategy == SimpleKLParameter::BACKOFF) {
00304 return ((mu+lambda*docLength)/((mu+docLength)*(1-docPrMass[id])));
00305 } else {
00306 throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00307 }
00308 }
00309
00310 virtual double seenProb(double termFreq, TERMID_T termID) const {
00311 if (strategy == SimpleKLParameter::INTERPOLATE) {
00312 return ((1-lambda)*(termFreq+mu*refLM.prob(termID))/
00313 (double)(docLength+mu) + lambda*refLM.prob(termID));
00314 } else if (strategy == SimpleKLParameter::BACKOFF) {
00315 return (termFreq*(1-lambda)/(double)(docLength+mu));
00316 } else {
00317 throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00318 }
00319 }
00320 private:
00321 double mu;
00322 double lambda;
00323 };
00324
00325 #endif
00326
00327
00328
00329
00330