00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef _SIMPLEKLDOCMODEL_HPP
00012 #define _SIMPLEKLDOCMODEL_HPP
00013
00014 #include "DocumentRep.hpp"
00015 #include "Index.hpp"
00016 #include "UnigramLM.hpp"
00017
00018
/// Enumerations, parameter bundles, and default values used to configure the
/// simple KL-divergence retrieval model.
namespace SimpleKLParameter {

  /// Document language model smoothing methods.
  enum SmoothMethod {
    JELINEKMERCER    = 0,
    DIRICHLETPRIOR   = 1,
    ABSOLUTEDISCOUNT = 2,
    TWOSTAGE         = 3
  };

  /// How the smoothed estimate is combined with the reference model.
  enum SmoothStrategy {
    INTERPOLATE = 0,
    BACKOFF     = 1
  };

  /// Query model update (feedback) methods.
  enum QueryUpdateMethod {
    MIXTURE     = 0,
    DIVMIN      = 1,
    MARKOVCHAIN = 2,
    RM1         = 3,
    RM2         = 4
  };

  /// Score adjustment methods. Note that numbering starts at 1, not 0.
  enum adjustedScoreMethods {
    QUERYLIKELIHOOD = 1,
    CROSSENTROPY    = 2,
    NEGATIVEKLD     = 3
  };

  /// Bundle of document smoothing parameters.
  struct DocSmoothParam {
    /// Selected smoothing method.
    SmoothMethod smthMethod;
    /// Selected smoothing strategy.
    SmoothStrategy smthStrategy;
    /// Presumably the discount (delta) for ABSOLUTEDISCOUNT -- confirm with the consumer.
    double ADDelta;
    /// Presumably the collection model weight (lambda) for JELINEKMERCER -- confirm.
    double JMLambda;
    /// Presumably the prior word count (mu) for DIRICHLETPRIOR -- confirm.
    double DirPrior;
  };

  // Default document smoothing settings. These are non-const statics with
  // internal linkage, so every translation unit including this header gets
  // its own modifiable copy.
  static SmoothMethod   defaultSmoothMethod   = DIRICHLETPRIOR;
  static SmoothStrategy defaultSmoothStrategy = INTERPOLATE;
  static double         defaultADDelta        = 0.7;
  static double         defaultJMLambda       = 0.5;
  static double         defaultDirPrior       = 1000;

  /// Bundle of query model parameters.
  struct QueryModelParam {
    /// Selected score adjustment method.
    adjustedScoreMethods adjScoreMethod;
    /// Query noise setting (semantics defined by the consumer -- not visible here).
    double qryNoise;

    /// Selected query update (feedback) method.
    QueryUpdateMethod fbMethod;
    /// Feedback coefficient (presumably an interpolation weight -- confirm).
    double fbCoeff;
    /// Feedback term count (presumably the number of terms kept -- confirm).
    int fbTermCount;
    /// Feedback probability threshold (presumably per term -- confirm).
    double fbPrTh;
    /// Feedback probability sum threshold (presumably cumulative mass -- confirm).
    double fbPrSumTh;
    /// Feedback mixture noise setting.
    double fbMixtureNoise;
    /// EM iteration count (presumably an upper bound -- confirm).
    int emIterations;
  };

  // Default query model settings (same linkage caveat as above).
  static QueryUpdateMethod defaultFBMethod     = MIXTURE;
  static double            defaultFBCoeff      = 0.5;
  static int               defaultFBTermCount  = 50;
  static double            defaultFBPrTh       = 0.001;
  static double            defaultFBPrSumTh    = 1;
  static double            defaultFBMixNoise   = 0.5;
  static int               defaultEMIterations = 50;
  static double            defaultQryNoise     = 0;
}
00080
00081
00083
00096 class SimpleKLDocModel : public DocumentRep {
00097 public:
00098 SimpleKLDocModel(int docID, UnigramLM &collectLM) : DocumentRep(docID), refLM(collectLM) {}
00099 ~SimpleKLDocModel() {};
00100
00102 virtual double termWeight(int termID, DocInfo *info) {
00103 double sp = seenProb(info->termCount(), termID);
00104 double usp = unseenCoeff();
00105 double ref = refLM.prob(termID);
00106 double score = sp/(usp*ref);
00107
00108
00109
00110
00111 return score;
00112 }
00113
00115 virtual double scoreConstant() {
00116 return unseenCoeff();
00117 }
00118
00120 virtual double unseenCoeff()=0;
00122
00123
00124 protected:
00125 UnigramLM &refLM;
00126 };
00127
00128
00129
00131
00139 class JelinekMercerDocModel : public SimpleKLDocModel {
00140 public:
00141 JelinekMercerDocModel(int docID,
00142 Index *referenceIndex,
00143 UnigramLM &collectLM,
00144 double *docProbMass,
00145 double collectLMWeight,
00146 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00147 SimpleKLDocModel(docID, collectLM),
00148 refIndex(referenceIndex),
00149 docPrMass(docProbMass),
00150 lambda(collectLMWeight),
00151 strategy(smthStrategy) {
00152 };
00153
00154 virtual ~JelinekMercerDocModel() {};
00155
00156 virtual double unseenCoeff() {
00157 if (strategy == SimpleKLParameter::INTERPOLATE) {
00158 return lambda;
00159 } else if (strategy==SimpleKLParameter::BACKOFF) {
00160 return lambda/(1-docPrMass[id]);
00161 } else {
00162 throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00163 }
00164 }
00165 virtual double seenProb(double termFreq, int termID) {
00166 if (strategy == SimpleKLParameter::INTERPOLATE) {
00167 return ((1-lambda)*termFreq/(double)refIndex->docLength(id)+
00168 lambda*refLM.prob(termID));
00169 } else if (strategy == SimpleKLParameter::BACKOFF) {
00170 return ((1-lambda)*termFreq/(double)refIndex->docLength(id));
00171 } else {
00172 throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00173 }
00174 }
00175 private:
00176 Index *refIndex;
00177 double *docPrMass;
00178 double lambda;
00179 SimpleKLParameter::SmoothStrategy strategy;
00180 };
00181
00183
00188 class DirichletPriorDocModel : public SimpleKLDocModel {
00189 public:
00190 DirichletPriorDocModel(int docID,
00191 Index *referenceIndex,
00192 UnigramLM &collectLM,
00193 double *docProbMass,
00194 double priorWordCount,
00195 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00196 SimpleKLDocModel(docID, collectLM),
00197 refIndex(referenceIndex),
00198 docPrMass(docProbMass),
00199 mu(priorWordCount),
00200 strategy(smthStrategy) {
00201 };
00202
00203 virtual ~DirichletPriorDocModel() {};
00204
00205 virtual double unseenCoeff() {
00206
00207 if (strategy == SimpleKLParameter::INTERPOLATE) {
00208 return mu/(mu+refIndex->docLength(id));
00209 } else if (strategy==SimpleKLParameter::BACKOFF) {
00210 return (mu/((mu+refIndex->docLength(id))*
00211 (1-docPrMass[id])));
00212 } else {
00213 throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00214 }
00215 }
00216
00217 virtual double seenProb(double termFreq, int termID) {
00218 if (strategy == SimpleKLParameter::INTERPOLATE) {
00219 return (termFreq+mu*refLM.prob(termID))/
00220 (double)(refIndex->docLength(id)+mu);
00221 } else if (strategy == SimpleKLParameter::BACKOFF) {
00222 return (termFreq/
00223 (double)(refIndex->docLength(id)+mu));
00224 } else {
00225 throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00226 }
00227 }
00228 private:
00229 Index *refIndex;
00230 double *docPrMass;
00231 double mu;
00232 SimpleKLParameter::SmoothStrategy strategy;
00233 };
00234
00236
00243 class AbsoluteDiscountDocModel : public SimpleKLDocModel {
00244 public:
00245 AbsoluteDiscountDocModel(int docID,
00246 Index *referenceIndex,
00247 UnigramLM &collectLM,
00248 double *docProbMass,
00249 int *uniqueTermCount,
00250 double discount,
00251 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00252 SimpleKLDocModel(docID, collectLM),
00253 refIndex(referenceIndex),
00254 docPrMass(docProbMass),
00255 uniqDocLen(uniqueTermCount),
00256 delta(discount),
00257 strategy(smthStrategy) {
00258 };
00259
00260 virtual ~AbsoluteDiscountDocModel() {};
00261
00262 virtual double unseenCoeff() {
00263
00264 if (strategy == SimpleKLParameter::INTERPOLATE) {
00265 return (delta*uniqDocLen[id]/(double)refIndex->docLength(id));
00266 } else if (strategy==SimpleKLParameter::BACKOFF) {
00267 return (delta*uniqDocLen[id]/
00268 (refIndex->docLength(id)*(1-docPrMass[id])));
00269 } else {
00270 throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00271 }
00272 }
00273 virtual double seenProb(double termFreq, int termID) {
00274 if (strategy == SimpleKLParameter::INTERPOLATE) {
00275 return ((termFreq-delta)/(double)refIndex->docLength(id)+
00276 delta*uniqDocLen[id]*refLM.prob(termID)/
00277 (double)refIndex->docLength(id));
00278 } else if (strategy == SimpleKLParameter::BACKOFF) {
00279 return ((termFreq-delta)/(double)refIndex->docLength(id));
00280 } else {
00281 throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00282 }
00283 }
00284 private:
00285 Index *refIndex;
00286 double *collectPr;
00287 double *docPrMass;
00288 int *uniqDocLen;
00289 double delta;
00290 SimpleKLParameter::SmoothStrategy strategy;
00291 };
00292
00293
00294
00296
00297
00298 class TwoStageDocModel : public SimpleKLDocModel {
00299 public:
00300 TwoStageDocModel(int docID,
00301 Index *referenceIndex,
00302 UnigramLM &collectLM,
00303 double *docProbMass,
00304 double firstStageMu,
00305 double secondStageLambda,
00306 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00307 SimpleKLDocModel(docID, collectLM),
00308 refIndex(referenceIndex),
00309 docPrMass(docProbMass),
00310 mu(firstStageMu),
00311 lambda(secondStageLambda),
00312 strategy(smthStrategy) {
00313 };
00314
00315 virtual ~TwoStageDocModel() {};
00316
00317 virtual double unseenCoeff() {
00318
00319 if (strategy == SimpleKLParameter::INTERPOLATE) {
00320 return (mu+lambda*refIndex->docLength(id))/(mu+refIndex->docLength(id));
00321 } else if (strategy == SimpleKLParameter::BACKOFF) {
00322 return ((mu+lambda*refIndex->docLength(id))
00323 /((mu+refIndex->docLength(id))*
00324 (1-docPrMass[id])));
00325 } else {
00326 throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00327 }
00328 }
00329
00330 virtual double seenProb(double termFreq, int termID) {
00331 if (strategy == SimpleKLParameter::INTERPOLATE) {
00332 return ((1-lambda)*(termFreq+mu*refLM.prob(termID))/
00333 (double)(refIndex->docLength(id)+mu)
00334 + lambda*refLM.prob(termID));
00335 } else if (strategy == SimpleKLParameter::BACKOFF) {
00336 return (termFreq*(1-lambda)/
00337 (double)(refIndex->docLength(id)+mu));
00338 } else {
00339 throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00340 }
00341 }
00342 private:
00343 Index *refIndex;
00344 double *docPrMass;
00345 double mu;
00346 double lambda;
00347 SimpleKLParameter::SmoothStrategy strategy;
00348 };
00349
00350 #endif
00351
00352
00353
00354
00355