00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef _SIMPLEKLDOCMODEL_HPP
00012 #define _SIMPLEKLDOCMODEL_HPP
00013
00014 #include "DocumentRep.hpp"
00015 #include "Index.hpp"
00016 #include "UnigramLM.hpp"
00017
00018
/// Parameter types and default values shared by the SimpleKL models.
///
/// The SmoothStrategy enum is consumed by the document-model classes declared
/// later in this header; the consumers of the remaining types are not visible
/// in this file.
namespace SimpleKLParameter {

  /// Identifies a document smoothing method.
  enum SmoothMethod {JELINEKMERCER=0, DIRICHLETPRIOR=1, ABSOLUTEDISCOUNT=2, TWOSTAGE=3};

  /// Identifies how the reference (collection) model is combined with the
  /// document estimate: the document models in this header add a
  /// reference-model term to seen-word probabilities only for INTERPOLATE.
  enum SmoothStrategy {INTERPOLATE=0, BACKOFF=1};

  /// Identifies a query-model update method.
  enum QueryUpdateMethod {MIXTURE = 0, DIVMIN=1, MARKOVCHAIN=2, RM1=3, RM2=4};

  /// Bundle of document-smoothing settings.
  struct DocSmoothParam {
    /// Which smoothing method to use.
    SmoothMethod smthMethod;
    /// Which smoothing strategy to use.
    SmoothStrategy smthStrategy;
    /// Presumably the discount constant for ABSOLUTEDISCOUNT -- confirm against the consumer.
    double ADDelta;
    /// Presumably the collection-model weight for JELINEKMERCER -- confirm against the consumer.
    double JMLambda;
    /// Presumably the prior word count for DIRICHLETPRIOR -- confirm against the consumer.
    double DirPrior;
  };

  // Defaults for DocSmoothParam-style settings. These are declared `static`,
  // so every translation unit that includes this header gets its own private,
  // mutable copy; linkage and mutability are kept as-is for compatibility.
  static SmoothMethod defaultSmoothMethod = DIRICHLETPRIOR;
  static SmoothStrategy defaultSmoothStrategy = INTERPOLATE;
  static double defaultADDelta = 0.7;
  static double defaultJMLambda = 0.5;
  static double defaultDirPrior = 1000;

  /// Bundle of query-model (feedback) settings.
  /// NOTE(review): field meanings below are inferred from the names only;
  /// the code that reads them is not part of this header.
  struct QueryModelParam {
    /// Query noise setting.
    double qryNoise;

    /// Which query update method to use.
    QueryUpdateMethod fbMethod;
    /// Feedback coefficient.
    double fbCoeff;
    /// Feedback term count.
    int fbTermCount;
    /// Feedback probability threshold.
    double fbPrTh;
    /// Feedback probability-sum threshold.
    double fbPrSumTh;
    /// Noise setting for mixture feedback.
    double fbMixtureNoise;
    /// Number of EM iterations.
    int emIterations;
  };

  // Defaults for QueryModelParam-style settings (same `static` caveat as above).
  static QueryUpdateMethod defaultFBMethod = MIXTURE;
  static double defaultFBCoeff = 0.5;
  static int defaultFBTermCount = 50;
  static double defaultFBPrTh = 0.001;
  static double defaultFBPrSumTh = 1;
  static double defaultFBMixNoise = 0.5;
  static int defaultEMIterations = 50;
  static double defaultQryNoise = 0;
}
00075
00076
00078
00091 class SimpleKLDocModel : public DocumentRep {
00092 public:
00093 SimpleKLDocModel(int docID, UnigramLM &collectLM) : DocumentRep(docID), refLM(collectLM) {}
00094 ~SimpleKLDocModel() {};
00095
00097 virtual double termWeight(int termID, DocInfo *info) {
00098 return (seenProb(info->termCount(), termID)/(unseenCoeff()* refLM.prob(termID)));
00099 }
00100
00102 virtual double scoreConstant() {
00103 return unseenCoeff();
00104 }
00105
00107 virtual double unseenCoeff()=0;
00109
00110
00111 protected:
00112 UnigramLM &refLM;
00113 };
00114
00115
00116
00118
00126 class JelinekMercerDocModel : public SimpleKLDocModel {
00127 public:
00128 JelinekMercerDocModel(int docID,
00129 Index *referenceIndex,
00130 UnigramLM &collectLM,
00131 double *docProbMass,
00132 double collectLMWeight,
00133 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00134 SimpleKLDocModel(docID, collectLM),
00135 refIndex(referenceIndex),
00136 docPrMass(docProbMass),
00137 lambda(collectLMWeight),
00138 strategy(smthStrategy) {
00139 };
00140
00141 virtual ~JelinekMercerDocModel() {};
00142
00143 virtual double unseenCoeff() {
00144 if (strategy == SimpleKLParameter::INTERPOLATE) {
00145 return lambda;
00146 } else if (strategy==SimpleKLParameter::BACKOFF) {
00147 return lambda/(1-docPrMass[id]);
00148 } else {
00149 throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00150 }
00151 }
00152 virtual double seenProb(double termFreq, int termID) {
00153 if (strategy == SimpleKLParameter::INTERPOLATE) {
00154 return ((1-lambda)*termFreq/(double)refIndex->docLength(id)+
00155 lambda*refLM.prob(termID));
00156 } else if (strategy == SimpleKLParameter::BACKOFF) {
00157 return ((1-lambda)*termFreq/(double)refIndex->docLength(id));
00158 } else {
00159 throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00160 }
00161 }
00162 private:
00163 Index *refIndex;
00164 double *docPrMass;
00165 double lambda;
00166 SimpleKLParameter::SmoothStrategy strategy;
00167 };
00168
00170
00175 class DirichletPriorDocModel : public SimpleKLDocModel {
00176 public:
00177 DirichletPriorDocModel(int docID,
00178 Index *referenceIndex,
00179 UnigramLM &collectLM,
00180 double *docProbMass,
00181 double priorWordCount,
00182 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00183 SimpleKLDocModel(docID, collectLM),
00184 refIndex(referenceIndex),
00185 docPrMass(docProbMass),
00186 mu(priorWordCount),
00187 strategy(smthStrategy) {
00188 };
00189
00190 virtual ~DirichletPriorDocModel() {};
00191
00192 virtual double unseenCoeff() {
00193
00194 if (strategy == SimpleKLParameter::INTERPOLATE) {
00195 return mu/(mu+refIndex->docLength(id));
00196 } else if (strategy==SimpleKLParameter::BACKOFF) {
00197 return (mu/((mu+refIndex->docLength(id))*
00198 (1-docPrMass[id])));
00199 } else {
00200 throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00201 }
00202 }
00203
00204 virtual double seenProb(double termFreq, int termID) {
00205 if (strategy == SimpleKLParameter::INTERPOLATE) {
00206 return (termFreq+mu*refLM.prob(termID))/
00207 (double)(refIndex->docLength(id)+mu);
00208 } else if (strategy == SimpleKLParameter::BACKOFF) {
00209 return (termFreq/
00210 (double)(refIndex->docLength(id)+mu));
00211 } else {
00212 throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00213 }
00214 }
00215 private:
00216 Index *refIndex;
00217 double *docPrMass;
00218 double mu;
00219 SimpleKLParameter::SmoothStrategy strategy;
00220 };
00221
00223
00230 class AbsoluteDiscountDocModel : public SimpleKLDocModel {
00231 public:
00232 AbsoluteDiscountDocModel(int docID,
00233 Index *referenceIndex,
00234 UnigramLM &collectLM,
00235 double *docProbMass,
00236 int *uniqueTermCount,
00237 double discount,
00238 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00239 SimpleKLDocModel(docID, collectLM),
00240 refIndex(referenceIndex),
00241 docPrMass(docProbMass),
00242 uniqDocLen(uniqueTermCount),
00243 delta(discount),
00244 strategy(smthStrategy) {
00245 };
00246
00247 virtual ~AbsoluteDiscountDocModel() {};
00248
00249 virtual double unseenCoeff() {
00250
00251 if (strategy == SimpleKLParameter::INTERPOLATE) {
00252 return (delta*uniqDocLen[id]/(double)refIndex->docLength(id));
00253 } else if (strategy==SimpleKLParameter::BACKOFF) {
00254 return (delta*uniqDocLen[id]/
00255 (refIndex->docLength(id)*(1-docPrMass[id])));
00256 } else {
00257 throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00258 }
00259 }
00260 virtual double seenProb(double termFreq, int termID) {
00261 if (strategy == SimpleKLParameter::INTERPOLATE) {
00262 return ((termFreq-delta)/(double)refIndex->docLength(id)+
00263 delta*uniqDocLen[id]*refLM.prob(termID)/
00264 (double)refIndex->docLength(id));
00265 } else if (strategy == SimpleKLParameter::BACKOFF) {
00266 return ((termFreq-delta)/(double)refIndex->docLength(id));
00267 } else {
00268 throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00269 }
00270 }
00271 private:
00272 Index *refIndex;
00273 double *collectPr;
00274 double *docPrMass;
00275 int *uniqDocLen;
00276 double delta;
00277 SimpleKLParameter::SmoothStrategy strategy;
00278 };
00279
00280
00281
00283
00284
00285 class TwoStageDocModel : public SimpleKLDocModel {
00286 public:
00287 TwoStageDocModel(int docID,
00288 Index *referenceIndex,
00289 UnigramLM &collectLM,
00290 double *docProbMass,
00291 double firstStageMu,
00292 double secondStageLambda,
00293 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00294 SimpleKLDocModel(docID, collectLM),
00295 refIndex(referenceIndex),
00296 docPrMass(docProbMass),
00297 mu(firstStageMu),
00298 lambda(secondStageLambda),
00299 strategy(smthStrategy) {
00300 };
00301
00302 virtual ~TwoStageDocModel() {};
00303
00304 virtual double unseenCoeff() {
00305
00306 if (strategy == SimpleKLParameter::INTERPOLATE) {
00307 return (mu+lambda*refIndex->docLength(id))/(mu+refIndex->docLength(id));
00308 } else if (strategy == SimpleKLParameter::BACKOFF) {
00309 return ((mu+lambda*refIndex->docLength(id))
00310 /((mu+refIndex->docLength(id))*
00311 (1-docPrMass[id])));
00312 } else {
00313 throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00314 }
00315 }
00316
00317 virtual double seenProb(double termFreq, int termID) {
00318 if (strategy == SimpleKLParameter::INTERPOLATE) {
00319 return ((1-lambda)*(termFreq+mu*refLM.prob(termID))/
00320 (double)(refIndex->docLength(id)+mu)
00321 + lambda*refLM.prob(termID));
00322 } else if (strategy == SimpleKLParameter::BACKOFF) {
00323 return (termFreq*(1-lambda)/
00324 (double)(refIndex->docLength(id)+mu));
00325 } else {
00326 throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00327 }
00328 }
00329 private:
00330 Index *refIndex;
00331 double *docPrMass;
00332 double mu;
00333 double lambda;
00334 SimpleKLParameter::SmoothStrategy strategy;
00335 };
00336
00337 #endif
00338
00339
00340
00341
00342