00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _SIMPLEKLDOCMODEL_HPP
00014 #define _SIMPLEKLDOCMODEL_HPP
00015
00016 #include "DocumentRep.hpp"
00017 #include "Index.hpp"
00018 #include "UnigramLM.hpp"
00019
00020
/// Parameter types and default values for the simple KL-divergence
/// retrieval models.
namespace SimpleKLParameter {

  /// Document smoothing methods. The enumerator names mirror the four
  /// document model classes defined later in this header.
  enum SmoothMethod {
    JELINEKMERCER    = 0,
    DIRICHLETPRIOR   = 1,
    ABSOLUTEDISCOUNT = 2,
    TWOSTAGE         = 3
  };

  /// Smoothing strategy. The document model classes in this header branch
  /// on this value inside unseenCoeff() and seenProb().
  enum SmoothStrategy {
    INTERPOLATE = 0,
    BACKOFF     = 1
  };

  /// Query model update (feedback) methods; stored in QueryModelParam::fbMethod.
  enum QueryUpdateMethod {
    MIXTURE     = 0,
    DIVMIN      = 1,
    MARKOVCHAIN = 2
  };

  /// Bundle of document smoothing settings.
  /// NOTE(review): the per-field descriptions are inferred from the field
  /// names; confirm against the code that reads this struct.
  struct DocSmoothParam {
    SmoothMethod   smthMethod;    ///< which smoothing method to use
    SmoothStrategy smthStrategy;  ///< interpolate or back off
    double ADDelta;               ///< presumably the absolute-discounting delta
    double JMLambda;              ///< presumably the Jelinek-Mercer lambda
    double DirPrior;              ///< presumably the Dirichlet prior
  };

  // Default document smoothing settings. These are namespace-scope `static`
  // objects, so every translation unit that includes this header gets its
  // own private, modifiable copy.
  static SmoothMethod   defaultSmoothMethod   = DIRICHLETPRIOR;
  static SmoothStrategy defaultSmoothStrategy = INTERPOLATE;
  static double         defaultADDelta        = 0.7;
  static double         defaultJMLambda       = 0.5;
  static double         defaultDirPrior       = 1000;

  /// Bundle of query model / feedback settings.
  /// NOTE(review): the per-field descriptions are inferred from the field
  /// names; confirm against the code that reads this struct.
  struct QueryModelParam {
    double qryNoise;              ///< presumably query noise

    QueryUpdateMethod fbMethod;   ///< query update (feedback) method
    double fbCoeff;               ///< presumably the feedback coefficient
    int    fbTermCount;           ///< presumably the number of feedback terms
    double fbPrTh;                ///< presumably a feedback probability threshold
    double fbPrSumTh;             ///< presumably a feedback probability-sum threshold
    double fbMixtureNoise;        ///< presumably the mixture-model noise
    int    emIterations;          ///< presumably the number of EM iterations
  };

  // Default query model settings (per-translation-unit copies, see above).
  static QueryUpdateMethod defaultFBMethod     = MIXTURE;
  static double            defaultFBCoeff      = 0.5;
  static int               defaultFBTermCount  = 50;
  static double            defaultFBPrTh       = 0.001;
  static double            defaultFBPrSumTh    = 1;
  static double            defaultFBMixNoise   = 0.5;
  static int               defaultEMIterations = 50;
  static double            defaultQryNoise     = 0;

}  // namespace SimpleKLParameter
00077
00078
00080
00093 class SimpleKLDocModel : public DocumentRep {
00094 public:
00095 SimpleKLDocModel(int docID, UnigramLM &collectLM) : DocumentRep(docID), refLM(collectLM) {}
00096 ~SimpleKLDocModel() {};
00097
00099 virtual double termWeight(int termID, DocInfo *info) {
00100 return (seenProb(info->termCount(), termID)/(unseenCoeff()* refLM.prob(termID)));
00101 }
00102
00104 virtual double scoreConstant() {
00105 return unseenCoeff();
00106 }
00107
00109 virtual double unseenCoeff()=0;
00111
00112 protected:
00113 UnigramLM &refLM;
00114 };
00115
00116
00117
00119
00127 class JelinekMercerDocModel : public SimpleKLDocModel {
00128 public:
00129 JelinekMercerDocModel(int docID,
00130 Index *referenceIndex,
00131 UnigramLM &collectLM,
00132 double *docProbMass,
00133 double collectLMWeight,
00134 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00135 SimpleKLDocModel(docID, collectLM),
00136 refIndex(referenceIndex),
00137 docPrMass(docProbMass),
00138 lambda(collectLMWeight),
00139 strategy(smthStrategy) {
00140 };
00141
00142 virtual ~JelinekMercerDocModel() {};
00143
00144 virtual double unseenCoeff() {
00145 if (strategy == SimpleKLParameter::INTERPOLATE) {
00146 return lambda;
00147 } else if (strategy==SimpleKLParameter::BACKOFF) {
00148 return lambda/(1-docPrMass[id]);
00149 } else {
00150 throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00151 }
00152 }
00153 virtual double seenProb(double termFreq, int termID) {
00154 if (strategy == SimpleKLParameter::INTERPOLATE) {
00155 return ((1-lambda)*termFreq/(double)refIndex->docLength(id)+
00156 lambda*refLM.prob(termID));
00157 } else if (strategy == SimpleKLParameter::BACKOFF) {
00158 return ((1-lambda)*termFreq/(double)refIndex->docLength(id));
00159 } else {
00160 throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00161 }
00162 }
00163 private:
00164 Index *refIndex;
00165 double *docPrMass;
00166 double lambda;
00167 SimpleKLParameter::SmoothStrategy strategy;
00168 };
00169
00171
00176 class DirichletPriorDocModel : public SimpleKLDocModel {
00177 public:
00178 DirichletPriorDocModel(int docID,
00179 Index *referenceIndex,
00180 UnigramLM &collectLM,
00181 double *docProbMass,
00182 double priorWordCount,
00183 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00184 SimpleKLDocModel(docID, collectLM),
00185 refIndex(referenceIndex),
00186 docPrMass(docProbMass),
00187 mu(priorWordCount),
00188 strategy(smthStrategy) {
00189 };
00190
00191 virtual ~DirichletPriorDocModel() {};
00192
00193 virtual double unseenCoeff() {
00194
00195 if (strategy == SimpleKLParameter::INTERPOLATE) {
00196 return mu/(mu+refIndex->docLength(id));
00197 } else if (strategy==SimpleKLParameter::BACKOFF) {
00198 return (mu/((mu+refIndex->docLength(id))*
00199 (1-docPrMass[id])));
00200 } else {
00201 throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00202 }
00203 }
00204
00205 virtual double seenProb(double termFreq, int termID) {
00206 if (strategy == SimpleKLParameter::INTERPOLATE) {
00207 return (termFreq+mu*refLM.prob(termID))/
00208 (double)(refIndex->docLength(id)+mu);
00209 } else if (strategy == SimpleKLParameter::BACKOFF) {
00210 return (termFreq/
00211 (double)(refIndex->docLength(id)+mu));
00212 } else {
00213 throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00214 }
00215 }
00216 private:
00217 Index *refIndex;
00218 double *docPrMass;
00219 double mu;
00220 SimpleKLParameter::SmoothStrategy strategy;
00221 };
00222
00224
00231 class AbsoluteDiscountDocModel : public SimpleKLDocModel {
00232 public:
00233 AbsoluteDiscountDocModel(int docID,
00234 Index *referenceIndex,
00235 UnigramLM &collectLM,
00236 double *docProbMass,
00237 int *uniqueTermCount,
00238 double discount,
00239 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00240 SimpleKLDocModel(docID, collectLM),
00241 refIndex(referenceIndex),
00242 docPrMass(docProbMass),
00243 uniqDocLen(uniqueTermCount),
00244 delta(discount),
00245 strategy(smthStrategy) {
00246 };
00247
00248 virtual ~AbsoluteDiscountDocModel() {};
00249
00250 virtual double unseenCoeff() {
00251
00252 if (strategy == SimpleKLParameter::INTERPOLATE) {
00253 return (delta*uniqDocLen[id]/(double)refIndex->docLength(id));
00254 } else if (strategy==SimpleKLParameter::BACKOFF) {
00255 return (delta*uniqDocLen[id]/
00256 (refIndex->docLength(id)*(1-docPrMass[id])));
00257 } else {
00258 throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00259 }
00260 }
00261 virtual double seenProb(double termFreq, int termID) {
00262 if (strategy == SimpleKLParameter::INTERPOLATE) {
00263 return ((termFreq-delta)/(double)refIndex->docLength(id)+
00264 delta*uniqDocLen[id]*refLM.prob(termID)/
00265 (double)refIndex->docLength(id));
00266 } else if (strategy == SimpleKLParameter::BACKOFF) {
00267 return ((termFreq-delta)/(double)refIndex->docLength(id));
00268 } else {
00269 throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00270 }
00271 }
00272 private:
00273 Index *refIndex;
00274 double *collectPr;
00275 double *docPrMass;
00276 int *uniqDocLen;
00277 double delta;
00278 SimpleKLParameter::SmoothStrategy strategy;
00279 };
00280
00281
00282
00284
00285
00286 class TwoStageDocModel : public SimpleKLDocModel {
00287 public:
00288 TwoStageDocModel(int docID,
00289 Index *referenceIndex,
00290 UnigramLM &collectLM,
00291 double *docProbMass,
00292 double firstStageMu,
00293 double secondStageLambda,
00294 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00295 SimpleKLDocModel(docID, collectLM),
00296 refIndex(referenceIndex),
00297 docPrMass(docProbMass),
00298 mu(firstStageMu),
00299 lambda(secondStageLambda),
00300 strategy(smthStrategy) {
00301 };
00302
00303 virtual ~TwoStageDocModel() {};
00304
00305 virtual double unseenCoeff() {
00306
00307 if (strategy == SimpleKLParameter::INTERPOLATE) {
00308 return (mu+lambda*refIndex->docLength(id))/(mu+refIndex->docLength(id));
00309 } else if (strategy == SimpleKLParameter::BACKOFF) {
00310 return ((mu+lambda*refIndex->docLength(id))
00311 /((mu+refIndex->docLength(id))*
00312 (1-docPrMass[id])));
00313 } else {
00314 throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00315 }
00316 }
00317
00318 virtual double seenProb(double termFreq, int termID) {
00319 if (strategy == SimpleKLParameter::INTERPOLATE) {
00320 return ((1-lambda)*(termFreq+mu*refLM.prob(termID))/
00321 (double)(refIndex->docLength(id)+mu)
00322 + lambda*refLM.prob(termID));
00323 } else if (strategy == SimpleKLParameter::BACKOFF) {
00324 return (termFreq*(1-lambda)/
00325 (double)(refIndex->docLength(id)+mu));
00326 } else {
00327 throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00328 }
00329 }
00330 private:
00331 Index *refIndex;
00332 double *docPrMass;
00333 double mu;
00334 double lambda;
00335 SimpleKLParameter::SmoothStrategy strategy;
00336 };
00337
00338 #endif
00339
00340
00341
00342
00343