00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 #ifndef _SIMPLEKLDOCMODEL_HPP
00012 #define _SIMPLEKLDOCMODEL_HPP
00013 
00014 #include "DocumentRep.hpp"
00015 #include "Index.hpp"
00016 #include "UnigramLM.hpp"
00017 
00018 
/// Parameter enumerations, parameter bundles and default values used by the
/// simple KL-divergence document models declared in this header.
namespace SimpleKLParameter {
  /// Document language model smoothing methods. Each enumerator has a
  /// same-named document model class in this header.
  enum SmoothMethod {
    JELINEKMERCER = 0,
    DIRICHLETPRIOR = 1,
    ABSOLUTEDISCOUNT = 2,
    TWOSTAGE = 3
  };

  /// How the smoothed estimate is combined with the reference model.
  /// The document models in this header branch on this value in both
  /// unseenCoeff() and seenProb().
  enum SmoothStrategy {
    INTERPOLATE = 0,
    BACKOFF = 1
  };

  /// Query model update (feedback) methods.
  enum QueryUpdateMethod {
    MIXTURE = 0,
    DIVMIN = 1,
    MARKOVCHAIN = 2,
    RM1 = 3,
    RM2 = 4
  };

  /// Score adjustment methods. Note that numbering starts at 1, not 0.
  enum adjustedScoreMethods {
    QUERYLIKELIHOOD = 1,
    CROSSENTROPY = 2,
    NEGATIVEKLD = 3
  };

  /// Bundle of document smoothing parameters.
  struct DocSmoothParam {
    /// which smoothing method to use
    SmoothMethod smthMethod;
    /// which smoothing strategy to use
    SmoothStrategy smthStrategy;
    /// presumably the discount for ABSOLUTEDISCOUNT -- confirm against caller
    double ADDelta;
    /// presumably the reference-model weight for JELINEKMERCER -- confirm against caller
    double JMLambda;
    /// presumably the prior word count for DIRICHLETPRIOR -- confirm against caller
    double DirPrior;
  };

  // Defaults for DocSmoothParam. These are deliberately left as non-const
  // `static` objects so existing callers that take them by non-const
  // reference keep compiling; note that every translation unit including
  // this header therefore gets its own private copy.
  static SmoothMethod defaultSmoothMethod = DIRICHLETPRIOR;
  static SmoothStrategy defaultSmoothStrategy = INTERPOLATE;
  static double defaultADDelta = 0.7;
  static double defaultJMLambda = 0.5;
  static double defaultDirPrior = 1000;

  /// Bundle of query model / feedback parameters.
  struct QueryModelParam {
    /// which score adjustment to apply
    adjustedScoreMethods adjScoreMethod;
    /// query noise coefficient
    double qryNoise;

    /// which query update (feedback) method to use
    QueryUpdateMethod fbMethod;
    /// feedback coefficient
    double fbCoeff;
    /// feedback term count
    int fbTermCount;
    /// feedback probability threshold
    double fbPrTh;
    /// feedback probability-sum threshold
    double fbPrSumTh;
    /// noise coefficient of the feedback mixture
    double fbMixtureNoise;
    /// number of EM iterations
    int emIterations;
  };

  // Defaults for QueryModelParam (same per-translation-unit caveat as above).
  static QueryUpdateMethod defaultFBMethod = MIXTURE;
  static double defaultFBCoeff = 0.5;
  static int defaultFBTermCount = 50;
  static double defaultFBPrTh = 0.001;
  static double defaultFBPrSumTh = 1;
  static double defaultFBMixNoise = 0.5;
  static int defaultEMIterations = 50;
  static double defaultQryNoise = 0;
}
00080 
00081 
00083 
00096 class SimpleKLDocModel : public DocumentRep {
00097 public:
00098   SimpleKLDocModel(int docID, UnigramLM &collectLM) : DocumentRep(docID), refLM(collectLM) {}
00099   ~SimpleKLDocModel() {};
00100 
00102   virtual double termWeight(int termID, DocInfo *info) {
00103     double sp = seenProb(info->termCount(), termID);
00104     double usp = unseenCoeff();
00105     double ref = refLM.prob(termID);
00106     double score = sp/(usp*ref);
00107     
00108 
00109 
00110     
00111     return score;
00112   }
00113 
00115   virtual double scoreConstant() {
00116     return unseenCoeff();
00117   }
00118 
00120   virtual double unseenCoeff()=0; 
00122 
00123 
00124 protected:
00125   UnigramLM &refLM;
00126 };
00127 
00128 
00129 
00131 
00139 class JelinekMercerDocModel : public SimpleKLDocModel {
00140 public:
00141   JelinekMercerDocModel(int docID, 
00142                         Index *referenceIndex, 
00143                         UnigramLM &collectLM,
00144                         double *docProbMass,
00145                         double collectLMWeight, 
00146                         SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00147     SimpleKLDocModel(docID, collectLM),
00148     refIndex(referenceIndex),
00149     docPrMass(docProbMass),
00150     lambda(collectLMWeight), 
00151     strategy(smthStrategy) {
00152   };
00153 
00154   virtual ~JelinekMercerDocModel() {};
00155   
00156   virtual double unseenCoeff() {
00157     if (strategy == SimpleKLParameter::INTERPOLATE) {
00158       return lambda;
00159     } else if (strategy==SimpleKLParameter::BACKOFF) {
00160       return lambda/(1-docPrMass[id]);
00161     } else {
00162       throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00163     }
00164   }
00165   virtual double seenProb(double termFreq, int termID) {
00166     if (strategy == SimpleKLParameter::INTERPOLATE) {
00167       return ((1-lambda)*termFreq/(double)refIndex->docLength(id)+
00168               lambda*refLM.prob(termID));
00169     } else if (strategy == SimpleKLParameter::BACKOFF) {
00170       return ((1-lambda)*termFreq/(double)refIndex->docLength(id));
00171     } else {
00172       throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00173     }
00174   }
00175 private:
00176   Index *refIndex;
00177   double *docPrMass;
00178   double lambda;
00179   SimpleKLParameter::SmoothStrategy strategy;
00180 };
00181 
00183 
00188 class DirichletPriorDocModel : public SimpleKLDocModel {
00189 public:
00190   DirichletPriorDocModel(int docID,
00191                    Index *referenceIndex, 
00192                    UnigramLM &collectLM,
00193                    double *docProbMass,
00194                    double priorWordCount, 
00195                 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00196     SimpleKLDocModel(docID, collectLM),
00197     refIndex(referenceIndex),
00198     docPrMass(docProbMass),
00199     mu(priorWordCount),
00200             strategy(smthStrategy) {
00201   };
00202 
00203   virtual ~DirichletPriorDocModel() {};
00204 
00205   virtual double unseenCoeff() {
00206 
00207     if (strategy == SimpleKLParameter::INTERPOLATE) {
00208       return mu/(mu+refIndex->docLength(id));
00209     } else if (strategy==SimpleKLParameter::BACKOFF) {
00210       return (mu/((mu+refIndex->docLength(id))*
00211                       (1-docPrMass[id])));
00212     } else {
00213       throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00214     }
00215   }
00216 
00217   virtual double seenProb(double termFreq, int termID) {
00218     if (strategy == SimpleKLParameter::INTERPOLATE) {
00219       return (termFreq+mu*refLM.prob(termID))/
00220         (double)(refIndex->docLength(id)+mu);
00221     } else if (strategy == SimpleKLParameter::BACKOFF) {
00222       return (termFreq/
00223               (double)(refIndex->docLength(id)+mu));
00224     } else {      
00225       throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00226     }
00227   }
00228 private:
00229   Index *refIndex;
00230   double *docPrMass;
00231   double mu;
00232   SimpleKLParameter::SmoothStrategy strategy;
00233 };
00234 
00236 
00243 class AbsoluteDiscountDocModel : public SimpleKLDocModel {
00244 public:
00245   AbsoluteDiscountDocModel(int docID,
00246                            Index *referenceIndex, 
00247                            UnigramLM &collectLM,
00248                            double *docProbMass,
00249                            int *uniqueTermCount,
00250                            double discount,
00251                 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00252     SimpleKLDocModel(docID, collectLM),
00253     refIndex(referenceIndex),
00254     docPrMass(docProbMass),
00255     uniqDocLen(uniqueTermCount),
00256     delta(discount),
00257       strategy(smthStrategy) {
00258   };
00259 
00260   virtual ~AbsoluteDiscountDocModel() {};
00261   
00262   virtual double unseenCoeff() {
00263 
00264     if (strategy == SimpleKLParameter::INTERPOLATE) {
00265       return (delta*uniqDocLen[id]/(double)refIndex->docLength(id));
00266     } else if (strategy==SimpleKLParameter::BACKOFF) {
00267       return (delta*uniqDocLen[id]/
00268               (refIndex->docLength(id)*(1-docPrMass[id])));
00269     } else {
00270       throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00271     }
00272   }
00273   virtual double seenProb(double termFreq, int termID) {
00274     if (strategy == SimpleKLParameter::INTERPOLATE) {
00275       return ((termFreq-delta)/(double)refIndex->docLength(id)+
00276               delta*uniqDocLen[id]*refLM.prob(termID)/
00277               (double)refIndex->docLength(id));
00278     } else if (strategy == SimpleKLParameter::BACKOFF) {
00279       return ((termFreq-delta)/(double)refIndex->docLength(id));
00280     } else {
00281             throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00282     }
00283   }
00284 private:
00285   Index *refIndex;
00286   double *collectPr;
00287   double *docPrMass;
00288   int *uniqDocLen;
00289   double delta;
00290   SimpleKLParameter::SmoothStrategy strategy;
00291 };
00292 
00293 
00294 
00296 
00297 
00298 class TwoStageDocModel : public SimpleKLDocModel {
00299 public:
00300   TwoStageDocModel(int docID,
00301                    Index *referenceIndex, 
00302                    UnigramLM &collectLM,
00303                    double *docProbMass,
00304                    double firstStageMu, 
00305                    double secondStageLambda, 
00306                 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00307     SimpleKLDocModel(docID, collectLM),
00308     refIndex(referenceIndex),
00309     docPrMass(docProbMass),
00310     mu(firstStageMu),
00311     lambda(secondStageLambda),
00312       strategy(smthStrategy) {
00313   };
00314 
00315   virtual ~TwoStageDocModel() {};
00316 
00317   virtual double unseenCoeff() {
00318 
00319     if (strategy == SimpleKLParameter::INTERPOLATE) {
00320       return (mu+lambda*refIndex->docLength(id))/(mu+refIndex->docLength(id));
00321     } else if (strategy == SimpleKLParameter::BACKOFF) {
00322       return ((mu+lambda*refIndex->docLength(id))
00323               /((mu+refIndex->docLength(id))*
00324                 (1-docPrMass[id])));
00325     } else {
00326             throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00327     }
00328   }
00329 
00330   virtual double seenProb(double termFreq, int termID) {
00331     if (strategy == SimpleKLParameter::INTERPOLATE) {      
00332       return ((1-lambda)*(termFreq+mu*refLM.prob(termID))/
00333               (double)(refIndex->docLength(id)+mu) 
00334               + lambda*refLM.prob(termID));
00335     } else if (strategy == SimpleKLParameter::BACKOFF) {
00336       return (termFreq*(1-lambda)/
00337               (double)(refIndex->docLength(id)+mu));
00338     } else {
00339             throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00340     }
00341   }
00342 private:
00343   Index *refIndex;
00344   double *docPrMass;
00345   double mu;
00346   double lambda;
00347   SimpleKLParameter::SmoothStrategy strategy;
00348 };
00349 
00350 #endif 
00351 
00352 
00353 
00354 
00355