Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

SimpleKLDocModel.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  *
00003  *  Original source copyright (c) 2001, Carnegie Mellon University.
00004  *  See copyright.cmu for details.
00005  *  Modifications copyright (c) 2002, University of Massachusetts.
00006  *  See copyright.umass for details.
00007  *
00008  *==========================================================================
00009 */
00010 
00011 #ifndef _SIMPLEKLDOCMODEL_HPP
00012 #define _SIMPLEKLDOCMODEL_HPP
00013 
00014 #include "DocumentRep.hpp"
00015 #include "Index.hpp"
00016 #include "UnigramLM.hpp"
00017 
00018 
/// Enumerations, parameter structs and default values shared by the
/// SimpleKL document models declared in this header.
namespace SimpleKLParameter {
  /// Selects the document-model smoothing method; the enumerators mirror the
  /// four *DocModel classes in this header.
  enum SmoothMethod {JELINEKMERCER=0, DIRICHLETPRIOR=1, ABSOLUTEDISCOUNT=2, TWOSTAGE=3};

  /// Selects which formula variant a document model uses (see the
  /// INTERPOLATE/BACKOFF branches of unseenCoeff()/seenProb()).
  enum SmoothStrategy {INTERPOLATE=0, BACKOFF=1};

  /// Selects the query-model update method.
  enum QueryUpdateMethod {MIXTURE=0, DIVMIN=1, MARKOVCHAIN=2, RM1=3, RM2=4};

  /// Bundle of document-smoothing settings.
  struct DocSmoothParam {
    /// Smoothing method to use.
    SmoothMethod smthMethod;
    /// Smoothing strategy to use.
    SmoothStrategy smthStrategy;
    /// NOTE(review): presumably the discount (delta) for ABSOLUTEDISCOUNT -- confirm with the code that fills this struct.
    double ADDelta;
    /// NOTE(review): presumably the collection-model weight (lambda) for JELINEKMERCER -- confirm.
    double JMLambda;
    /// NOTE(review): presumably the prior word count (mu) for DIRICHLETPRIOR -- confirm.
    double DirPrior;
  };

  // Defaults are const: this header is included by several translation
  // units, each of which gets its own internal-linkage copy, so a writable
  // "default" could silently diverge between units.
  static const SmoothMethod defaultSmoothMethod = DIRICHLETPRIOR;
  static const SmoothStrategy defaultSmoothStrategy = INTERPOLATE;
  static const double defaultADDelta = 0.7;
  static const double defaultJMLambda = 0.5;
  static const double defaultDirPrior = 1000;

  /// Bundle of query-model (feedback) settings.
  /// NOTE(review): the per-field meanings below are inferred from the field
  /// names only; the code that consumes them is not in this header.
  struct QueryModelParam {
    /// Query noise coefficient (see defaultQryNoise).
    double qryNoise;

    /// Query-model update method to use.
    QueryUpdateMethod fbMethod;
    /// Feedback coefficient -- presumably the weight given to the feedback model.
    double fbCoeff;
    /// Feedback term count -- presumably how many terms the updated query model keeps.
    int fbTermCount;
    /// Feedback probability threshold -- presumably a per-term cut-off.
    double fbPrTh;
    /// Feedback probability-sum threshold -- presumably a cumulative cut-off.
    double fbPrSumTh;
    /// Feedback mixture noise -- presumably the background weight in the mixture model.
    double fbMixtureNoise;
    /// EM iteration count -- presumably an upper bound on EM iterations.
    int emIterations;
  };

  static const QueryUpdateMethod defaultFBMethod = MIXTURE;
  static const double defaultFBCoeff = 0.5;
  static const int defaultFBTermCount = 50;
  static const double defaultFBPrTh = 0.001;
  static const double defaultFBPrSumTh = 1;
  static const double defaultFBMixNoise = 0.5;
  static const int defaultEMIterations = 50;
  static const double defaultQryNoise = 0; // maximum likelihood estimator
}
00075 
00076 
00078 
00091 class SimpleKLDocModel : public DocumentRep {
00092 public:
00093   SimpleKLDocModel(int docID, UnigramLM &collectLM) : DocumentRep(docID), refLM(collectLM) {}
00094   ~SimpleKLDocModel() {};
00095 
00097   virtual double termWeight(int termID, DocInfo *info) {
00098     return (seenProb(info->termCount(), termID)/(unseenCoeff()* refLM.prob(termID)));
00099   }
00100 
00102   virtual double scoreConstant() {
00103     return unseenCoeff();
00104   }
00105 
00107   virtual double unseenCoeff()=0; // a(d)
00109   virtual double seenProb(double termFreq, int termID)=0; // p(w|d), w seen
00110 
00111 protected:
00112   UnigramLM &refLM;
00113 };
00114 
00115 
00116 
00118 
00126 class JelinekMercerDocModel : public SimpleKLDocModel {
00127 public:
00128   JelinekMercerDocModel(int docID, 
00129                         Index *referenceIndex, 
00130                         UnigramLM &collectLM,
00131                         double *docProbMass,
00132                         double collectLMWeight, 
00133                         SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00134     SimpleKLDocModel(docID, collectLM),
00135     refIndex(referenceIndex),
00136     docPrMass(docProbMass),
00137     lambda(collectLMWeight), 
00138     strategy(smthStrategy) {
00139   };
00140 
00141   virtual ~JelinekMercerDocModel() {};
00142   
00143   virtual double unseenCoeff() {
00144     if (strategy == SimpleKLParameter::INTERPOLATE) {
00145       return lambda;
00146     } else if (strategy==SimpleKLParameter::BACKOFF) {
00147       return lambda/(1-docPrMass[id]);
00148     } else {
00149       throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00150     }
00151   }
00152   virtual double seenProb(double termFreq, int termID) {
00153     if (strategy == SimpleKLParameter::INTERPOLATE) {
00154       return ((1-lambda)*termFreq/(double)refIndex->docLength(id)+
00155               lambda*refLM.prob(termID));
00156     } else if (strategy == SimpleKLParameter::BACKOFF) {
00157       return ((1-lambda)*termFreq/(double)refIndex->docLength(id));
00158     } else {
00159       throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00160     }
00161   }
00162 private:
00163   Index *refIndex;
00164   double *docPrMass;
00165   double lambda;
00166   SimpleKLParameter::SmoothStrategy strategy;
00167 };
00168 
00170 
00175 class DirichletPriorDocModel : public SimpleKLDocModel {
00176 public:
00177   DirichletPriorDocModel(int docID,
00178                    Index *referenceIndex, 
00179                    UnigramLM &collectLM,
00180                    double *docProbMass,
00181                    double priorWordCount, 
00182                 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00183     SimpleKLDocModel(docID, collectLM),
00184     refIndex(referenceIndex),
00185     docPrMass(docProbMass),
00186     mu(priorWordCount),
00187             strategy(smthStrategy) {
00188   };
00189 
00190   virtual ~DirichletPriorDocModel() {};
00191 
00192   virtual double unseenCoeff() {
00193 
00194     if (strategy == SimpleKLParameter::INTERPOLATE) {
00195       return mu/(mu+refIndex->docLength(id));
00196     } else if (strategy==SimpleKLParameter::BACKOFF) {
00197       return (mu/((mu+refIndex->docLength(id))*
00198                       (1-docPrMass[id])));
00199     } else {
00200       throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00201     }
00202   }
00203 
00204   virtual double seenProb(double termFreq, int termID) {
00205     if (strategy == SimpleKLParameter::INTERPOLATE) {
00206       return (termFreq+mu*refLM.prob(termID))/
00207         (double)(refIndex->docLength(id)+mu);
00208     } else if (strategy == SimpleKLParameter::BACKOFF) {
00209       return (termFreq/
00210               (double)(refIndex->docLength(id)+mu));
00211     } else {      
00212       throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00213     }
00214   }
00215 private:
00216   Index *refIndex;
00217   double *docPrMass;
00218   double mu;
00219   SimpleKLParameter::SmoothStrategy strategy;
00220 };
00221 
00223 
00230 class AbsoluteDiscountDocModel : public SimpleKLDocModel {
00231 public:
00232   AbsoluteDiscountDocModel(int docID,
00233                            Index *referenceIndex, 
00234                            UnigramLM &collectLM,
00235                            double *docProbMass,
00236                            int *uniqueTermCount,
00237                            double discount,
00238                 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00239     SimpleKLDocModel(docID, collectLM),
00240     refIndex(referenceIndex),
00241     docPrMass(docProbMass),
00242     uniqDocLen(uniqueTermCount),
00243     delta(discount),
00244       strategy(smthStrategy) {
00245   };
00246 
00247   virtual ~AbsoluteDiscountDocModel() {};
00248   
00249   virtual double unseenCoeff() {
00250 
00251     if (strategy == SimpleKLParameter::INTERPOLATE) {
00252       return (delta*uniqDocLen[id]/(double)refIndex->docLength(id));
00253     } else if (strategy==SimpleKLParameter::BACKOFF) {
00254       return (delta*uniqDocLen[id]/
00255               (refIndex->docLength(id)*(1-docPrMass[id])));
00256     } else {
00257       throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00258     }
00259   }
00260   virtual double seenProb(double termFreq, int termID) {
00261     if (strategy == SimpleKLParameter::INTERPOLATE) {
00262       return ((termFreq-delta)/(double)refIndex->docLength(id)+
00263               delta*uniqDocLen[id]*refLM.prob(termID)/
00264               (double)refIndex->docLength(id));
00265     } else if (strategy == SimpleKLParameter::BACKOFF) {
00266       return ((termFreq-delta)/(double)refIndex->docLength(id));
00267     } else {
00268             throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00269     }
00270   }
00271 private:
00272   Index *refIndex;
00273   double *collectPr;
00274   double *docPrMass;
00275   int *uniqDocLen;
00276   double delta;
00277   SimpleKLParameter::SmoothStrategy strategy;
00278 };
00279 
00280 
00281 
00283 // alpha = (mu+lambda*dLength)/(dLength+mu)
00284 // pseen(w) = [(1-lambda)*c(w;d)+ (mu+lambda*dLength)*Pc(w)]/(dLength + mu)
00285 class TwoStageDocModel : public SimpleKLDocModel {
00286 public:
00287   TwoStageDocModel(int docID,
00288                    Index *referenceIndex, 
00289                    UnigramLM &collectLM,
00290                    double *docProbMass,
00291                    double firstStageMu, 
00292                    double secondStageLambda, 
00293                 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00294     SimpleKLDocModel(docID, collectLM),
00295     refIndex(referenceIndex),
00296     docPrMass(docProbMass),
00297     mu(firstStageMu),
00298     lambda(secondStageLambda),
00299       strategy(smthStrategy) {
00300   };
00301 
00302   virtual ~TwoStageDocModel() {};
00303 
00304   virtual double unseenCoeff() {
00305 
00306     if (strategy == SimpleKLParameter::INTERPOLATE) {
00307       return (mu+lambda*refIndex->docLength(id))/(mu+refIndex->docLength(id));
00308     } else if (strategy == SimpleKLParameter::BACKOFF) {
00309       return ((mu+lambda*refIndex->docLength(id))
00310               /((mu+refIndex->docLength(id))*
00311                 (1-docPrMass[id])));
00312     } else {
00313             throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00314     }
00315   }
00316 
00317   virtual double seenProb(double termFreq, int termID) {
00318     if (strategy == SimpleKLParameter::INTERPOLATE) {      
00319       return ((1-lambda)*(termFreq+mu*refLM.prob(termID))/
00320               (double)(refIndex->docLength(id)+mu) 
00321               + lambda*refLM.prob(termID));
00322     } else if (strategy == SimpleKLParameter::BACKOFF) {
00323       return (termFreq*(1-lambda)/
00324               (double)(refIndex->docLength(id)+mu));
00325     } else {
00326             throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00327     }
00328   }
00329 private:
00330   Index *refIndex;
00331   double *docPrMass;
00332   double mu;
00333   double lambda;
00334   SimpleKLParameter::SmoothStrategy strategy;
00335 };
00336 
00337 #endif /* _SIMPLEKLDOCMODEL_HPP */
00338 
00339 
00340 
00341 
00342 

Generated on Tue Nov 25 11:26:46 2003 for Lemur Toolkit by doxygen 1.2.18