Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

SimpleKLDocModel.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  *
00003  *  Original source copyright (c) 2001, Carnegie Mellon University.
00004  *  See copyright.cmu for details.
00005  *  Modifications copyright (c) 2002, University of Massachusetts.
00006  *  See copyright.umass for details.
00007  *
00008  *==========================================================================
00009 */
00010 
00011 #ifndef _SIMPLEKLDOCMODEL_HPP
00012 #define _SIMPLEKLDOCMODEL_HPP
00013 
00014 #include "DocumentRep.hpp"
00015 #include "Index.hpp"
00016 #include "UnigramLM.hpp"
00017 
00018 
/// Enumerations, parameter bundles and default values used to configure
/// the simple KL-divergence retrieval model.
namespace SimpleKLParameter {

  /// Selects the document smoothing method.
  enum SmoothMethod {
    JELINEKMERCER = 0,
    DIRICHLETPRIOR = 1,
    ABSOLUTEDISCOUNT = 2,
    TWOSTAGE = 3
  };

  /// Selects how the smoothed document model is combined with the
  /// reference model.
  enum SmoothStrategy {
    INTERPOLATE = 0,
    BACKOFF = 1
  };

  /// Selects the method used to update the query model.
  enum QueryUpdateMethod {
    MIXTURE = 0,
    DIVMIN = 1,
    MARKOVCHAIN = 2,
    RM1 = 3,
    RM2 = 4
  };

  /// Selects the form of the adjusted score. Note that numbering starts at 1.
  enum adjustedScoreMethods {
    QUERYLIKELIHOOD = 1,
    CROSSENTROPY = 2,
    NEGATIVEKLD = 3
  };

  /// Document smoothing configuration.
  /// NOTE(review): the per-field meanings are inferred from the field
  /// names; confirm against the code that fills this struct.
  struct DocSmoothParam {
    /// Smoothing method to apply.
    SmoothMethod smthMethod;
    /// Smoothing strategy to apply.
    SmoothStrategy smthStrategy;
    /// Presumably the delta for absolute discounting.
    double ADDelta;
    /// Presumably the lambda for Jelinek-Mercer smoothing.
    double JMLambda;
    /// Presumably the prior word count for Dirichlet prior smoothing.
    double DirPrior;
  };

  // Default document smoothing settings. These are `static`, so every
  // translation unit that includes this header gets its own copy.
  static SmoothMethod defaultSmoothMethod = DIRICHLETPRIOR;
  static SmoothStrategy defaultSmoothStrategy = INTERPOLATE;
  static double defaultADDelta = 0.7;
  static double defaultJMLambda = 0.5;
  static double defaultDirPrior = 1000;

  /// Query model / feedback configuration.
  /// NOTE(review): the per-field meanings are inferred from the field
  /// names; confirm against the code that fills this struct.
  struct QueryModelParam {
    /// Which adjusted score form to use.
    adjustedScoreMethods adjScoreMethod;
    /// Query noise parameter.
    double qryNoise;

    /// Query update (feedback) method.
    QueryUpdateMethod fbMethod;
    /// Feedback coefficient.
    double fbCoeff;
    /// Feedback term count.
    int fbTermCount;
    /// Feedback probability threshold.
    double fbPrTh;
    /// Feedback probability-sum threshold.
    double fbPrSumTh;
    /// Noise parameter for the feedback mixture.
    double fbMixtureNoise;
    /// Number of EM iterations.
    int emIterations;
  };

  // Default query model / feedback settings (also one copy per
  // translation unit).
  static QueryUpdateMethod defaultFBMethod = MIXTURE;
  static double defaultFBCoeff = 0.5;
  static int defaultFBTermCount = 50;
  static double defaultFBPrTh = 0.001;
  static double defaultFBPrSumTh = 1;
  static double defaultFBMixNoise = 0.5;
  static int defaultEMIterations = 50;
  static double defaultQryNoise = 0; // maximum likelihood estimator
}
00080 
00081 
00083 
00096 class SimpleKLDocModel : public DocumentRep {
00097 public:
00098   SimpleKLDocModel(DOCID_T docID, const UnigramLM &collectLM, int dl = 1, 
00099                    const double *prMass = NULL,
00100                    SimpleKLParameter::SmoothStrategy strat = SimpleKLParameter::INTERPOLATE) : 
00101     DocumentRep(docID, dl), 
00102     refLM(collectLM), docPrMass(prMass), strategy(strat) {
00103   };
00104   
00105   ~SimpleKLDocModel() {};
00106 
00108   virtual double termWeight(TERMID_T termID, const DocInfo *info) const {
00109     double sp = seenProb(info->termCount(), termID);
00110     double usp = unseenCoeff();
00111     double ref = refLM.prob(termID);
00112     double score = sp/(usp*ref);
00113     /*
00114     cerr << "TW:" << termID << " sp:" << sp << " usp:" << usp << " ref:" << ref << " s:" << score << endl;
00115     */
00116     //    return (seenProb(info->termCount(), termID)/(unseenCoeff()* refLM.prob(termID)));
00117     return score;
00118   }
00119 
00121   virtual double scoreConstant() const {
00122     return unseenCoeff();
00123   }
00124 
00126   virtual double unseenCoeff() const =0; // a(d)
00128   virtual double seenProb(double termFreq, TERMID_T termID) const =0;
00129 
00130 protected:
00131   const UnigramLM &refLM;
00132   const double *docPrMass;
00133   SimpleKLParameter::SmoothStrategy strategy;
00134 };
00135 
00136 
00137 
00139 
00147 class JelinekMercerDocModel : public SimpleKLDocModel {
00148 public:
00149   JelinekMercerDocModel(DOCID_T docID, 
00150                         int dl,
00151                         const UnigramLM &collectLM,
00152                         const double *docProbMass,
00153                         double collectLMWeight, 
00154                         SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00155     SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00156     lambda(collectLMWeight) {
00157   };
00158 
00159   virtual ~JelinekMercerDocModel() {};
00160   
00161   virtual double unseenCoeff() const {
00162     if (strategy == SimpleKLParameter::INTERPOLATE) {
00163       return lambda;
00164     } else if (strategy==SimpleKLParameter::BACKOFF) {
00165       return lambda/(1-docPrMass[id]);
00166     } else {
00167       throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00168     }
00169   }
00170   virtual double seenProb(double termFreq, TERMID_T termID) const {
00171     if (strategy == SimpleKLParameter::INTERPOLATE) {
00172       return ((1-lambda)*termFreq/(double)docLength +
00173               lambda*refLM.prob(termID));
00174     } else if (strategy == SimpleKLParameter::BACKOFF) {
00175       return ((1-lambda)*termFreq/(double)docLength);
00176     } else {
00177       throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00178     }
00179   }
00180 private:
00181   double lambda;
00182 };
00183 
00185 
00190 class DirichletPriorDocModel : public SimpleKLDocModel {
00191 public:
00192   DirichletPriorDocModel(DOCID_T docID,
00193                          int dl,
00194                          const UnigramLM &collectLM,
00195                          const double *docProbMass,
00196                          double priorWordCount,
00197                          SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00198     SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00199     mu(priorWordCount) {
00200   };
00201 
00202   virtual ~DirichletPriorDocModel() {};
00203 
00204   virtual double unseenCoeff() const {
00205 
00206     if (strategy == SimpleKLParameter::INTERPOLATE) {
00207       return mu/(mu+docLength);
00208     } else if (strategy==SimpleKLParameter::BACKOFF) {
00209       return (mu/((mu+docLength)*(1-docPrMass[id])));
00210     } else {
00211       throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00212     }
00213   }
00214 
00215   virtual double seenProb(double termFreq, TERMID_T termID) const {
00216     if (strategy == SimpleKLParameter::INTERPOLATE) {
00217       return (termFreq+mu*refLM.prob(termID))/
00218         (double)(docLength+mu);
00219     } else if (strategy == SimpleKLParameter::BACKOFF) {
00220       return (termFreq/(double)(docLength+mu));
00221     } else {      
00222       throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00223     }
00224   }
00225 private:
00226   double mu;
00227 };
00228 
00230 
00237 class AbsoluteDiscountDocModel : public SimpleKLDocModel {
00238 public:
00239   AbsoluteDiscountDocModel(DOCID_T docID,
00240                            int dl,
00241                            const UnigramLM &collectLM,
00242                            const double *docProbMass,
00243                            COUNT_T *uniqueTermCount,
00244                            double discount,
00245                            SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00246     SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00247     uniqDocLen(uniqueTermCount),
00248     delta(discount) {
00249   };
00250 
00251   virtual ~AbsoluteDiscountDocModel() {};
00252   
00253   virtual double unseenCoeff() const {
00254 
00255     if (strategy == SimpleKLParameter::INTERPOLATE) {
00256       return (delta*uniqDocLen[id]/(double)docLength);
00257     } else if (strategy==SimpleKLParameter::BACKOFF) {
00258       return (delta*uniqDocLen[id]/(docLength*(1-docPrMass[id])));
00259     } else {
00260       throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00261     }
00262   }
00263   virtual double seenProb(double termFreq, TERMID_T termID) const {
00264     if (strategy == SimpleKLParameter::INTERPOLATE) {
00265       return ((termFreq-delta)/(double)docLength+
00266               delta*uniqDocLen[id]*refLM.prob(termID)/(double)docLength);
00267     } else if (strategy == SimpleKLParameter::BACKOFF) {
00268       return ((termFreq-delta)/(double)docLength);
00269     } else {
00270             throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00271     }
00272   }
00273 private:
00274   double *collectPr;
00275   COUNT_T *uniqDocLen;
00276   double delta;
00277 };
00278 
00279 
00281 // alpha = (mu+lambda*dLength)/(dLength+mu)
00282 // pseen(w) = [(1-lambda)*c(w;d)+ (mu+lambda*dLength)*Pc(w)]/(dLength + mu)
00283 class TwoStageDocModel : public SimpleKLDocModel {
00284 public:
00285   TwoStageDocModel(DOCID_T docID,
00286                    int dl,
00287                    const UnigramLM &collectLM,
00288                    const double *docProbMass,
00289                    double firstStageMu, 
00290                    double secondStageLambda, 
00291                 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00292     SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00293     mu(firstStageMu),
00294     lambda(secondStageLambda) {
00295   };
00296 
00297   virtual ~TwoStageDocModel() {};
00298 
00299   virtual double unseenCoeff() const {
00300 
00301     if (strategy == SimpleKLParameter::INTERPOLATE) {
00302       return (mu+lambda*docLength)/(mu+docLength);
00303     } else if (strategy == SimpleKLParameter::BACKOFF) {
00304       return ((mu+lambda*docLength)/((mu+docLength)*(1-docPrMass[id])));
00305     } else {
00306             throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00307     }
00308   }
00309 
00310   virtual double seenProb(double termFreq, TERMID_T termID) const {
00311     if (strategy == SimpleKLParameter::INTERPOLATE) {      
00312       return ((1-lambda)*(termFreq+mu*refLM.prob(termID))/
00313               (double)(docLength+mu) + lambda*refLM.prob(termID));
00314     } else if (strategy == SimpleKLParameter::BACKOFF) {
00315       return (termFreq*(1-lambda)/(double)(docLength+mu));
00316     } else {
00317             throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00318     }
00319   }
00320 private:
00321   double mu;
00322   double lambda;
00323 };
00324 
00325 #endif /* _SIMPLEKLDOCMODEL_HPP */
00326 
00327 
00328 
00329 
00330 

Generated on Wed Nov 3 12:59:03 2004 for Lemur Toolkit by doxygen 1.2.18