Main Page   Namespace List   Class Hierarchy   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

SimpleKLDocModel.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.cs.cmu.edu/~lemur/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 #ifndef _SIMPLEKLDOCMODEL_HPP
00014 #define _SIMPLEKLDOCMODEL_HPP
00015 
00016 #include "DocumentRep.hpp"
00017 #include "Index.hpp"
00018 #include "UnigramLM.hpp"
00019 
00020 
/// Parameter types and default values for the simple KL-divergence
/// retrieval method (document smoothing and query model updating).
namespace SimpleKLParameter {
  /// Identifies the document smoothing method.
  enum SmoothMethod {JELINEKMERCER=0, DIRICHLETPRIOR=1, ABSOLUTEDISCOUNT=2, TWOSTAGE=3};

  /// Identifies how the smoothed document model is combined with the
  /// reference model: interpolation or back-off.
  enum SmoothStrategy {INTERPOLATE=0, BACKOFF=1};

  /// Identifies the query model updating (feedback) method.
  enum QueryUpdateMethod {MIXTURE=0, DIVMIN=1, MARKOVCHAIN=2};

  /// Bundle of document smoothing settings.
  struct DocSmoothParam {
    /// smoothing method
    SmoothMethod smthMethod;
    /// smoothing strategy
    SmoothStrategy smthStrategy;
    /// presumably the discount (delta) used by absolute discounting
    double ADDelta;
    /// presumably the collection model weight (lambda) used by Jelinek-Mercer
    double JMLambda;
    /// presumably the prior word count (mu) used by the Dirichlet prior
    double DirPrior;
  };

  // Default document smoothing settings.
  // NOTE: these are namespace-scope `static` variables defined in a header,
  // so they have internal linkage: every translation unit that includes this
  // header gets its own copy, and assigning to one of them in one source
  // file is not visible in another. Treat them as read-only defaults.
  static SmoothMethod defaultSmoothMethod = DIRICHLETPRIOR;
  static SmoothStrategy defaultSmoothStrategy = INTERPOLATE;
  static double defaultADDelta = 0.7;
  static double defaultJMLambda = 0.5;
  static double defaultDirPrior = 1000;

  /// Bundle of query model (feedback) settings.
  /// NOTE(review): the per-field meanings below are inferred from the field
  /// names only; confirm against the code that consumes this struct.
  struct QueryModelParam {
    /// query noise; the default of 0 is annotated below as the maximum
    /// likelihood estimator
    double qryNoise;

    /// query model updating method
    QueryUpdateMethod fbMethod;
    /// feedback coefficient
    double fbCoeff;
    /// feedback term count
    int fbTermCount;
    /// presumably a per-term probability threshold for feedback terms
    double fbPrTh;
    /// presumably a cumulative probability threshold for feedback terms
    double fbPrSumTh;
    /// noise parameter for the mixture feedback method
    double fbMixtureNoise;
    /// number of EM iterations
    int emIterations;
  };

  // Default query model settings (same per-translation-unit caveat as the
  // document smoothing defaults above).
  static QueryUpdateMethod defaultFBMethod = MIXTURE;
  static double defaultFBCoeff = 0.5;
  static int defaultFBTermCount = 50;
  static double defaultFBPrTh = 0.001;
  static double defaultFBPrSumTh = 1;
  static double defaultFBMixNoise = 0.5;
  static int defaultEMIterations = 50;
  static double defaultQryNoise = 0; // maximum likelihood estimator
}
00077 
00078 
00080 
00093 class SimpleKLDocModel : public DocumentRep {
00094 public:
00095   SimpleKLDocModel(int docID, UnigramLM &collectLM) : DocumentRep(docID), refLM(collectLM) {}
00096   ~SimpleKLDocModel() {};
00097 
00099   virtual double termWeight(int termID, DocInfo *info) {
00100     return (seenProb(info->termCount(), termID)/(unseenCoeff()* refLM.prob(termID)));
00101   }
00102 
00104   virtual double scoreConstant() {
00105     return unseenCoeff();
00106   }
00107 
00109   virtual double unseenCoeff()=0; // a(d)
00111   virtual double seenProb(double termFreq, int termID)=0; // p(w|d), w seen
00112 protected:
00113   UnigramLM &refLM;
00114 };
00115 
00116 
00117 
00119 
00127 class JelinekMercerDocModel : public SimpleKLDocModel {
00128 public:
00129   JelinekMercerDocModel(int docID, 
00130                         Index *referenceIndex, 
00131                         UnigramLM &collectLM,
00132                         double *docProbMass,
00133                         double collectLMWeight, 
00134                         SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00135     SimpleKLDocModel(docID, collectLM),
00136     refIndex(referenceIndex),
00137     docPrMass(docProbMass),
00138     lambda(collectLMWeight), 
00139     strategy(smthStrategy) {
00140   };
00141 
00142   virtual ~JelinekMercerDocModel() {};
00143   
00144   virtual double unseenCoeff() {
00145     if (strategy == SimpleKLParameter::INTERPOLATE) {
00146       return lambda;
00147     } else if (strategy==SimpleKLParameter::BACKOFF) {
00148       return lambda/(1-docPrMass[id]);
00149     } else {
00150       throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00151     }
00152   }
00153   virtual double seenProb(double termFreq, int termID) {
00154     if (strategy == SimpleKLParameter::INTERPOLATE) {
00155       return ((1-lambda)*termFreq/(double)refIndex->docLength(id)+
00156               lambda*refLM.prob(termID));
00157     } else if (strategy == SimpleKLParameter::BACKOFF) {
00158       return ((1-lambda)*termFreq/(double)refIndex->docLength(id));
00159     } else {
00160       throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00161     }
00162   }
00163 private:
00164   Index *refIndex;
00165   double *docPrMass;
00166   double lambda;
00167   SimpleKLParameter::SmoothStrategy strategy;
00168 };
00169 
00171 
00176 class DirichletPriorDocModel : public SimpleKLDocModel {
00177 public:
00178   DirichletPriorDocModel(int docID,
00179                    Index *referenceIndex, 
00180                    UnigramLM &collectLM,
00181                    double *docProbMass,
00182                    double priorWordCount, 
00183                 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00184     SimpleKLDocModel(docID, collectLM),
00185     refIndex(referenceIndex),
00186     docPrMass(docProbMass),
00187     mu(priorWordCount),
00188             strategy(smthStrategy) {
00189   };
00190 
00191   virtual ~DirichletPriorDocModel() {};
00192 
00193   virtual double unseenCoeff() {
00194 
00195     if (strategy == SimpleKLParameter::INTERPOLATE) {
00196       return mu/(mu+refIndex->docLength(id));
00197     } else if (strategy==SimpleKLParameter::BACKOFF) {
00198       return (mu/((mu+refIndex->docLength(id))*
00199                       (1-docPrMass[id])));
00200     } else {
00201       throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00202     }
00203   }
00204 
00205   virtual double seenProb(double termFreq, int termID) {
00206     if (strategy == SimpleKLParameter::INTERPOLATE) {
00207       return (termFreq+mu*refLM.prob(termID))/
00208         (double)(refIndex->docLength(id)+mu);
00209     } else if (strategy == SimpleKLParameter::BACKOFF) {
00210       return (termFreq/
00211               (double)(refIndex->docLength(id)+mu));
00212     } else {      
00213       throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00214     }
00215   }
00216 private:
00217   Index *refIndex;
00218   double *docPrMass;
00219   double mu;
00220   SimpleKLParameter::SmoothStrategy strategy;
00221 };
00222 
00224 
00231 class AbsoluteDiscountDocModel : public SimpleKLDocModel {
00232 public:
00233   AbsoluteDiscountDocModel(int docID,
00234                            Index *referenceIndex, 
00235                            UnigramLM &collectLM,
00236                            double *docProbMass,
00237                            int *uniqueTermCount,
00238                            double discount,
00239                 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00240     SimpleKLDocModel(docID, collectLM),
00241     refIndex(referenceIndex),
00242     docPrMass(docProbMass),
00243     uniqDocLen(uniqueTermCount),
00244     delta(discount),
00245       strategy(smthStrategy) {
00246   };
00247 
00248   virtual ~AbsoluteDiscountDocModel() {};
00249   
00250   virtual double unseenCoeff() {
00251 
00252     if (strategy == SimpleKLParameter::INTERPOLATE) {
00253       return (delta*uniqDocLen[id]/(double)refIndex->docLength(id));
00254     } else if (strategy==SimpleKLParameter::BACKOFF) {
00255       return (delta*uniqDocLen[id]/
00256               (refIndex->docLength(id)*(1-docPrMass[id])));
00257     } else {
00258       throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00259     }
00260   }
00261   virtual double seenProb(double termFreq, int termID) {
00262     if (strategy == SimpleKLParameter::INTERPOLATE) {
00263       return ((termFreq-delta)/(double)refIndex->docLength(id)+
00264               delta*uniqDocLen[id]*refLM.prob(termID)/
00265               (double)refIndex->docLength(id));
00266     } else if (strategy == SimpleKLParameter::BACKOFF) {
00267       return ((termFreq-delta)/(double)refIndex->docLength(id));
00268     } else {
00269             throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00270     }
00271   }
00272 private:
00273   Index *refIndex;
00274   double *collectPr;
00275   double *docPrMass;
00276   int *uniqDocLen;
00277   double delta;
00278   SimpleKLParameter::SmoothStrategy strategy;
00279 };
00280 
00281 
00282 
00284 // alpha = (mu+lambda*dLength)/(dLength+mu)
00285 // pseen(w) = [(1-lambda)*c(w;d)+ (mu+lambda*dLength)*Pc(w)]/(dLength + mu)
00286 class TwoStageDocModel : public SimpleKLDocModel {
00287 public:
00288   TwoStageDocModel(int docID,
00289                    Index *referenceIndex, 
00290                    UnigramLM &collectLM,
00291                    double *docProbMass,
00292                    double firstStageMu, 
00293                    double secondStageLambda, 
00294                 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00295     SimpleKLDocModel(docID, collectLM),
00296     refIndex(referenceIndex),
00297     docPrMass(docProbMass),
00298     mu(firstStageMu),
00299     lambda(secondStageLambda),
00300       strategy(smthStrategy) {
00301   };
00302 
00303   virtual ~TwoStageDocModel() {};
00304 
00305   virtual double unseenCoeff() {
00306 
00307     if (strategy == SimpleKLParameter::INTERPOLATE) {
00308       return (mu+lambda*refIndex->docLength(id))/(mu+refIndex->docLength(id));
00309     } else if (strategy == SimpleKLParameter::BACKOFF) {
00310       return ((mu+lambda*refIndex->docLength(id))
00311               /((mu+refIndex->docLength(id))*
00312                 (1-docPrMass[id])));
00313     } else {
00314             throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00315     }
00316   }
00317 
00318   virtual double seenProb(double termFreq, int termID) {
00319     if (strategy == SimpleKLParameter::INTERPOLATE) {      
00320       return ((1-lambda)*(termFreq+mu*refLM.prob(termID))/
00321               (double)(refIndex->docLength(id)+mu) 
00322               + lambda*refLM.prob(termID));
00323     } else if (strategy == SimpleKLParameter::BACKOFF) {
00324       return (termFreq*(1-lambda)/
00325               (double)(refIndex->docLength(id)+mu));
00326     } else {
00327             throw Exception("TwoStageDocModel", "Unknown smoothing strategy");
00328     }
00329   }
00330 private:
00331   Index *refIndex;
00332   double *docPrMass;
00333   double mu;
00334   double lambda;
00335   SimpleKLParameter::SmoothStrategy strategy;
00336 };
00337 
00338 #endif /* _SIMPLEKLDOCMODEL_HPP */
00339 
00340 
00341 
00342 
00343 

Generated at Fri Jul 26 18:26:25 2002 for LEMUR by doxygen 1.2.4, written by Dimitri van Heesch, © 1997-2000