00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 #ifndef _SIMPLEKLDOCMODEL_HPP
00014 #define _SIMPLEKLDOCMODEL_HPP
00015 
00016 #include "DocumentRep.hpp"
00017 #include "Index.hpp"
00018 #include "UnigramLM.hpp"
00019 
00020 
/// Parameter types and default values used to configure the simple
/// KL-divergence language-model document/query models.
///
/// NOTE: the defaults below are non-const objects with internal linkage
/// (`static` at namespace scope), so every translation unit that includes
/// this header gets its own, independently modifiable copy.
namespace SimpleKLParameter {

  /// Selects which document-model smoothing method to use.
  enum SmoothMethod {
    JELINEKMERCER    = 0,
    DIRICHLETPRIOR   = 1,
    ABSOLUTEDISCOUNT = 2
  };

  /// Selects how the smoothed estimate is combined with the reference
  /// model (see the INTERPOLATE/BACKOFF branches of the doc-model classes).
  enum SmoothStrategy {
    INTERPOLATE = 0,
    BACKOFF     = 1
  };

  /// Selects the query-model (feedback) update method.
  enum QueryUpdateMethod {
    MIXTURE     = 0,
    DIVMIN      = 1,
    MARKOVCHAIN = 2
  };

  /// Bundle of document-smoothing settings.
  struct DocSmoothParam {
    /// smoothing method
    SmoothMethod smthMethod;
    /// smoothing strategy
    SmoothStrategy smthStrategy;
    /// presumably the discount used by absolute discounting -- confirm with the consumer
    double ADDelta;
    /// presumably the reference-model weight used by Jelinek-Mercer -- confirm with the consumer
    double JMLambda;
    /// presumably the prior word count used by the Dirichlet prior -- confirm with the consumer
    double DirPrior;
  };

  // Default document-smoothing settings.
  static SmoothMethod   defaultSmoothMethod   = DIRICHLETPRIOR;
  static SmoothStrategy defaultSmoothStrategy = INTERPOLATE;
  static double         defaultADDelta        = 0.7;
  static double         defaultJMLambda       = 0.5;
  static double         defaultDirPrior       = 1000;

  /// Bundle of query-model (feedback) settings. The exact meaning of each
  /// field is defined by the code that consumes it, which is not part of
  /// this header; the notes below are inferred from the field names only.
  struct QueryModelParam {
    /// query-model update method
    QueryUpdateMethod fbMethod;
    /// feedback coefficient
    double fbCoeff;
    /// feedback term count
    int fbTermCount;
    /// feedback probability threshold (name suggests a per-term cutoff -- TODO confirm)
    double fbPrTh;
    /// feedback probability-sum threshold (name suggests a cumulative cutoff -- TODO confirm)
    double fbPrSumTh;
    /// noise parameter for the mixture method
    double fbMixtureNoise;
    /// iteration count (name suggests EM iterations -- TODO confirm)
    int emIterations;
  };

  // Default query-model settings.
  static QueryUpdateMethod defaultFBMethod     = MIXTURE;
  static double            defaultFBCoeff      = 0.5;
  static int               defaultFBTermCount  = 50;
  static double            defaultFBPrTh       = 0.001;
  static double            defaultFBPrSumTh    = 1;
  static double            defaultFBMixNoise   = 0.5;
  static int               defaultEMIterations = 50;

}
00074 
00075 
00077 
00090 class SimpleKLDocModel : public DocumentRep {
00091 public:
00092   SimpleKLDocModel(int docID, UnigramLM &collectLM) : DocumentRep(docID), refLM(collectLM) {}
00093   ~SimpleKLDocModel() {};
00094 
00096   virtual double termWeight(int termID, DocInfo *info) {
00097     return (seenProb(info->termCount(), termID)/(unseenCoeff()* refLM.prob(termID)));
00098   }
00099 
00101   virtual double scoreConstant() {
00102     return unseenCoeff();
00103   }
00104 
00106   virtual double unseenCoeff()=0; 
00108 
00109 protected:
00110   UnigramLM &refLM;
00111 };
00112 
00113 
00114 
00116 
00124 class JelinekMercerDocModel : public SimpleKLDocModel {
00125 public:
00126   JelinekMercerDocModel(int docID, 
00127                         Index *referenceIndex, 
00128                         UnigramLM &collectLM,
00129                         double *docProbMass,
00130                         double collectLMWeight, 
00131                         SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00132     SimpleKLDocModel(docID, collectLM),
00133     refIndex(referenceIndex),
00134     docPrMass(docProbMass),
00135     lambda(collectLMWeight), 
00136     strategy(smthStrategy) {
00137   };
00138 
00139   virtual ~JelinekMercerDocModel() {};
00140   
00141   virtual double unseenCoeff() {
00142     if (strategy == SimpleKLParameter::INTERPOLATE) {
00143       return lambda;
00144     } else if (strategy==SimpleKLParameter::BACKOFF) {
00145       return lambda/(1-docPrMass[id]);
00146     } else {
00147       throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00148     }
00149   }
00150   virtual double seenProb(double termFreq, int termID) {
00151     if (strategy == SimpleKLParameter::INTERPOLATE) {
00152       return ((1-lambda)*termFreq/(double)refIndex->docLength(id)+
00153               lambda*refLM.prob(termID));
00154     } else if (strategy == SimpleKLParameter::BACKOFF) {
00155       return ((1-lambda)*termFreq/(double)refIndex->docLength(id));
00156     } else {
00157       throw Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00158     }
00159   }
00160 private:
00161   Index *refIndex;
00162   double *docPrMass;
00163   double lambda;
00164   SimpleKLParameter::SmoothStrategy strategy;
00165 };
00166 
00168 
00173 class DirichletPriorDocModel : public SimpleKLDocModel {
00174 public:
00175   DirichletPriorDocModel(int docID,
00176                    Index *referenceIndex, 
00177                    UnigramLM &collectLM,
00178                    double *docProbMass,
00179                    double priorWordCount, 
00180                 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00181     SimpleKLDocModel(docID, collectLM),
00182     refIndex(referenceIndex),
00183     docPrMass(docProbMass),
00184     mu(priorWordCount),
00185             strategy(smthStrategy) {
00186   };
00187 
00188   virtual ~DirichletPriorDocModel() {};
00189 
00190   virtual double unseenCoeff() {
00191 
00192     if (strategy == SimpleKLParameter::INTERPOLATE) {
00193       return mu/(mu+refIndex->docLength(id));
00194     } else if (strategy==SimpleKLParameter::BACKOFF) {
00195       return (mu/((mu+refIndex->docLength(id))*
00196                       (1-docPrMass[id])));
00197     } else {
00198       throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00199     }
00200   }
00201 
00202   virtual double seenProb(double termFreq, int termID) {
00203     if (strategy == SimpleKLParameter::INTERPOLATE) {
00204       return (termFreq+mu*refLM.prob(termID))/
00205         (double)(refIndex->docLength(id)+mu);
00206     } else if (strategy == SimpleKLParameter::BACKOFF) {
00207       return (termFreq/
00208               (double)(refIndex->docLength(id)+mu));
00209     } else {      
00210       throw Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00211     }
00212   }
00213 private:
00214   Index *refIndex;
00215   double *docPrMass;
00216   double mu;
00217   SimpleKLParameter::SmoothStrategy strategy;
00218 };
00219 
00221 
00228 class AbsoluteDiscountDocModel : public SimpleKLDocModel {
00229 public:
00230   AbsoluteDiscountDocModel(int docID,
00231                            Index *referenceIndex, 
00232                            UnigramLM &collectLM,
00233                            double *docProbMass,
00234                            int *uniqueTermCount,
00235                            double discount,
00236                 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00237     SimpleKLDocModel(docID, collectLM),
00238     refIndex(referenceIndex),
00239     docPrMass(docProbMass),
00240     uniqDocLen(uniqueTermCount),
00241     delta(discount),
00242       strategy(smthStrategy) {
00243   };
00244 
00245   virtual ~AbsoluteDiscountDocModel() {};
00246   
00247   virtual double unseenCoeff() {
00248 
00249     if (strategy == SimpleKLParameter::INTERPOLATE) {
00250       return (delta*uniqDocLen[id]/(double)refIndex->docLength(id));
00251     } else if (strategy==SimpleKLParameter::BACKOFF) {
00252       return (delta*uniqDocLen[id]/
00253               (refIndex->docLength(id)*(1-docPrMass[id])));
00254     } else {
00255       throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00256     }
00257   }
00258   virtual double seenProb(double termFreq, int termID) {
00259     if (strategy == SimpleKLParameter::INTERPOLATE) {
00260       return ((termFreq-delta)/(double)refIndex->docLength(id)+
00261               delta*uniqDocLen[id]*refLM.prob(termID)/
00262               (double)refIndex->docLength(id));
00263     } else if (strategy == SimpleKLParameter::BACKOFF) {
00264       return ((termFreq-delta)/(double)refIndex->docLength(id));
00265     } else {
00266             throw Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00267     }
00268   }
00269 private:
00270   Index *refIndex;
00271   double *collectPr;
00272   double *docPrMass;
00273   int *uniqDocLen;
00274   double delta;
00275   SimpleKLParameter::SmoothStrategy strategy;
00276 };
00277 
00278 
00279 #endif 
00280 
00281 
00282 
00283 
00284