Main Page   Namespace List   Class Hierarchy   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

SimpleKLRetMethod.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  *
00003  *  Original source copyright (c) 2001, Carnegie Mellon University.
00004  *  See copyright.cmu for details.
00005  *  Modifications copyright (c) 2002, University of Massachusetts.
00006  *  See copyright.umass for details.
00007  *
00008  *==========================================================================
00009 */
00010 
00011 
00012 #ifndef _SIMPLEKLRETMETHOD_HPP
00013 #define _SIMPLEKLRETMETHOD_HPP
00014 
00015 #include <cmath>
00016 #include "UnigramLM.hpp"
00017 #include "ScoreFunction.hpp"
00018 #include "SimpleKLDocModel.hpp"
00019 #include "TextQueryRep.hpp"
00020 #include "TextQueryRetMethod.hpp"
00021 #include "Counter.hpp"
00022 #include "DocUnigramCounter.hpp"
00023 
00025 
00026 class SimpleKLQueryModel : public ArrayQueryRep {
00027 public:
00029   SimpleKLQueryModel(TextQuery &qry, Index &dbIndex) : 
00030     ArrayQueryRep(dbIndex.termCountUnique()+1, qry, dbIndex), qm(NULL), 
00031     ind(dbIndex), colKLComputed(false) {
00032     startIteration();
00033     colQLikelihood = 0;
00034     //Sum w in Q qtf * log(qtcf/termcount);
00035     int tc = ind.termCount();
00036     while (hasMore()) {
00037       QueryTerm *qt = nextTerm();
00038       int id = qt->id();
00039       double qtf = qt->weight();
00040       int qtcf = ind.termCount(id);
00041       double s = qtf * log((double)qtcf/(double)tc);
00042       colQLikelihood += s;
00043       delete qt;
00044     }
00045 
00046   }
00047 
00049   SimpleKLQueryModel(Index &dbIndex) : 
00050     ArrayQueryRep(dbIndex.termCountUnique()+1), qm(NULL), ind(dbIndex), 
00051     colKLComputed(false) {
00052     colQLikelihood = 0;
00053     startIteration();
00054     while (hasMore()) {
00055       QueryTerm *qt = nextTerm();
00056       setCount(qt->id(), 0);
00057       delete qt;
00058     }
00059   }
00060 
00061 
00062   virtual ~SimpleKLQueryModel(){ if (qm) delete qm;}
00063 
00064 
00066 
00073   virtual void interpolateWith(UnigramLM &qModel, double origModCoeff, 
00074                                int howManyWord, double prSumThresh=1, 
00075                                double prThresh=0);
00076   virtual double scoreConstant() {
00077     return totalCount();
00078   }
00079   
00081   virtual void load(istream &is);
00082 
00084   virtual void save(ostream &os);
00085 
00087   virtual void clarity(ostream &os);
00089   virtual double clarity();
00090 
00091 #if 0
00092 
00093   double colDivergence() {
00094     if (colKLComputed) {
00095       return colKL;
00096     } else {
00097       colKLComputed = true;
00098       double d=0;
00099       startIteration();
00100       while (hasMore()) {
00101         QueryTerm *qt=nextTerm();
00102         double pr = qt->weight()/(double)totalCount();
00103         double colPr = (ind.termCount(qt->id())+1)/(double)(ind.termCount()+ind.termCountUnique()); // Laplace smoothing, same as in SimpleKLRetMethod
00104         d += pr*log(pr/colPr);
00105         delete qt;
00106         
00107       }
00108       colKL=d;
00109       return d;
00110     }
00111   }
00112 #endif
00113 
00114   double colDivergence() {
00115     if (colKLComputed) {
00116       return colKL;
00117     } else {
00118       colKLComputed = true;
00119       double d=0;
00120       startIteration();
00121       while (hasMore()) {
00122         QueryTerm *qt=nextTerm();
00123         double pr = qt->weight()/(double)totalCount();
00124         //      double colPr = (ind.termCount(qt->id())+1)/(double)(ind.termCount()+ind.termCountUnique()); // Laplace smoothing, same as in SimpleKLRetMethod
00125         double colPr = ((double)ind.termCount(qt->id())/(double)(ind.termCount())); // ML smoothing, same as in SimpleKLRetMethod
00126         d += pr*log(pr/colPr);
00127         delete qt;
00128         
00129       }
00130       colKL=d;
00131       return d;
00132     }
00133   }
00134 
00135 
00137   double KLDivergence(UnigramLM &refMod) {
00138     double d=0;
00139     startIteration();
00140     while (hasMore()) {
00141       QueryTerm *qt=nextTerm();
00142       double pr = qt->weight()/(double)totalCount();
00143       d += pr*log(pr/refMod.prob(qt->id()));
00144       delete qt;
00145     }
00146     return d;
00147   }
00148 
00149   double colQueryLikelihood() {
00150     return colQLikelihood;
00151   }
00152   
00153 
00154 protected:
00155   // For Query likelihood adjusted score
00156   double colQLikelihood;
00157   
00158   double colKL;
00159   bool colKLComputed;
00160 
00161   IndexedRealVector *qm;
00162   Index &ind;
00163 };
00164 
00165 
00166 
00168 
00183 class SimpleKLScoreFunc : public ScoreFunction {
00184 public:
00185   enum SimpleKLParameter::adjustedScoreMethods adjScoreMethod;
00186   void setScoreMethod(enum SimpleKLParameter::adjustedScoreMethods adj) {
00187     adjScoreMethod = adj;
00188   }  
00189   virtual double matchedTermWeight(QueryTerm *qTerm, TextQueryRep *qRep, DocInfo *info, DocumentRep *dRep) { 
00190     double w = qTerm->weight();
00191     double d = dRep->termWeight(qTerm->id(),info);
00192     double l = log(d);
00193     double score = w*l;
00194     /*
00195     cerr << "M:" << qTerm->id() <<" d:" << info->docID() << " w:" << w 
00196          << " d:" << d << " l:" << l << " s:" << score << endl;
00197     */
00198     return score;
00199     //    return (qTerm->weight()*log(dRep->termWeight(qTerm->id(),info)));
00200   }
00202   virtual double adjustedScore(double origScore, TextQueryRep *qRep, 
00203                                DocumentRep *dRep) {
00204     SimpleKLQueryModel *qm = (SimpleKLQueryModel *)qRep;
00205     // dynamic_cast<SimpleKLQueryModel *>qRep;
00206     SimpleKLDocModel *dm = (SimpleKLDocModel *)dRep;
00207       // dynamic_cast<SimpleKLDocModel *>dRep;
00208 
00209     double qsc = qm->scoreConstant();
00210     double dsc = log(dm->scoreConstant());
00211     double cql = qm->colQueryLikelihood();
00212     // real query likelihood
00213     double s = dsc * qsc + origScore + cql;
00214     double qsNorm = origScore/qsc;
00215     double qmD = qm->colDivergence();
00217     switch (adjScoreMethod) {
00218     case SimpleKLParameter::QUERYLIKELIHOOD:
00220       // this is the original query likelihood scoring formula
00221       /*
00222       cerr << "A:"<< origScore << " dsc:" << dsc  << " qsc:" << qsc  
00223            << " cql:" << cql << " s:"  << s << endl;
00224       */
00225       return s;
00226       //      return (origScore+log(dm->scoreConstant())*qm->scoreConstant());
00227     case SimpleKLParameter::CROSSENTROPY:
00229       // This is the normalized query-likelihood, i.e., cross-entropy
00230       assert(qm->scoreConstant()!=0);
00231       // return (origScore/qm->scoreConstant() + log(dm->scoreConstant()));
00232       // add the term colQueryLikelihood/qm->scoreConstant
00233       s = qsNorm + dsc + cql/qsc;
00234       return (s);
00235     case SimpleKLParameter::NEGATIVEKLD:
00237       // This is the exact (negative) KL-divergence value, i.e., -D(Mq||Md)
00238       assert(qm->scoreConstant()!=0);
00239       s = qsNorm + dsc - qmD;
00240       /*
00241       cerr << origScore << ":" << qsNorm << ":" << dsc  << ":" << qmD  << ":" << s << endl;
00242       */
00243       return s;
00244       //      return (origScore/qm->scoreConstant() + log(dm->scoreConstant())
00245       //              - qm->colDivergence());
00246     }
00247   }
00248   
00249 #if 0
00250 
00251   virtual double adjustedScore(double origScore, TextQueryRep *qRep, DocumentRep *dRep) {
00252     SimpleKLQueryModel *qm = (SimpleKLQueryModel *)qRep;
00253     // dynamic_cast<SimpleKLQueryModel *>qRep;
00254     SimpleKLDocModel *dm = (SimpleKLDocModel *)dRep;
00255       // dynamic_cast<SimpleKLDocModel *>dRep;
00256 
00258 
00260     // this is the original query likelihood scoring formula
00261     //  return (origScore+log(dm->scoreConstant())*qm->scoreConstant());
00262 
00264     // This is the normalized query-likelihood, i.e., cross-entropy
00265     // assert(qm->scoreConstant()!=0);
00266     // return (origScore/qm->scoreConstant() + log(dm->scoreConstant()));
00267 
00269     // This is the exact (negative) KL-divergence value, i.e., -D(Mq||Md)
00270     assert(qm->scoreConstant()!=0);
00271     return (origScore/qm->scoreConstant() + log(dm->scoreConstant())
00272             - qm->colDivergence());
00273 
00274 
00275   }
00276 #endif
00277 };
00278 
00279 
00280 
00281 
00283 
00284 
00285 class SimpleKLRetMethod : public TextQueryRetMethod {
00286 public:
00287 
00289   SimpleKLRetMethod(Index &dbIndex, const char *supportFileName, ScoreAccumulator &accumulator);
00290   virtual ~SimpleKLRetMethod();
00291   
00292   virtual TextQueryRep *computeTextQueryRep(TextQuery &qry) {
00293     return (new SimpleKLQueryModel(qry, ind));
00294   }
00295   
00296   virtual DocumentRep *computeDocRep(int docID);
00297   
00298 
00299   virtual ScoreFunction *scoreFunc() {
00300     return (scFunc);
00301   }
00302   
00303 
00304   virtual void updateTextQuery(TextQueryRep &origRep, DocIDSet &relDocs);
00305 
00306   void setDocSmoothParam(SimpleKLParameter::DocSmoothParam &docSmthParam);
00307   void setQueryModelParam(SimpleKLParameter::QueryModelParam &queryModParam);
00308 
00309 protected:
00310 
00312   double *mcNorm; 
00313   
00315   double *docProbMass; 
00317   int *uniqueTermCount; 
00319   UnigramLM *collectLM; 
00321   DocUnigramCounter *collectLMCounter; 
00323   SimpleKLScoreFunc *scFunc; 
00324 
00326 
00327 
00328   void computeMixtureFBModel(SimpleKLQueryModel &origRep, DocIDSet & relDocs);
00330   void computeDivMinFBModel(SimpleKLQueryModel &origRep, DocIDSet &relDocs);
00332   void computeMarkovChainFBModel(SimpleKLQueryModel &origRep, DocIDSet &relDocs) ;
00334   void computeRM1FBModel(SimpleKLQueryModel &origRep, DocIDSet & relDocs);
00336   void computeRM2FBModel(SimpleKLQueryModel &origRep, DocIDSet & relDocs);
00338 
00339   SimpleKLParameter::DocSmoothParam docParam;
00340   SimpleKLParameter::QueryModelParam qryParam;
00341 
00342 };
00343 
00344 
00345 inline  void SimpleKLRetMethod::setDocSmoothParam(SimpleKLParameter::DocSmoothParam &docSmthParam)
00346 {
00347   docParam = docSmthParam;
00348 }
00349 
00350 inline  void SimpleKLRetMethod::setQueryModelParam(SimpleKLParameter::QueryModelParam &queryModParam)
00351 {
00352   qryParam = queryModParam;
00353   // add a parameter to the score function.
00354   // isn't available in the constructor.
00355   scFunc->setScoreMethod(qryParam.adjScoreMethod);
00356 }
00357 
00358 #endif /* _SIMPLEKLRETMETHOD_HPP */
00359 
00360 
00361 
00362 
00363 
00364 
00365 
00366 

Generated on Fri Feb 6 07:11:49 2004 for LEMUR by doxygen 1.2.16