Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

XLingRetMethod.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  *
00003  *  Original source copyright (c) 2001, Carnegie Mellon University.
00004  *  See copyright.cmu for details.
00005  *  Modifications copyright (c) 2002, University of Massachusetts.
00006  *  See copyright.umass for details.
00007  *
00008  *==========================================================================
00009 */
00010 
00011 
00012 #ifndef _XLINGRETMETHOD_HPP
00013 #define _XLINGRETMETHOD_HPP
00014 
00015 #include "common_headers.hpp"
00016 #include <cmath>
00017 #include <vector>
00018 #include <algorithm>
00019 #include "FreqVector.hpp"
00020 #include "UnigramLM.hpp"
00021 #include "ScoreFunction.hpp"
00022 #include "XLingDocModel.hpp"
00023 #include "TextQueryRep.hpp"
00024 #include "TextQueryRetMethod.hpp"
00025 #include "Counter.hpp"
00026 #include "DocUnigramCounter.hpp"
00027 #include "PDict.hpp"
00028 #include "TextHandlerManager.hpp"
00029 
00030 class XLQueryTerm : public QueryTerm {
00031 public:
00032   XLQueryTerm(int tid, double  wt, const char *term, double pge,
00033               PDict &dic, Stemmer *stm = NULL) :
00034     QueryTerm(tid, wt), source(term), p_s_ge(pge), dict(dic),
00035     stemmer(stm) {
00036   }
00037 
00038   XLQueryTerm(const char *term, PDict &dic, Stemmer *stm = NULL) : 
00039     QueryTerm(0, 0), source(term), p_s_ge(0), dict(dic), stemmer(stm) {
00040   }
00041 
00042   XLQueryTerm(const XLQueryTerm &other) : QueryTerm(0,0), dict(other.dict) {
00043     ti = other.ti;
00044     w = other.w;
00045     p_s_ge = other.p_s_ge;
00046     source = other.source;
00047     stemmer = other.stemmer;
00048   }
00049 
00050 
00051   virtual ~XLQueryTerm() { }
00052 
00054   const string &getSource() const {return source;}
00055 
00057   const double getP_s_GE() const {
00058     return p_s_ge;
00059   }
00060   
00062   void setWeight(double wt) {
00063     w = wt;
00064   }
00065 
00067   void incWeight(double wt) {
00068     w += wt;
00069   }
00071   virtual bool operator==(const XLQueryTerm& other) const {
00072     return (other.source == source);
00073   }
00074 
00076   virtual XLQueryTerm& operator=(const XLQueryTerm& other)  {
00077     ti = other.ti;
00078     w = other.w;
00079     p_s_ge = other.p_s_ge;
00080     source = other.source;
00081     dict = other.dict;
00082     stemmer = other.stemmer;
00083     return (*this);
00084   }
00089   DictEntryVector *getTranslations() const {
00090     DictEntryVector *xlates = dict.getTranslations(source);
00091     // If no xlates, Leah's version stems the term and tries again.
00092     if (xlates == NULL && stemmer != NULL) {
00093       // porter stemmer is destructive
00094       char tmpTerm[512];
00095       strcpy(tmpTerm, source.c_str());
00096       string stem = stemmer->stemWord(tmpTerm);
00097       cerr << "getTranslations: stemming " << source << " to " << stem 
00098            << endl;
00099       xlates = dict.getTranslations(stem);
00100     }
00101     return xlates;
00102   }
00103 private:
00104   string source;
00105   double p_s_ge;
00106   PDict &dict;  
00107   Stemmer *stemmer;
00108 };
00109 
00110 
00112 class XLingQueryModel : public QueryRep {
00113 public:
00122   XLingQueryModel(const TermQuery &qry, const Index &source, 
00123                   bool dbS, double numSource,
00124                   PDict &dict, const Stopper *stp = NULL, 
00125                   Stemmer *stm = NULL) {
00126     // fill in weighted terms
00127     // P(e|GE)
00128     double pge;
00129     numTerms = 0;
00130     
00131     qry.startTermIteration();
00132     while (qry.hasMore()) {
00133       const Term *t = qry.nextTerm();
00134       // if Stopper is not NULL, test for stopwords.
00135       if (stp == NULL || !(stp->stopWord(t->spelling()))) {
00136         numTerms++;
00137         XLQueryTerm st(t->spelling(), dict, stm);
00138         iter = find(qTerms.begin(), qTerms.end(), st);
00139         if (iter != qTerms.end()) {
00140           // found it, bump count
00141           (*iter).incWeight(1);
00142         } else {
00143           // new term
00144           int ti = source.term(t->spelling());
00145           if (ti>0) {
00146             // pge
00147             if (dbS) {
00148               pge = source.docCount(ti)/numSource;
00149             } else {
00150               pge = (source.termCount(ti)/numSource);      
00151             }
00152           } else {
00153             // OOV, use default pge
00154             // perhaps this would be better estimated with:
00155             //    pge = 1/(numSource + 1);
00156             pge = (0.000001*0.000001);
00157           }
00158           XLQueryTerm newTerm(ti, 1, t->spelling(), pge, dict, stm);
00159           qTerms.push_back(newTerm);
00160         }
00161       } else {
00162         cerr << "XLingQueryModel: " << t->spelling() 
00163              << " on stoplist, ignoring" << endl;
00164       }
00165       
00166     }
00167   }
00168   
00169   virtual ~XLingQueryModel() {
00170   }
00171 
00173   virtual void startIteration() const {
00174     iter = qTerms.begin();
00175   }
00177   virtual bool hasMore() const {
00178     return (iter != qTerms.end());
00179   }
00181   //  virtual XLQueryTerm &nextTerm() {
00182   virtual XLQueryTerm &nextTerm() const {
00183     return (*iter++);
00184   }
00185   virtual int getNumTerms() const {return numTerms;}
00186   
00187 private:
00188   mutable vector<XLQueryTerm> qTerms;
00189   mutable vector<XLQueryTerm>::iterator iter;
00190   int numTerms;
00191 };
00192 
00193 // Should not really be a TextQueryRetMethod, as it does not score
00194 // in a like fashion, but it does take advantage of the cached doc reps.
00195 //
00202 //class XLingRetMethod : public TextQueryRetMethod {
00203 class XLingRetMethod : public RetrievalMethod {
00204 public:
00205 
00218   XLingRetMethod(const Index &dbIndex, const Index &background, 
00219                  PDict &dict, ScoreAccumulator &accumulator, 
00220                  double l, double b, bool cacheDR,
00221                  string &sBM, string &tBM, 
00222                  const Stopper *stp = NULL, Stemmer *stm = NULL);
00224   virtual ~XLingRetMethod();
00225   
00229   virtual DocumentRep *computeDocRep(int docID);
00230 
00237   virtual double matchedTermWeight(int id, double weight,
00238                                    const DocInfo *info, 
00239                                    const DocumentRep *dRep) const { 
00240     double d = dRep->termWeight(id,info); //P(a|D)
00241     double score = d * weight; //P(a|D) * P(e|a)
00242     return score;
00243   }
00244 
00249   virtual double adjustedScore(double origScore, double pge) const {
00250     return (log((lambda * origScore) + ((1 - lambda) * pge)));
00251   }
00252 
00253   virtual void scoreCollection(const QueryRep &qry, 
00254                                IndexedRealVector &results){
00255     scoreInvertedIndex(qry, results);
00256   }
00257   // Override (have to do individual doc ones too.
00258   virtual void scoreInvertedIndex(const QueryRep &qryRep, 
00259                                   IndexedRealVector &scores, 
00260                                   bool scoreAll = false);
00261 
00262   virtual QueryRep *computeQueryRep(const Query &qry) {
00263     if (const TermQuery *q = dynamic_cast<const TermQuery *>(&qry))
00264       return (new XLingQueryModel(*q, source, docBasedSourceSmooth, numSource,
00265                                   dictionary, stopper, stemmer));
00266     else LEMUR_THROW(LEMUR_RUNTIME_ERROR, "XLingRetMethod expects query of type TermQuery");
00267   } 
00268 
00269   virtual QueryRep *computeTargetKLRep(const QueryRep *qry);
00270 
00272   virtual double scoreDoc(const QueryRep &qry, int docID);
00273 
00275   virtual void updateQuery(QueryRep &qryRep, const DocIDSet &relDocs) {}
00276 
00277 protected:
00278   virtual double scoreDocVector(const XLingQueryModel &qRep, int docID, 
00279                                 FreqVector &docVector);
00280 
00281   double lambda;
00282   double beta;
00283   double numSource;
00284   double numTarget;
00285   bool docBasedSourceSmooth;
00286   bool docBasedTargetSmooth;
00287   ScoreAccumulator &scAcc; // this does not need to be passed in. Bleah.
00288   PDict &dictionary;
00289   Stemmer *stemmer; // source language
00290   const Stopper *stopper; // source language
00291   const Index &source;
00293   DocumentRep **docReps;
00295   bool cacheDocReps;
00297   int docRepsSize;
00298   ScoreAccumulator *termScores;
00299 };
00300 
00301 #endif /* _XLINGRETMETHOD_HPP */

Generated on Fri Jul 2 16:25:37 2004 for Lemur Toolkit by doxygen1.2.18