00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _XLINGRETMETHOD_HPP
00013 #define _XLINGRETMETHOD_HPP
00014
00015 #include "common_headers.hpp"
00016 #include <cmath>
00017 #include <vector>
00018 #include <algorithm>
00019 #include "FreqVector.hpp"
00020 #include "UnigramLM.hpp"
00021 #include "ScoreFunction.hpp"
00022 #include "XLingDocModel.hpp"
00023 #include "TextQueryRep.hpp"
00024 #include "TextQueryRetMethod.hpp"
00025 #include "Counter.hpp"
00026 #include "DocUnigramCounter.hpp"
00027 #include "PDict.hpp"
00028 #include "TextHandlerManager.hpp"
00029
00030 class XLQueryTerm : public QueryTerm {
00031 public:
00032 XLQueryTerm(int tid, double wt, const char *term, double pge,
00033 PDict &dic, Stemmer *stm = NULL) :
00034 QueryTerm(tid, wt), source(term), p_s_ge(pge), dict(dic),
00035 stemmer(stm) {
00036 }
00037
00038 XLQueryTerm(const char *term, PDict &dic, Stemmer *stm = NULL) :
00039 QueryTerm(0, 0), source(term), p_s_ge(0), dict(dic), stemmer(stm) {
00040 }
00041
00042 XLQueryTerm(const XLQueryTerm &other) : QueryTerm(0,0), dict(other.dict) {
00043 ti = other.ti;
00044 w = other.w;
00045 p_s_ge = other.p_s_ge;
00046 source = other.source;
00047 stemmer = other.stemmer;
00048 }
00049
00050
00051 virtual ~XLQueryTerm() { }
00052
00054 const string &getSource() const {return source;}
00055
00057 const double getP_s_GE() const {
00058 return p_s_ge;
00059 }
00060
00062 void setWeight(double wt) {
00063 w = wt;
00064 }
00065
00067 void incWeight(double wt) {
00068 w += wt;
00069 }
00071 virtual bool operator==(const XLQueryTerm& other) const {
00072 return (other.source == source);
00073 }
00074
00076 virtual XLQueryTerm& operator=(const XLQueryTerm& other) {
00077 ti = other.ti;
00078 w = other.w;
00079 p_s_ge = other.p_s_ge;
00080 source = other.source;
00081 dict = other.dict;
00082 stemmer = other.stemmer;
00083 return (*this);
00084 }
00089 DictEntryVector *getTranslations() const {
00090 DictEntryVector *xlates = dict.getTranslations(source);
00091
00092 if (xlates == NULL && stemmer != NULL) {
00093
00094 char tmpTerm[512];
00095 strcpy(tmpTerm, source.c_str());
00096 string stem = stemmer->stemWord(tmpTerm);
00097 cerr << "getTranslations: stemming " << source << " to " << stem
00098 << endl;
00099 xlates = dict.getTranslations(stem);
00100 }
00101 return xlates;
00102 }
00103 private:
00104 string source;
00105 double p_s_ge;
00106 PDict &dict;
00107 Stemmer *stemmer;
00108 };
00109
00110
00112 class XLingQueryModel : public QueryRep {
00113 public:
00122 XLingQueryModel(const TermQuery &qry, const Index &source,
00123 bool dbS, double numSource,
00124 PDict &dict, const Stopper *stp = NULL,
00125 Stemmer *stm = NULL) {
00126
00127
00128 double pge;
00129 numTerms = 0;
00130
00131 qry.startTermIteration();
00132 while (qry.hasMore()) {
00133 const Term *t = qry.nextTerm();
00134
00135 if (stp == NULL || !(stp->stopWord(t->spelling()))) {
00136 numTerms++;
00137 XLQueryTerm st(t->spelling(), dict, stm);
00138 iter = find(qTerms.begin(), qTerms.end(), st);
00139 if (iter != qTerms.end()) {
00140
00141 (*iter).incWeight(1);
00142 } else {
00143
00144 int ti = source.term(t->spelling());
00145 if (ti>0) {
00146
00147 if (dbS) {
00148 pge = source.docCount(ti)/numSource;
00149 } else {
00150 pge = (source.termCount(ti)/numSource);
00151 }
00152 } else {
00153
00154
00155
00156 pge = (0.000001*0.000001);
00157 }
00158 XLQueryTerm newTerm(ti, 1, t->spelling(), pge, dict, stm);
00159 qTerms.push_back(newTerm);
00160 }
00161 } else {
00162 cerr << "XLingQueryModel: " << t->spelling()
00163 << " on stoplist, ignoring" << endl;
00164 }
00165
00166 }
00167 }
00168
00169 virtual ~XLingQueryModel() {
00170 }
00171
00173 virtual void startIteration() const {
00174 iter = qTerms.begin();
00175 }
00177 virtual bool hasMore() const {
00178 return (iter != qTerms.end());
00179 }
00181
00182 virtual XLQueryTerm &nextTerm() const {
00183 return (*iter++);
00184 }
00185 virtual int getNumTerms() const {return numTerms;}
00186
00187 private:
00188 mutable vector<XLQueryTerm> qTerms;
00189 mutable vector<XLQueryTerm>::iterator iter;
00190 int numTerms;
00191 };
00192
00193
00194
00195
00202
00203 class XLingRetMethod : public RetrievalMethod {
00204 public:
00205
00218 XLingRetMethod(const Index &dbIndex, const Index &background,
00219 PDict &dict, ScoreAccumulator &accumulator,
00220 double l, double b, bool cacheDR,
00221 string &sBM, string &tBM,
00222 const Stopper *stp = NULL, Stemmer *stm = NULL);
00224 virtual ~XLingRetMethod();
00225
00229 virtual DocumentRep *computeDocRep(int docID);
00230
00237 virtual double matchedTermWeight(int id, double weight,
00238 const DocInfo *info,
00239 const DocumentRep *dRep) const {
00240 double d = dRep->termWeight(id,info);
00241 double score = d * weight;
00242 return score;
00243 }
00244
00249 virtual double adjustedScore(double origScore, double pge) const {
00250 return (log((lambda * origScore) + ((1 - lambda) * pge)));
00251 }
00252
00253 virtual void scoreCollection(const QueryRep &qry,
00254 IndexedRealVector &results){
00255 scoreInvertedIndex(qry, results);
00256 }
00257
00258 virtual void scoreInvertedIndex(const QueryRep &qryRep,
00259 IndexedRealVector &scores,
00260 bool scoreAll = false);
00261
00262 virtual QueryRep *computeQueryRep(const Query &qry) {
00263 if (const TermQuery *q = dynamic_cast<const TermQuery *>(&qry))
00264 return (new XLingQueryModel(*q, source, docBasedSourceSmooth, numSource,
00265 dictionary, stopper, stemmer));
00266 else LEMUR_THROW(LEMUR_RUNTIME_ERROR, "XLingRetMethod expects query of type TermQuery");
00267 }
00268
00269 virtual QueryRep *computeTargetKLRep(const QueryRep *qry);
00270
00272 virtual double scoreDoc(const QueryRep &qry, int docID);
00273
00275 virtual void updateQuery(QueryRep &qryRep, const DocIDSet &relDocs) {}
00276
00277 protected:
00278 virtual double scoreDocVector(const XLingQueryModel &qRep, int docID,
00279 FreqVector &docVector);
00280
00281 double lambda;
00282 double beta;
00283 double numSource;
00284 double numTarget;
00285 bool docBasedSourceSmooth;
00286 bool docBasedTargetSmooth;
00287 ScoreAccumulator &scAcc;
00288 PDict &dictionary;
00289 Stemmer *stemmer;
00290 const Stopper *stopper;
00291 const Index &source;
00293 DocumentRep **docReps;
00295 bool cacheDocReps;
00297 int docRepsSize;
00298 ScoreAccumulator *termScores;
00299 };
00300
00301 #endif