00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _SIMPLEKLRETMETHOD_HPP
00013 #define _SIMPLEKLRETMETHOD_HPP
00014
00015 #include <cmath>
00016 #include "UnigramLM.hpp"
00017 #include "ScoreFunction.hpp"
00018 #include "SimpleKLDocModel.hpp"
00019 #include "TextQueryRep.hpp"
00020 #include "TextQueryRetMethod.hpp"
00021 #include "Counter.hpp"
00022 #include "DocUnigramCounter.hpp"
00023
00025
00026 class SimpleKLQueryModel : public ArrayQueryRep {
00027 public:
00029 SimpleKLQueryModel(TextQuery &qry, Index &dbIndex) :
00030 ArrayQueryRep(dbIndex.termCountUnique()+1, qry, dbIndex), qm(NULL),
00031 ind(dbIndex), colKLComputed(false) {
00032 startIteration();
00033 colQLikelihood = 0;
00034
00035 int tc = ind.termCount();
00036 while (hasMore()) {
00037 QueryTerm *qt = nextTerm();
00038 int id = qt->id();
00039 double qtf = qt->weight();
00040 int qtcf = ind.termCount(id);
00041 double s = qtf * log((double)qtcf/(double)tc);
00042 colQLikelihood += s;
00043 delete qt;
00044 }
00045
00046 }
00047
00049 SimpleKLQueryModel(Index &dbIndex) :
00050 ArrayQueryRep(dbIndex.termCountUnique()+1), qm(NULL), ind(dbIndex),
00051 colKLComputed(false) {
00052 colQLikelihood = 0;
00053 startIteration();
00054 while (hasMore()) {
00055 QueryTerm *qt = nextTerm();
00056 setCount(qt->id(), 0);
00057 delete qt;
00058 }
00059 }
00060
00061
00062 virtual ~SimpleKLQueryModel(){ if (qm) delete qm;}
00063
00064
00066
00073 virtual void interpolateWith(UnigramLM &qModel, double origModCoeff,
00074 int howManyWord, double prSumThresh=1,
00075 double prThresh=0);
00076 virtual double scoreConstant() {
00077 return totalCount();
00078 }
00079
00081 virtual void load(istream &is);
00082
00084 virtual void save(ostream &os);
00085
00087 virtual void clarity(ostream &os);
00089 virtual double clarity();
00090
00091 #if 0
00092
00093 double colDivergence() {
00094 if (colKLComputed) {
00095 return colKL;
00096 } else {
00097 colKLComputed = true;
00098 double d=0;
00099 startIteration();
00100 while (hasMore()) {
00101 QueryTerm *qt=nextTerm();
00102 double pr = qt->weight()/(double)totalCount();
00103 double colPr = (ind.termCount(qt->id())+1)/(double)(ind.termCount()+ind.termCountUnique());
00104 d += pr*log(pr/colPr);
00105 delete qt;
00106
00107 }
00108 colKL=d;
00109 return d;
00110 }
00111 }
00112 #endif
00113
00114 double colDivergence() {
00115 if (colKLComputed) {
00116 return colKL;
00117 } else {
00118 colKLComputed = true;
00119 double d=0;
00120 startIteration();
00121 while (hasMore()) {
00122 QueryTerm *qt=nextTerm();
00123 double pr = qt->weight()/(double)totalCount();
00124
00125 double colPr = ((double)ind.termCount(qt->id())/(double)(ind.termCount()));
00126 d += pr*log(pr/colPr);
00127 delete qt;
00128
00129 }
00130 colKL=d;
00131 return d;
00132 }
00133 }
00134
00135
00137 double KLDivergence(UnigramLM &refMod) {
00138 double d=0;
00139 startIteration();
00140 while (hasMore()) {
00141 QueryTerm *qt=nextTerm();
00142 double pr = qt->weight()/(double)totalCount();
00143 d += pr*log(pr/refMod.prob(qt->id()));
00144 delete qt;
00145 }
00146 return d;
00147 }
00148
00149 double colQueryLikelihood() {
00150 return colQLikelihood;
00151 }
00152
00153
00154 protected:
00155
00156 double colQLikelihood;
00157
00158 double colKL;
00159 bool colKLComputed;
00160
00161 IndexedRealVector *qm;
00162 Index &ind;
00163 };
00164
00165
00166
00168
00183 class SimpleKLScoreFunc : public ScoreFunction {
00184 public:
00185 enum SimpleKLParameter::adjustedScoreMethods adjScoreMethod;
00186 void setScoreMethod(enum SimpleKLParameter::adjustedScoreMethods adj) {
00187 adjScoreMethod = adj;
00188 }
00189 virtual double matchedTermWeight(QueryTerm *qTerm, TextQueryRep *qRep, DocInfo *info, DocumentRep *dRep) {
00190 double w = qTerm->weight();
00191 double d = dRep->termWeight(qTerm->id(),info);
00192 double l = log(d);
00193 double score = w*l;
00194
00195
00196
00197
00198 return score;
00199
00200 }
00202 virtual double adjustedScore(double origScore, TextQueryRep *qRep,
00203 DocumentRep *dRep) {
00204 SimpleKLQueryModel *qm = (SimpleKLQueryModel *)qRep;
00205
00206 SimpleKLDocModel *dm = (SimpleKLDocModel *)dRep;
00207
00208
00209 double qsc = qm->scoreConstant();
00210 double dsc = log(dm->scoreConstant());
00211 double cql = qm->colQueryLikelihood();
00212
00213 double s = dsc * qsc + origScore + cql;
00214 double qsNorm = origScore/qsc;
00215 double qmD = qm->colDivergence();
00217 switch (adjScoreMethod) {
00218 case SimpleKLParameter::QUERYLIKELIHOOD:
00220
00221
00222
00223
00224
00225 return s;
00226
00227 case SimpleKLParameter::CROSSENTROPY:
00229
00230 assert(qm->scoreConstant()!=0);
00231
00232
00233 s = qsNorm + dsc + cql/qsc;
00234 return (s);
00235 case SimpleKLParameter::NEGATIVEKLD:
00237
00238 assert(qm->scoreConstant()!=0);
00239 s = qsNorm + dsc - qmD;
00240
00241
00242
00243 return s;
00244
00245
00246 }
00247 }
00248
00249 #if 0
00250
00251 virtual double adjustedScore(double origScore, TextQueryRep *qRep, DocumentRep *dRep) {
00252 SimpleKLQueryModel *qm = (SimpleKLQueryModel *)qRep;
00253
00254 SimpleKLDocModel *dm = (SimpleKLDocModel *)dRep;
00255
00256
00258
00260
00261
00262
00264
00265
00266
00267
00269
00270 assert(qm->scoreConstant()!=0);
00271 return (origScore/qm->scoreConstant() + log(dm->scoreConstant())
00272 - qm->colDivergence());
00273
00274
00275 }
00276 #endif
00277 };
00278
00279
00280
00281
00283
00284
00285 class SimpleKLRetMethod : public TextQueryRetMethod {
00286 public:
00287
00289 SimpleKLRetMethod(Index &dbIndex, const char *supportFileName, ScoreAccumulator &accumulator);
00290 virtual ~SimpleKLRetMethod();
00291
00292 virtual TextQueryRep *computeTextQueryRep(TextQuery &qry) {
00293 return (new SimpleKLQueryModel(qry, ind));
00294 }
00295
00296 virtual DocumentRep *computeDocRep(int docID);
00297
00298
00299 virtual ScoreFunction *scoreFunc() {
00300 return (scFunc);
00301 }
00302
00303
00304 virtual void updateTextQuery(TextQueryRep &origRep, DocIDSet &relDocs);
00305
00306 void setDocSmoothParam(SimpleKLParameter::DocSmoothParam &docSmthParam);
00307 void setQueryModelParam(SimpleKLParameter::QueryModelParam &queryModParam);
00308
00309 protected:
00310
00312 double *mcNorm;
00313
00315 double *docProbMass;
00317 int *uniqueTermCount;
00319 UnigramLM *collectLM;
00321 DocUnigramCounter *collectLMCounter;
00323 SimpleKLScoreFunc *scFunc;
00324
00326
00327
00328 void computeMixtureFBModel(SimpleKLQueryModel &origRep, DocIDSet & relDocs);
00330 void computeDivMinFBModel(SimpleKLQueryModel &origRep, DocIDSet &relDocs);
00332 void computeMarkovChainFBModel(SimpleKLQueryModel &origRep, DocIDSet &relDocs) ;
00334 void computeRM1FBModel(SimpleKLQueryModel &origRep, DocIDSet & relDocs);
00336 void computeRM2FBModel(SimpleKLQueryModel &origRep, DocIDSet & relDocs);
00338
00339 SimpleKLParameter::DocSmoothParam docParam;
00340 SimpleKLParameter::QueryModelParam qryParam;
00341
00342 };
00343
00344
00345 inline void SimpleKLRetMethod::setDocSmoothParam(SimpleKLParameter::DocSmoothParam &docSmthParam)
00346 {
00347 docParam = docSmthParam;
00348 }
00349
00350 inline void SimpleKLRetMethod::setQueryModelParam(SimpleKLParameter::QueryModelParam &queryModParam)
00351 {
00352 qryParam = queryModParam;
00353
00354
00355 scFunc->setScoreMethod(qryParam.adjScoreMethod);
00356 }
00357
00358 #endif
00359
00360
00361
00362
00363
00364
00365
00366