00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <iomanip>
00013 #include "Summarizer.hpp"
00014 #include "Passage.hpp"
00015 #include "BasicPassage.hpp"
00016 #include "InvFPIndex.hpp"
00017 #include <algorithm>
00018 #include <vector>
00019
00020 using std::vector;
00021
00022 #if (defined(WIN32) && !defined(min))
00023 #define min(x,y) __min(x,y)
00024 #endif
00025
00026 #ifndef _BASICSUMM_HPP
00027 #define _BASICSUMM_HPP
00028
// Term string that BasicSumm::isEOS() compares against to recognise an
// end-of-sentence marker in the index.
00029 #define EOS "*eos"
// Maximum number of terms BasicSumm::findNextPassage() puts into one passage
// when it is called with eos == 0 (fixed-length passages).
00030 #define PSG_LEN 15
00031
00036 class BasicSumm : public Summarizer {
00037
00038 private:
00039 InvFPIndex* idx;
00040 int summLen;
00041 vector<BasicPassage> doc;
00042 int iterCount;
00043
00044 public:
00046 BasicSumm(InvFPIndex* inIdx, int inSummLen = 5) {
00047 idx = inIdx;
00048 summLen = inSummLen;
00049 iterCount = 1;
00050 };
00051
00052 virtual void summDocument(const char* docID, const int optLen, const char* qInfo);
00053
00054 virtual void scorePassages(const char* qInfo);
00055
00056 virtual void markPassages(int optLen, char* qInfo);
00057
00058 virtual void addPassage(Passage &psg);
00059
00060 virtual void clear(void);
00061
00062 virtual int fetchPassages(Passage* psgs, int optLen);
00063
00064 virtual int nextPassage(Passage* psg);
00065
00066 virtual void iterClear(void);
00067
00068 virtual void outputSumm(void);
00069
00071 int isEOS(const char* check) {
00072 return !strcmp(check, EOS);
00073 }
00074
00076 int hasEOS(InvFPIndex* idx, TermInfoList* tList) {
00077 tList->startIteration();
00078 TermInfo* tEntry;
00079 while (tList->hasMore()) {
00080 tEntry = tList->nextEntry();
00081 if ( isEOS(idx->term(tEntry->id())) ) return true;
00082 }
00083 return false;
00084 }
00085
00087 double scorePassage(BasicPassage &psg, char* qInfo) {
00088 char* docID = psg.docID;
00089 passageVec psgV= *psg.getAsVector();
00090 double psgLen = psgV.size();
00091 double P = 1;
00092 double M = 1.5;
00093 double endScore, Tf, tf, idf, docLen, avgDocLen;
00094 endScore = 0.0;
00095 for (int i=0; i < psgLen; i++) {
00096 docLen = idx->docLength(idx->document(docID));
00097 avgDocLen = idx->docLengthAvg();
00098 tf = psgV[i].tf;
00099 Tf = tf / (tf + 0.5 + 1.5 * (docLen/avgDocLen) );
00100 idf = min(M, log((double)idx->docCount()/(double)idx->docCount(psgV[i].termID)));
00101 endScore += (Tf * idf * P);
00102 }
00103 endScore = endScore / 1+psgLen;
00104 psg.score = endScore;
00105 return endScore;
00106 }
00107
00109 void findNextPassage(BasicPassage &psg, InvFPIndex* idx,
00110 TermInfoList* tList, int eos) {
00111 TermInfo* tEntry;
00112 psg.clear();
00113 termCount* storage;
00114 if (eos) {
00115 while (tList->hasMore()) {
00116 tEntry = tList->nextEntry();
00117 if ( isEOS(idx->term(tEntry->id())) ) return;
00118 storage = new termCount;
00119 storage->termID = tEntry->id();
00120 storage->tf = tEntry->count();
00121 psg.addTerm(*storage);
00122 }
00123 } else {
00124 for(int i=0; i < PSG_LEN; i++) {
00125 if (tList->hasMore()) {
00126 tEntry = tList->nextEntry();
00127 storage = new termCount;
00128 storage->termID = tEntry->id();
00129 storage->tf = tEntry->count();
00130 psg.addTerm(*storage);
00131 } else {
00132 return;
00133 }
00134 }
00135 }
00136 return;
00137 }
00138
00140 void showPassage(passageVec* psg, InvFPIndex* idx) {
00141 for (int i=0; i < psg->size(); i++) {
00142 cout << idx->term((*psg)[i].termID) << " ";
00143 }
00144 }
00145
00147 void showMarkedPassages() {
00148
00149 for (int i=0; i<doc.size(); i++) {
00150 if (doc[i].marked > 0) {
00151 showPassage(doc[i].getAsVector(), idx);
00152 cout << endl;
00153 }
00154 }
00155 }
00156
00157 };
00158
00159 #endif