Main Page   Namespace List   Class Hierarchy   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

BasicSumm.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2002 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.cs.cmu.edu/~lemur/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 #include <iomanip>
00013 #include "Summarizer.hpp"
00014 #include "Passage.hpp"
00015 #include "BasicPassage.hpp"
00016 #include "InvFPIndex.hpp"
00017 #include <algorithm>
00018 #include <vector>
00019 
00020 using std::vector;
00021 
00022 #if (defined(WIN32) && !defined(min))  // WIN32 build where no min macro has been defined yet
00023 #define min(x,y) __min(x,y)  // lets the unqualified min(...) call in BasicSumm::scorePassage resolve to __min
00024 #endif
00025 
00026 #ifndef _BASICSUMM_HPP
00027 #define _BASICSUMM_HPP
00028 
00029 #define EOS      "*eos"  // term string that BasicSumm::isEOS compares against; findNextPassage ends a passage when it reads it
00030 #define PSG_LEN  15  // maximum number of terms findNextPassage puts in one passage when it is called with eos == 0
00031 
00036 class BasicSumm : public Summarizer {
00037 
00038 private:
00039   InvFPIndex* idx;
00040   int summLen;
00041   vector<BasicPassage> doc;
00042   int iterCount;
00043 
00044 public:
00046   BasicSumm(InvFPIndex* inIdx, int inSummLen = 5) {
00047     idx = inIdx;
00048     summLen = inSummLen;
00049     iterCount = 1;
00050   };
00051 
00052   virtual void summDocument(const char* docID, const int optLen, const char* qInfo);
00053 
00054   virtual void scorePassages(const char* qInfo);
00055 
00056   virtual void markPassages(int optLen, char* qInfo);
00057 
00058   virtual void addPassage(Passage &psg);
00059 
00060   virtual void clear(void);
00061 
00062   virtual int fetchPassages(Passage* psgs, int optLen);
00063 
00064   virtual int nextPassage(Passage* psg);
00065 
00066   virtual void iterClear(void);
00067 
00068   virtual void outputSumm(void);
00069 
00071   int isEOS(const char* check) {
00072     return !strcmp(check, EOS);
00073   }
00074 
00076   int hasEOS(InvFPIndex* idx, TermInfoList* tList) {
00077     tList->startIteration();
00078     TermInfo* tEntry;
00079     while (tList->hasMore()) {
00080       tEntry = tList->nextEntry();
00081       if ( isEOS(idx->term(tEntry->id())) ) return true;
00082     }
00083     return false;
00084   }
00085 
00087   double scorePassage(BasicPassage &psg, char* qInfo) {
00088     char* docID = psg.docID;
00089     passageVec psgV= *psg.getAsVector();
00090     double psgLen = psgV.size();
00091     double P = 1;  // no markup yet, all get same weight
00092     double M = 1.5;
00093     double endScore, Tf, tf, idf, docLen, avgDocLen;
00094     endScore = 0.0;
00095     for (int i=0; i < psgLen; i++) {
00096       docLen = idx->docLength(idx->document(docID));
00097       avgDocLen = idx->docLengthAvg();
00098       tf = psgV[i].tf;
00099       Tf = tf / (tf + 0.5 + 1.5 * (docLen/avgDocLen) );
00100       idf = min(M, log((double)idx->docCount()/(double)idx->docCount(psgV[i].termID))); 
00101       endScore += (Tf * idf * P);
00102     }
00103     endScore = endScore / 1+psgLen;
00104     psg.score = endScore;
00105     return endScore;
00106   }
00107 
00109   void findNextPassage(BasicPassage &psg, InvFPIndex* idx, 
00110                        TermInfoList* tList, int eos) {
00111     TermInfo* tEntry;
00112     psg.clear();
00113     termCount* storage;
00114     if (eos) {
00115       while (tList->hasMore()) {
00116         tEntry = tList->nextEntry();
00117         if ( isEOS(idx->term(tEntry->id())) ) return;
00118         storage = new termCount;
00119         storage->termID = tEntry->id();
00120         storage->tf = tEntry->count();
00121         psg.addTerm(*storage);
00122       }
00123     } else {
00124       for(int i=0; i < PSG_LEN; i++) {
00125         if (tList->hasMore()) {
00126           tEntry = tList->nextEntry();
00127           storage = new termCount;
00128           storage->termID = tEntry->id();
00129           storage->tf = tEntry->count();
00130           psg.addTerm(*storage);
00131         } else {
00132           return;
00133         }
00134       }
00135     }
00136     return;
00137   }
00138  
00140   void showPassage(passageVec* psg, InvFPIndex* idx) {
00141     for (int i=0; i < psg->size(); i++) {
00142       cout << idx->term((*psg)[i].termID) << " ";
00143     }
00144   }
00145 
00147   void showMarkedPassages() {
00148     
00149     for (int i=0; i<doc.size(); i++) {
00150       if (doc[i].marked > 0) {
00151         showPassage(doc[i].getAsVector(), idx);
00152         cout << endl;
00153       }
00154     }
00155   }
00156 
00157 }; // BasicSumm
00158 
00159 #endif

Generated on Fri Feb 6 07:11:45 2004 for LEMUR by doxygen1.2.16