00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _LEMUR_KEYFILE_INCINDEX_HPP
00014 #define _LEMUR_KEYFILE_INCINDEX_HPP
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "common_headers.hpp"
00025 #include "Index.hpp"
00026 #include "InvFPDocList.hpp"
00027 #include "InvFPTermList.hpp"
00028 #include "InvFPTypes.hpp"
00029 #include "Param.hpp"
00030 #include "PushIndex.hpp"
00031 #include "MemCache.hpp"
00032 #include "Keyfile.hpp"
00033 #include "KeyfileDocMgr.hpp"
00034 #include "ReadBuffer.hpp"
00035 #include "WriteBuffer.hpp"
00036 #include "TermCache.hpp"
00037 #include <cstring>
00038 #include <queue>
00039
00040
// Slot numbers for the KeyfileIncIndex::counts array. The inline accessors in
// this header read counts[UNIQUE_TERMS], counts[TOTAL_TERMS] and counts[DOCS];
// DT_FILES and INV_FILES are presumably further slots of the same array --
// TODO confirm against the implementation file.
00041 #define UNIQUE_TERMS 0
00042 #define TOTAL_TERMS 1
00043 #define DOCS 2
00044 #define DT_FILES 3
00045 #define INV_FILES 4
// Sizes of the fixed char buffers KeyfileIncIndex::docKey (MAX_DOCID_LENGTH)
// and KeyfileIncIndex::termKey (MAX_TERM_LENGTH).
00046 #define MAX_DOCID_LENGTH 256
00047 #define MAX_TERM_LENGTH 256
00048
// Number of SegmentOffset entries held in each KeyfileIncIndex::TermData.
00049 #define KEYFILE_MAX_SEGMENTS (16)
00050
00051
00052
/// KeyfileIncIndex derives publicly from both PushIndex and Index, so one object
/// offers the beginDoc / addTerm / endDoc / endCollection calls as well as the
/// lookup calls (term, document, docInfoList, termInfoList, ...) declared below.
/// Only the three inline count accessors are defined in this header; every other
/// member function is declared here and defined elsewhere.
/// NOTE(review): the five-digit number at the start of every line looks like
/// residue from a generated source listing; it is not valid C++ and presumably
/// has to be stripped before this header can be compiled -- confirm the origin
/// of this copy.
00065 class KeyfileIncIndex : public PushIndex, public Index {
00066 public:
/// Value type returned by fetchDocumentRecord().
/// NOTE(review): the field meanings below are inferred from names only -- confirm.
00068 class record {
00069 public:
/// Presumably the position of the document's data within a file.
00071 File::offset_type offset;
/// Presumably the document length.
00073 int len;
/// Presumably a second, "total" document length (see totaldocLength()).
00075 int totalLen;
/// Purpose not evident from this header -- TODO document from the implementation.
00077 int num;
00078 };
/// One entry of TermData::segments: a segment number, a length and an offset.
/// NOTE(review): presumably locates one piece of a term's data inside one of
/// the files held in _segments -- confirm.
00080 struct SegmentOffset {
00082 unsigned int segment;
00084 unsigned int length;
00086 File::offset_type offset;
00087 };
/// Per-term aggregate: two counters plus a fixed array of
/// KEYFILE_MAX_SEGMENTS SegmentOffset entries.
00089 struct TermData {
/// Presumably the total number of occurrences of the term -- confirm.
00091 unsigned int totalCount;
/// Presumably the number of documents containing the term -- confirm.
00093 unsigned int documentCount;
00095 SegmentOffset segments[ KEYFILE_MAX_SEGMENTS ];
00096 };
/// Constructor taking an optional index name; the default argument is a null
/// pointer, so this also acts as the default constructor.
/// NOTE(review): not marked explicit, so a const char* converts implicitly
/// to KeyfileIncIndex.
00098 KeyfileIncIndex(const char* indexName = 0);
/// Constructor taking a (non-const) name prefix, a cache size defaulting to
/// 128000000 and a starting document id defaulting to 1.
/// NOTE(review): cachesize is presumably in bytes -- confirm.
00101 KeyfileIncIndex(char* prefix, int cachesize=128000000,
00102 DOCID_T startdocid=1);
/// Destructor. Not declared virtual here; whether it is virtual depends on the
/// base-class destructors, which are not visible in this file.
00104 ~KeyfileIncIndex();
00105
/// Sets the name from the given prefix (takes a non-const char*).
00107 void setName(char* prefix);
00108
/// Document-building calls. They take DocumentProps / Term / CollectionProps
/// objects declared in other headers.
/// NOTE(review): presumably these implement the PushIndex interface -- confirm
/// against PushIndex.hpp.
00110 bool beginDoc(DocumentProps* dp);
00111
00113 bool addTerm(Term& t);
00114
00116 void endDoc(DocumentProps* dp);
00117
/// Overload of endDoc that additionally receives a manager identifier string;
/// declared virtual so derived classes can replace it.
00119 virtual void endDoc(DocumentProps* dp, const char* mgr);
00120
00122 void endCollection(CollectionProps* cp);
00123
/// Selects a document manager by its identifier string.
/// NOTE(review): presumably updates curdocmgr -- confirm.
00125 void setDocManager(const char* mgrID);
00126
00127 protected:
/// Internal helpers for opening the index and writing out its table of
/// contents, cache and document-manager ids. Behaviour is defined in the
/// implementation file; only the signatures are visible here.
00129 bool tryOpen();
00131 void writeTOC();
/// lastRun defaults to false.
00133 void writeCache( bool lastRun = false );
00135 void lastWriteCache();
00136
00138 void mergeCacheSegments();
00140 void writeCacheSegment();
00142 void writeDocMgrIDs();
/// Maps a manager identifier string to an int id.
00145 int docMgrID(const char* mgr);
/// Virtual hook taking the document properties and an int manager id.
/// NOTE(review): presumably the common implementation behind both endDoc
/// overloads -- confirm.
00147 virtual void doendDoc(DocumentProps* dp, int mgrid);
00149 int listlengths;
00150
00151 public:
00153
00154
/// Opens the index identified by indexName; returns a bool status.
00156 bool open(const char* indexName);
00158
00160
00161
/// Conversions between term spellings / external document ids (strings) and
/// their int ids, in both directions.
00163 int term(const char* word);
00164
00166 const char* term(int termID);
00167
00169 int document(const char* docIDStr);
00170
00172 const char* document(int docID);
00173
/// Returns the DocumentManager associated with the given document id.
00175 DocumentManager *docManager(int docID);
00176
00178
00180
00181
/// Returns counts[DOCS]. Non-const, unlike termCount() below.
/// The ';' after the inline body is a redundant empty declaration.
00183 int docCount() { return counts[DOCS]; };
00184
/// Returns counts[UNIQUE_TERMS]. Non-const.
00186 int termCountUnique() { return counts[UNIQUE_TERMS]; };
00187
/// Per-term overload of termCount; defined elsewhere.
00189 int termCount(int termID) const;
00190
/// Returns counts[TOTAL_TERMS].
00192 int termCount() const { return counts[TOTAL_TERMS]; };
00193
/// Returns a float average document length.
/// NOTE(review): presumably related to aveDocLen -- confirm.
00195 float docLengthAvg();
00196
/// Per-term overload of docCount; defined elsewhere.
00198 int docCount(int termID);
00199
/// Document-length queries keyed by document id. Note the mixed parameter
/// types (DOCID_T vs int) and mixed const-ness across the three.
00201 int docLength(DOCID_T docID) const;
00202
00204 virtual int totaldocLength (int docID) const;
00205
00207 int docLengthCounted(int docID);
00208
00210
00212
00213
/// Return pointers to list objects for a term or a document.
/// NOTE(review): ownership of the returned pointers is not stated in this
/// header -- confirm whether the caller must delete them.
00214 DocInfoList* docInfoList(int termID);
00215
00217 TermInfoList* termInfoList(int docID);
00219 TermInfoList* termInfoListSeq(int docID);
00220
00222
/// Takes an ostream pointer; presumably stored in msgstream -- confirm.
/// The unqualified 'ostream' relies on a using-declaration or directive
/// supplied by an included header.
00224 void setMesgStream(ostream * lemStream);
/// Term-insertion helpers, public in this class.
/// NOTE(review): the int results of addUnknownTerm / addUncachedTerm are
/// presumably term ids -- confirm.
00226 void addKnownTerm( int termID, int position );
00228 int addUnknownTerm( InvFPTerm* term );
00230 int addUncachedTerm( InvFPTerm* term );
00231
00232 protected:
/// Helpers for opening / creating the underlying storage; defined elsewhere.
00234 void openDBs();
00236 void openSegments();
00238 void createDBs();
00239
00241 void fullToc();
00243 bool docMgrIDs();
/// Returns a record (see the nested class above) by value for the given key.
00245 record fetchDocumentRecord( int key ) const;
/// Lookup-table insertion helpers. Note that the parameter named termKey in
/// addTermLookup hides the data member termKey within that declaration.
00247 void addDocumentLookup( int documentKey, const char* documentName );
00249 void addTermLookup( int termKey, const char* termSpelling );
/// Takes two Keyfile references (number->name and name->number, per the
/// parameter names) plus the number/name pair.
00251 void addGeneralLookup( Keyfile& numberNameIndex, Keyfile& nameNumberIndex,
00252 int number, const char* name );
00254 InvFPDocList* internalDocInfoList(int termID);
00256 void _updateTermlist( InvFPDocList* curlist, int position );
00258 int _cacheSize();
00260 void _computeMemoryBounds( int memorySize );
00262 void _resetEstimatePoint();
/// Array of counters indexed by the UNIQUE_TERMS / TOTAL_TERMS / DOCS macros
/// (see the inline accessors above). Raw pointer; allocation and release are
/// not visible in this header.
00264 int* counts;
00266 std::vector<std::string> names;
/// NOTE(review): presumably the cached average document length -- confirm.
00268 float aveDocLen;
/// Unqualified 'vector' (here and below) sits next to std::-qualified uses;
/// it relies on a using-declaration or directive from an included header.
00270 vector<std::string> docmgrs;
00272 ostream* msgstream;
00273
00274
00275
00277
00278
00279
00281
00283
00285
00287
00289
00291
00293
00294
/// Fixed-size character buffers sized by MAX_TERM_LENGTH / MAX_DOCID_LENGTH.
00296 char termKey[MAX_TERM_LENGTH];
00298 char docKey[MAX_DOCID_LENGTH];
00300 int _listsSize;
00302 int _memorySize;
00304 std::string name;
/// Vector of raw InvFPDocList pointers; ownership is not stated in this header.
00306 vector<InvFPDocList*> invertlists;
00308 vector<LocatedTerm> termlist;
/// NOTE(review): presumably an index into docMgrs / docmgrs -- confirm.
00310 int curdocmgr;
00312 vector<DocumentManager*> docMgrs;
00314 TermCache _cache;
00315
/// Vector of raw File pointers; ownership is not stated in this header.
00317 std::vector<File*> _segments;
00319 int _largestFlushedTermID;
00321 int _estimatePoint;
00322 };
00323
00324
00325 #endif //_LEMUR_KEYFILE_INCINDEX_HPP