Main Page   Namespace List   Class Hierarchy   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

KeyfileIncIndex.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.cs.cmu.edu/~lemur/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 #ifndef _LEMUR_KEYFILE_INCINDEX_HPP
00014 #define _LEMUR_KEYFILE_INCINDEX_HPP
00015 
00016 /*
00017  * NAME DATE - COMMENTS
00018  * tnt 01/02 - created
00019  * dmf 07/03 - converted to incremental berkeley db btree indexer with
00020  * integrated document manager.
00021  * tds 09/03 - modified from BTIncIndex to use keyfile
00022  * dmf 12/03 - update to 2.1 API and remove parser/docmgr components.
00023 */
00024 #include "common_headers.hpp"
00025 #include "Index.hpp"
00026 #include "InvFPDocList.hpp"
00027 #include "InvFPTermList.hpp"
00028 #include "InvFPTypes.hpp"
00029 #include "Param.hpp"
00030 #include "PushIndex.hpp"
00031 #include "MemCache.hpp"
00032 #include "Keyfile.hpp"
00033 #include "KeyfileDocMgr.hpp"
00034 #include "ReadBuffer.hpp"
00035 #include "WriteBuffer.hpp"
00036 #include "TermCache.hpp"
00037 #include <cstring>
00038 #include <queue>
00039 
00040 // Slot indices into the KeyfileIncIndex::counts array.
      // counts[UNIQUE_TERMS], counts[TOTAL_TERMS] and counts[DOCS] are read by
      // the inline accessors termCountUnique(), termCount() and docCount().
      // NOTE(review): DT_FILES and INV_FILES are not referenced in this header;
      // presumably they are used by the implementation file - confirm.
00041 #define UNIQUE_TERMS 0
00042 #define TOTAL_TERMS  1
00043 #define DOCS         2
00044 #define DT_FILES     3
00045 #define INV_FILES    4
      // Sizes of the fixed char buffers KeyfileIncIndex::docKey
      // (MAX_DOCID_LENGTH) and KeyfileIncIndex::termKey (MAX_TERM_LENGTH).
00046 #define MAX_DOCID_LENGTH 256
00047 #define MAX_TERM_LENGTH 256
00048 
      // Number of SegmentOffset slots held in each KeyfileIncIndex::TermData.
00049 #define KEYFILE_MAX_SEGMENTS (16)
00050 
00051 // KeyfileIncIndex inherits from both PushIndex and Index (multiple inheritance).
00052 
/// Index implementation that derives from both PushIndex and Index.
/// Per the change log at the top of this file, it was modified from
/// BTIncIndex to use keyfile storage and serves as an incremental indexer.
/// NOTE(review): the comments below describe only what the declarations
/// show; anything inferred from a name is marked as an assumption.
00065 class KeyfileIncIndex : public PushIndex, public Index {
00066 public:
        /// Record type returned by fetchDocumentRecord().
        /// Field meanings are inferred from their names - TODO confirm
        /// against the implementation.
00068   class record {
00069   public:
          /// A file offset (presumably into dtlookup - confirm).
00071     File::offset_type offset;
          /// Presumably the document length - confirm.
00073     int len;
          /// Presumably the total document length (cf. totaldocLength()) - confirm.
00075     int totalLen;
          /// NOTE(review): meaning not evident from this header.
00077     int num;     
00078   };
        /// Describes a span by segment number, length and file offset.
        /// Presumably `segment` selects an entry of _segments - confirm.
00080   struct SegmentOffset {
00082     unsigned int segment;
00084     unsigned int length;
00086     File::offset_type offset;
00087   };
        /// Per-term data: two counters plus one SegmentOffset slot for each
        /// of the KEYFILE_MAX_SEGMENTS possible segments.
00089   struct TermData {
          /// Presumably total occurrences of the term - confirm.
00091     unsigned int totalCount;
          /// Presumably number of documents containing the term - confirm.
00093     unsigned int documentCount;
00095     SegmentOffset segments[ KEYFILE_MAX_SEGMENTS ];
00096   };
        /// Construct from an index name; indexName defaults to 0 (null).
        /// Not declared explicit, so it also acts as the default constructor
        /// and as an implicit conversion from const char*.
00098   KeyfileIncIndex(const char* indexName = 0); 
        /// Construct from a name prefix, a cache size (default 128000000;
        /// units presumably bytes - confirm) and the first document id
        /// (default 1).
00101   KeyfileIncIndex(char* prefix, int cachesize=128000000, 
00102                   DOCID_T startdocid=1);
        /// Destructor.
00104   ~KeyfileIncIndex();
00105 
        /// Set the name prefix (presumably used to derive file names - confirm).
00107   void setName(char* prefix);
00108 
        /// Document-building call; presumably part of the PushIndex
        /// interface (beginDoc / addTerm / endDoc / endCollection) - confirm.
00110   bool beginDoc(DocumentProps* dp);
00111 
        /// Add one term to the document being built (see beginDoc()).
00113   bool addTerm(Term& t);
00114 
        /// Finish the document being built (see beginDoc()).
00116   void endDoc(DocumentProps* dp);
00117 
        /// Overload of endDoc() that additionally takes a document manager
        /// identifier string.
00119   virtual void endDoc(DocumentProps* dp, const char* mgr);
00120 
        /// Finish the collection (see beginDoc()).
00122   void endCollection(CollectionProps* cp);
00123 
        /// Select a document manager by its identifier string (presumably
        /// updates curdocmgr - confirm).
00125   void setDocManager(const char* mgrID);
00126     
00127 protected:
        // Internal helpers. Their behaviour is defined in the implementation
        // file; the notes here are inferred from names only - TODO confirm.
        /// Presumably attempts to open an existing index; returns success.
00129   bool tryOpen();
        /// Presumably writes the table-of-contents file.
00131   void writeTOC();
        /// Presumably flushes cached data; lastRun defaults to false.
00133   void writeCache( bool lastRun = false );
        /// Presumably the final cache flush.
00135   void lastWriteCache();
00136 
        /// Presumably merges the files held in _segments.
00138   void mergeCacheSegments();
        /// Presumably writes cached data out as a new segment.
00140   void writeCacheSegment();
        /// Presumably persists the document manager identifiers.
00142   void writeDocMgrIDs();
        /// Map a document manager identifier string to an int.
00145   int docMgrID(const char* mgr);
        /// Virtual hook taking a document and an int manager id; presumably
        /// the shared implementation behind both endDoc() overloads - confirm.
00147   virtual void doendDoc(DocumentProps* dp, int mgrid);
        /// NOTE(review): purpose not evident from this header.
00149   int listlengths;
00150   
00151 public:
00153 
00154 
        /// Open the index identified by indexName; returns a bool
        /// (presumably true on success - confirm).
00156   bool open(const char* indexName);
00158 
00160 
00161 
        /// Look up the int id for a term given as a string.
00163   int term(const char* word);
00164 
        /// Look up the string for a term given as an int id.
00166   const char* term(int termID);
00167 
        /// Look up the int id for a document given as a string id.
00169   int document(const char* docIDStr);
00170 
        /// Look up the string id for a document given as an int id.
00172   const char* document(int docID); 
00173 
        /// Return the DocumentManager associated with docID.
00175   DocumentManager *docManager(int docID);
00176 
00178 
00180 
00181 
        /// Returns counts[DOCS].
00183   int docCount() { return counts[DOCS]; };
00184 
        /// Returns counts[UNIQUE_TERMS].
00186   int termCountUnique() { return counts[UNIQUE_TERMS]; };
00187 
        /// Count for a single term (presumably its total occurrences - confirm).
00189   int termCount(int termID) const;
00190 
        /// Returns counts[TOTAL_TERMS].
00192   int termCount() const { return counts[TOTAL_TERMS]; };
00193 
        /// Average document length (presumably cached in aveDocLen - confirm).
00195   float docLengthAvg();
00196 
        /// Document count for a single term (presumably the number of
        /// documents containing it - confirm).
00198   int docCount(int termID);
00199 
        /// Length of the given document.
00201   int docLength(DOCID_T docID) const; // should use DOCID_T everywhere...
00202 
        /// Total length of the given document. NOTE(review): how this differs
        /// from docLength() is not visible in this header.
00204   virtual int totaldocLength (int docID) const;
00205 
        /// NOTE(review): presumably the number of counted terms in the
        /// document - confirm.
00207   int docLengthCounted(int docID);
00208 
00210 
00212 
00213 
        /// Return a DocInfoList for termID. NOTE(review): ownership of the
        /// returned pointer is not stated here - confirm who deletes it.
00214   DocInfoList* docInfoList(int termID);
00215 
        /// Return a TermInfoList for docID. NOTE(review): ownership of the
        /// returned pointer is not stated here - confirm who deletes it.
00217   TermInfoList* termInfoList(int docID);
        /// Variant of termInfoList() (presumably preserving term sequence
        /// order - confirm).
00219   TermInfoList* termInfoListSeq(int docID);
00220 
00222 
        /// Set the output stream pointer (presumably stored in msgstream - confirm).
00224   void setMesgStream(ostream * lemStream);
        // The three add*Term methods below are public; presumably they are
        // the term-insertion paths used by addTerm() - confirm.
        /// Record a position for a term whose id is already known.
00226   void addKnownTerm( int termID, int position );
        /// Add a term with no known id; returns an int (presumably the
        /// assigned term id - confirm).
00228   int addUnknownTerm( InvFPTerm* term );
        /// Add a term not found in the cache; returns an int (presumably the
        /// term id - confirm).
00230   int addUncachedTerm( InvFPTerm* term );
00231 
00232 protected:
        // Storage helpers; notes are inferred from names - TODO confirm.
        /// Presumably opens the existing Keyfile/File handles declared below.
00234   void openDBs();
        /// Presumably opens the files held in _segments.
00236   void openSegments();
        /// Presumably creates the Keyfile/File handles for a new index.
00238   void createDBs();
00239 
        /// Presumably reads the table of contents.
00241   void fullToc();
        /// Presumably loads the document manager identifiers; returns a bool.
00243   bool docMgrIDs();
        /// Fetch the record for a document key. Declared const, which is why
        /// the handles below are mutable.
00245   record fetchDocumentRecord( int key ) const;
        /// Register an (int key, name) pair for a document.
00247   void addDocumentLookup( int documentKey, const char* documentName );
        /// Register an (int key, spelling) pair for a term.
00249   void addTermLookup( int termKey, const char* termSpelling );
        /// Register a (number, name) pair in two Keyfiles, one keyed each way
        /// as the parameter names indicate.
00251   void addGeneralLookup( Keyfile& numberNameIndex, Keyfile& nameNumberIndex, 
00252                          int number, const char* name );
        /// Internal counterpart of docInfoList() that returns the concrete
        /// InvFPDocList type.
00254   InvFPDocList* internalDocInfoList(int termID);
        /// NOTE(review): presumably appends `position` to `curlist` and
        /// updates the per-document term list - confirm.
00256   void _updateTermlist( InvFPDocList* curlist, int position );
        /// NOTE(review): presumably an estimate of current cache memory use.
00258   int _cacheSize();
        /// NOTE(review): presumably derives internal limits from memorySize.
00260   void _computeMemoryBounds( int memorySize );
        /// NOTE(review): presumably resets _estimatePoint.
00262   void _resetEstimatePoint();
        /// Counter array indexed by UNIQUE_TERMS, TOTAL_TERMS, DOCS, ...
        /// Raw pointer; allocation and release are not visible in this header.
00264   int* counts;    
        /// NOTE(review): presumably file names - confirm.
00266   std::vector<std::string> names;
        /// Presumably the cached average document length (see docLengthAvg()).
00268   float aveDocLen; 
        /// Strings for document managers (presumably their identifiers).
        /// Differs only by letter case from the docMgrs member further down.
        /// NOTE(review): unqualified `vector` (and `ostream` on the next
        /// member) must be brought into scope by an included header.
00270   vector<std::string> docmgrs;
        /// Output stream pointer (see setMesgStream()).
00272   ostream* msgstream;
00273 
00274   // All database handles are marked mutable since they sometimes
00275   // must be used to fetch values during const methods
        /// NOTE(review): presumably the inverted-list lookup table - confirm.
00277   mutable Keyfile invlookup;
00278   
00279   // int <-> string mappings for documents and terms
        // NOTE(review): which of each pair maps id->string and which maps
        // string->id is not visible here - confirm.
00281   mutable Keyfile dIDs;
00283   mutable Keyfile dSTRs;
00285   mutable Keyfile tIDs;
00287   mutable Keyfile tSTRs;
        /// NOTE(review): presumably the document term-list file - confirm.
00289   mutable File dtlookup; 
        /// Raw pointer to a ReadBuffer (presumably wrapping dtlookup - confirm).
        /// Allocation and release are not visible in this header.
00291   ReadBuffer* dtlookupReadBuffer; 
        /// NOTE(review): presumably the file term lists are written to - confirm.
00293   File writetlist; 
00294 
        /// Fixed buffer of MAX_TERM_LENGTH chars.
00296   char termKey[MAX_TERM_LENGTH];
        /// Fixed buffer of MAX_DOCID_LENGTH chars.
00298   char docKey[MAX_DOCID_LENGTH];
        /// NOTE(review): presumably the current size of cached lists - confirm.
00300   int _listsSize;
        /// NOTE(review): presumably the memory budget - confirm.
00302   int _memorySize;
        /// Presumably the index name/prefix (see setName()) - confirm.
00304   std::string name;
        /// Pointers to InvFPDocList objects; ownership not visible here.
00306   vector<InvFPDocList*> invertlists; 
        /// LocatedTerm entries (presumably for the document being built - confirm).
00308   vector<LocatedTerm> termlist; 
        /// Presumably the id of the currently selected document manager
        /// (see setDocManager()) - confirm.
00310   int curdocmgr; 
        /// DocumentManager pointers; differs only by letter case from the
        /// string vector docmgrs above. Ownership not visible here.
00312   vector<DocumentManager*> docMgrs; 
        /// TermCache instance (see TermCache.hpp).
00314   TermCache _cache;
00315 
        /// File pointers, one per segment; ownership not visible here.
00317   std::vector<File*> _segments;
        /// NOTE(review): presumably the highest term id already flushed - confirm.
00319   int _largestFlushedTermID;
        /// NOTE(review): see _resetEstimatePoint(); meaning not visible here.
00321   int _estimatePoint; 
00322 };
00323 
00324 
00325 #endif //_LEMUR_KEYFILE_INCINDEX_HPP

Generated on Fri Feb 6 07:11:47 2004 for LEMUR by Doxygen 1.2.16