IndriIndex.hpp Source File

00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 
00014 #ifndef INDRI_INDEX_HPP
00015 #define INDRI_INDEX_HPP
00016 
00017 /*
00018  * NAME DATE - COMMENTS
00019  * tnt 01/02 - created
00020  * dmf 07/03 - converted to incremental berkeley db btree indexer with
00021  * integrated document manager.
00022  * tds 09/03 - modified from BTIncIndex to use keyfile
00023 */
00024 
00025 #include "TermInfoList.hpp"
00026 #include "DocInfoList.hpp"
00027 #include "lemur-platform.h"
00028 #include "lemur-compat.hpp"
00029 #include "indri/DocPositionInfoList.hpp"
00030 #include "Exception.hpp"
00031 #include "Keyfile.hpp"
00032 #include "ReadBuffer.hpp"
00033 #include "WriteBuffer.hpp"
00034 
00035 #include <queue>
00036 #include <map>
00037 
00038 #include "indri/DocListMemoryBuilder.hpp"
00039 #include "indri/DocListDiskBuilder.hpp"
00040 #include "indri/HashTable.hpp"
00041 #include <indri/greedy_vector>
00042 
00043 #include "indri/TermData.hpp"
00044 #include "indri/TermFieldStatistics.hpp"
00045 #include "indri/KeyfileWordMap.hpp"
00046 #include "indri/DocumentData.hpp"
00047 
00048 #include "indri/CorpusStatistics.hpp"
00049 #include "indri/FieldStatistics.hpp"
00050 #include "indri/FieldData.hpp"
00051 #include "indri/TermListBuilder.hpp"
00052 #include "indri/FieldListIterator.hpp"
00053 #include "indri/DocListFrequencyIterator.hpp"
00054 
00055 #include "indri/TagExtent.hpp"
00056 
00057 
00058 struct ParsedDocument;  // forward declaration; IndriIndex::addDocument takes a ParsedDocument*
00059 
      // Default value of the queryProportion argument of the IndriIndex constructor.
00060 #define INDRI_DEFAULT_QUERY_PROPORTION     (0.5)
      // Default value of the memorySize argument of the IndriIndex constructor.
      // 128*1024*1024 is presumably a byte count (128 MB) -- TODO confirm the unit.
00061 #define INDRI_DEFAULT_MEMORY_SIZE          (128*1024*1024)
      // Not referenced anywhere in this header.  Presumably an upper bound used
      // to reject implausible document lengths -- confirm against the implementation.
00062 #define INDRI_OUTRAGEOUS_DOCLENGTH         (1000*1000*1000)
00063 
00064 struct less_string {
        // Strict "less than" for NUL-terminated strings: orders by the characters
        // (via strcmp) rather than by pointer value, so it can serve as the
        // comparator of containers keyed by const char*.
00065   bool operator()( const char* lhs, const char* rhs ) const {
00066     return 0 > strcmp( lhs, rhs );
00067   }
00068 };
00069 
00072 class IndriIndex {
        // Index over a document collection.  This header declares document
        // insertion (addDocument), open/create/close, ID <-> name lookups for
        // terms, documents and fields, collection statistics, and accessors for
        // per-term and per-document lists.  Only declarations are visible here;
        // anything not evident from a signature is marked as an assumption.
        //
        // NOTE(review): the class declares a destructor and holds raw pointers
        // (e.g. _documentStatisticsFile, _termDataTable, _cache) but declares no
        // copy constructor or copy assignment -- confirm instances are never copied.
00073 private:
        // Value type stored in _cache: a term ID, a pointer to its TermData and a
        // zero-length trailing char array.
        // NOTE(review): `char term[0]` is a compiler extension, not standard C++.
        // Presumably the term text is allocated inline after the struct --
        // confirm against _storeTermCache.
00074   struct term_cache_entry {
00075     TERMID_T termID;
00076     indri::index::TermData* termData;
00077     char term[0];
00078   };
00079 
00080 public:
        // Describes one field; a vector of these is passed to create() and
        // _createFields().
00082   struct FieldDescription {
          // field name
00084     std::string name;
          // numeric flag; how numeric fields are treated is decided in the
          // implementation (TODO confirm)
00086     bool numeric;
00087   };
        // Construct an index object.
        //   memorySize      -- defaults to INDRI_DEFAULT_MEMORY_SIZE; presumably the
        //                      value kept in _memorySize (upper bound for memory use)
        //   queryProportion -- defaults to INDRI_DEFAULT_QUERY_PROPORTION; presumably
        //                      the value kept in _queryProportion
        // NOTE(review): callable with one argument and not marked explicit, so a
        // size_t converts implicitly to IndriIndex.
00091   IndriIndex( size_t memorySize = INDRI_DEFAULT_MEMORY_SIZE, float queryProportion = INDRI_DEFAULT_QUERY_PROPORTION);  
00092   ~IndriIndex();
00093 
        // Set the name prefix of the index (presumably stored in _baseName and
        // used by _buildFileName -- confirm).
00095   void setName(const std::string& prefix);
        // Add a parsed document; returns a DOCID_T (presumably the ID assigned to
        // the new document -- confirm).
00099   DOCID_T addDocument( struct ParsedDocument* document );
00100 
00101   // still support this call in the transition period
        // Older overload taking the document name, its words and its tag extents
        // as separate arguments.
00107   DOCID_T addDocument( const char* documentName, const greedy_vector<char*>& words, const greedy_vector<TagExtent>& tagExtents );
00108 
00110 
00111 
        // ---- Opening, creating and closing ------------
        // All of these return bool; presumably true on success -- confirm.

        // Open the index named indexName.
00113   bool open(const std::string& indexName);
        // Same as above, taking the name as a C string.
00115   bool open( const char* indexName );
        // Presumably opens the index for reading only (cf. _readOnly and
        // _openReadOnlyDBs) -- confirm.
00117   bool openRead(const std::string& indexName);
00118 
00119 
        // Create an index named indexName.
00121   bool create(const std::string& indexName );
00122 
        // Create an index named indexName with the given field descriptions.
00124   bool create(const std::string& indexName, const std::vector<FieldDescription>& fields );
        // Close the index.
00126   void close();
00127 
00129 
00131 
00132 
        // ---- ID <-> name lookups ----------------------
        // Results for unknown names or IDs are not visible in this header.

        // Term ID for the given word.
00134   TERMID_T term(const TERM_T &word) const;
00135 
        // Term (TERM_T) for the given term ID.
00137   const TERM_T term(TERMID_T termID) const;
00138 
        // Internal document ID for the given external document ID string.
00140   DOCID_T document(const EXDOCID_T &docIDStr) const;
00141 
        // External document ID for the given internal document ID.
00143   const EXDOCID_T document(DOCID_T docID) const; 
00144 
        // Field name for the given field ID.
        // NOTE(review): the three field() overloads are not const, unlike the
        // term() and document() lookups above.
00146   const char* field(int fieldID);
00147 
        // Field ID for the given field name (C string).
00149   int field( const char* fieldName );
00150 
        // Field ID for the given field name (std::string).
00152   int field( const std::string& fieldName );
00153 
00155 
00157 
00158 
        // ---- Collection statistics --------------------

        // Total number of documents: returns _corpusStatistics.totalDocuments.
00160   COUNT_T docCount() const { return _corpusStatistics.totalDocuments; };
00161 
        // Number of unique terms: returns _corpusStatistics.uniqueTerms.
00163   COUNT_T termCountUnique() const { return _corpusStatistics.uniqueTerms; };
00164 
        // Count for a single term (presumably its total number of occurrences in
        // the collection -- confirm).
00166   INT64 termCount(TERMID_T termID) const;
00167 
        // Total number of terms: returns _corpusStatistics.totalTerms.
00169   INT64 termCount() const { return _corpusStatistics.totalTerms; };
00170 
        // Term count restricted to one field and one term (exact definition is in
        // the implementation).
00172   INT64 fieldTermCount(int fieldID, TERMID_T termID) const;
00173 
        // Term count restricted to one field.
00175   INT64 fieldTermCount(int fieldID) const;
00176 
        // Document count restricted to one field.
00178   INT64 fieldDocCount(int fieldID) const;
00179 
        // Document count restricted to one field and one term.
00181   INT64 fieldDocCount(int fieldID, TERMID_T termID) const;
00182 
        // Average document length.
00184   double docLengthAvg() const;
00185 
        // Document count for a single term (presumably the number of documents
        // containing termID -- confirm).
00187   COUNT_T docCount(TERMID_T termID) const;
        // Indexed length of a document.  _writeDocumentStatistics takes separate
        // indexedLength and totalLength values; this presumably returns the
        // former -- confirm.
00189   COUNT_T docIndexedLength( DOCID_T documentID ) const;
        // Length of a document (presumably the total length -- confirm).
00191   COUNT_T docLength( DOCID_T documentID ) const;
00192 
        // Per-term and collection-wide bounds; definitions are in the
        // implementation.
        // NOTE(review): these four are not const, unlike the statistics above.
00194   int termMaxDocumentFrequency( TERMID_T termID );
00195 
00197   int termMinDocumentLength( TERMID_T termID );
00198 
00200   double termMaxDocumentFraction( TERMID_T termID );
00201 
00203   int maxDocumentLength();
00204 
00206 
        // ---- List accessors ---------------------------
        // Each returns a raw pointer; ownership is not stated in this header
        // (presumably the caller deletes the result -- confirm).

        // Per-term lists, selected by termID.
00208   DocInfoList* docInfoList(TERMID_T termID) const;
00210   DocPositionInfoList* docPositionInfoList(TERMID_T termID);
00212   indri::index::DocListFrequencyIterator* docFrequencyInfoList(TERMID_T termID);
00213 
        // Per-document term lists, selected by docID.
00215   TermInfoList* termInfoList(DOCID_T docID) const;
00217   TermInfoList* termInfoListSeq(DOCID_T docID) const;
00218 
00220   indri::index::TermListBuilder* termPositionList(DOCID_T docID);
00221 
        // Iterator over one field, selected by fieldID.
00223   indri::index::FieldListIterator* fieldPositionListIterator( int fieldID );
00224 
00225 protected:
        // presumably set when the index is opened through openRead() -- confirm
00226   bool _readOnly;
00227 
        // Cache write-out and segment merge helpers, in batch and incremental
        // variants (grouped by name; implementations are not visible here).
00228   void _writeCache();
00229   void _writeAndMerge();
00230 
00231   void _writeBatchSegment();
00232   void _mergeBatch();
00233   void _mergeBatchSegments( int start, int end, int newNumber, bool finalMerge );
00234   void _mergeBatchTermLists( const std::vector<int>& segmentMapping );
00235   
00236   void _writeIncrementalSegment();
00237   void _mergeIncrementalSegments();
00238 
00239   void _readTermMapping( greedy_vector<int>& mapping, int segment, int secondSegment );
00240 
        // Takes the segment range and finalMerge by value; every vector is passed
        // by non-const reference (presumably filled in by the call -- confirm).
00241   void _openMergeFiles( int startSegment,
00242                         int endSegment,
00243                         std::vector<File*>& listFiles,
00244                         std::vector<File*>& statsFiles,
00245                         std::vector<File*>& mappingFiles,
00246                         std::vector<WriteBuffer*>& mappingBuffers,
00247                         std::vector<ReadBuffer*>& statsBuffers,
00248                         std::vector<indri::index::DocListFileIterator*>& listIterators,
00249                         std::vector<char*>& terms,
00250                         std::vector<indri::index::TermData*>& termDatas,
00251                         bool finalMerge );
00252 
        // Database, segment and field set-up / tear-down helpers.
00253   void _openDBs();
00254   void _openReadOnlyDBs();
00255   void _openSegments();
00256   void _createDBs();
00257   void _createFields( const std::vector<FieldDescription>& fieldNames );
00258   void _closeFields();
00259 
        // Document record lookups by integer key (presumably the document ID).
00260   indri::index::DocumentData fetchDocumentData( int key ) const;
00261   int fetchDocumentLength( int key ) const;
00262 
00263   void _updateTermlist( TERMID_T termID, int position );
00264   int _updateTermData( int documentLength );
00265   size_t _cacheSize();
        // Takes the same two values as the constructor; presumably derives the
        // size bounds listed under "Memory/cache management parameters" below.
00266   void _computeMemoryBounds( size_t memorySize, float queryProportion );
00267   void _resetEstimatePoint();
00268 
00269   // special handling for term data, since it varies in size
00270   // based on the number of indexed fields
00271   indri::index::TermData* _createTermData();
00272   indri::index::TermData* _fetchTermData(TERMID_T termID);
00273   indri::index::TermData* _lookupTermData(TERMID_T termID);
00274   void _cleanCache();
00275   void _deleteTermData( indri::index::TermData* termData );
00276   size_t _sizeTermData();
00277   void _clearTermData();
00278 
00279   // special handling for cache data
00280   void _clearTermCache();
00281   void _storeTermCache( const char* term, TERMID_T termID, indri::index::TermData*& termData );
00282 
00283   void _flushTermStatistics( TERMID_T termID, const indri::index::TermFieldStatistics& statistics );
00284   void _addTermDataToBuilder( indri::index::DocListDiskBuilder& builder, indri::index::DocListFileIterator& iterator, int writingID, int readingID );
00285 
00286   // addDocument helpers
00287   void _addOpenTags( greedy_vector<indri::index::FieldExtent>& indexedTags,
00288                      greedy_vector<indri::index::FieldExtent>& openTags,
00289                      const greedy_vector<TagExtent>& extents,
00290                      unsigned int& extentIndex,
00291                      unsigned int position );
00292   void _removeClosedTags( greedy_vector<indri::index::FieldExtent>& tags, unsigned int position );
        // termID and termData are reference parameters (presumably outputs for
        // the looked-up term -- confirm).
00293   void _lookupTerm( const char* term, TERMID_T& termID, indri::index::TermData*& termData );
00294   void _finishDocument( greedy_vector<indri::index::TermFieldStatistics*>& seenStatistics );
        // offset and byteLength are reference parameters here and are taken by
        // value by _writeDocumentStatistics below.
00295   void _writeDocumentTermList( File::offset_type& offset, int& byteLength, DOCID_T documentID, int documentLength, indri::index::TermListBuilder& locatedTerms );
00296   void _writeDocumentStatistics( File::offset_type offset, int byteLength, int indexedLength, int totalLength, int uniqueTerms );
00297   void _handleCache();
00298   int _lookupTag( const char* tag );
00299 
        // TermData serialization helpers (read, incremental/batch write,
        // compress/decompress into a caller-supplied buffer).
00300   bool _readTermData( TERMID_T& termID, char* termBuffer, indri::index::TermData* termData, ReadBuffer* termDataFile );
00301   void _incrementalWriteTermData( TERMID_T termID, indri::index::TermData* termData );
00302   void _batchWriteTermData( TERMID_T termID, indri::index::TermData* termData, WriteBuffer* file );
00303   int _compressTermData( char* buffer, int size, indri::index::TermData* termData );
00304   void _decompressTermData( const char* buffer, int size, indri::index::TermData* termData );
00305 
        // Parameter file write/read by file name.
00306   void _writeParameters( const std::string& fileName );
00307   bool _readParameters( const std::string& fileName );
00308 
00309   
00310   void _openDocumentFiles();
        // Build a file name from a suffix (and optionally an index number);
        // presumably combined with _baseName -- confirm.
00311   std::string _buildFileName( const char* suffix );
00312   std::string _buildFileName( const char* suffix, int index );
00313 
00314   // count statistics
00315   indri::index::CorpusStatistics _corpusStatistics;
00316   std::vector<indri::index::FieldData*> _fieldData;
        // keyed by const char* and ordered with less_string (string content, not
        // pointer value)
00317   std::map<const char*, int, less_string> _fieldLookup;
00318 
00319   // General index parameters and state
00320   std::string _baseName;    
00321   bool _writingDocTermLists;
00322 
00323   // ---- Disk-based structures --------------------
00324 
00325   // All database handles are marked mutable since they sometimes
00326   // must be used to fetch values during const methods
00327   mutable Keyfile _termDataStore;         // termID -> indri::index::TermData (term statistics and inverted list segment offsets)
00328   mutable KeyfileWordMap _documentMap;    // documentID <-> documentString
00329   mutable KeyfileWordMap _termMap;        // termID <-> termString
00330   mutable File* _documentStatisticsFile;  // document statistics (document length, etc.)
00331   mutable File _documentLengthFile;       // document length only
00332 
00333   std::vector<File*> _segments;           // inverted list segment files
00334   int _batchSegmentCount;                 // count of segments
00335   File* _documentTermLocationsFile;       // filestream for writing the list of located terms
00336 
00337   // ---- Current document structures --------------
00338   //    (since IndriIndex is now doc-at-a-time, the main reason for keeping
00339   //    these here is to reduce allocation overhead)
00340 
00341   indri::index::TermListBuilder _termList;
00342   Buffer _termListBuffer;
00343 
00344   greedy_vector<indri::index::TermData*> _seenTerms;
00345 
00346   // ---- Cache structures -------------------------
        // _termDataTable is keyed by int (presumably the term ID); _cache is keyed
        // by const char* (presumably the term string) -- confirm.
00347   HashTable<int, indri::index::TermData*>* _termDataTable; 
00348   HashTable<const char*, term_cache_entry*>* _cache;
00349 
00350   ReadBuffer* _documentStatisticsBuffer; // read buffer for document stats
00351   ReadBuffer* _documentLengthBuffer;     // read buffer for document length
00352 
00353   // ---- Memory/cache management parameters -------
00354   size_t _listsSize; // memory for use by inverted list buffers
00355   size_t _memorySize; // upper bound for memory use
00356   size_t _termDataSize; // memory bound for termdata
00357   size_t _termCacheSize; // memory bound for termcache
00358   size_t _statisticsBufferSize; // memory bound for _documentStatisticsBuffer
00359   size_t _lengthBufferSize; // memory bound for _documentLengthBuffer
00360   float _queryProportion; // proportion of load assumed to be queries
00361   bool _batchBuild;
00362 
        // bookkeeping values; their meaning is defined in the implementation
        // (cf. _resetEstimatePoint and _handleCache -- TODO confirm)
00363   INT64 _estimatePoint;  
00364   INT64 _lastCacheFlush; 
00365 };
00366 
00367 
00368 #endif // INDRI_INDEX_HPP
00369