00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #ifndef INDRI_INDEX_HPP
00015 #define INDRI_INDEX_HPP
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "TermInfoList.hpp"
00026 #include "DocInfoList.hpp"
00027 #include "lemur-platform.h"
00028 #include "lemur-compat.hpp"
00029 #include "indri/DocPositionInfoList.hpp"
00030 #include "Exception.hpp"
00031 #include "Keyfile.hpp"
00032 #include "ReadBuffer.hpp"
00033 #include "WriteBuffer.hpp"
00034
00035 #include <queue>
00036 #include <map>
00037
00038 #include "indri/DocListMemoryBuilder.hpp"
00039 #include "indri/DocListDiskBuilder.hpp"
00040 #include "indri/HashTable.hpp"
00041 #include <indri/greedy_vector>
00042
00043 #include "indri/TermData.hpp"
00044 #include "indri/TermFieldStatistics.hpp"
00045 #include "indri/KeyfileWordMap.hpp"
00046 #include "indri/DocumentData.hpp"
00047
00048 #include "indri/CorpusStatistics.hpp"
00049 #include "indri/FieldStatistics.hpp"
00050 #include "indri/FieldData.hpp"
00051 #include "indri/TermListBuilder.hpp"
00052 #include "indri/FieldListIterator.hpp"
00053 #include "indri/DocListFrequencyIterator.hpp"
00054
00055 #include "indri/TagExtent.hpp"
00056
00057
// Forward declaration: within this header ParsedDocument is only used through a
// pointer (the parameter of IndriIndex::addDocument), so the full definition is
// not required here.
00058 struct ParsedDocument;
00059
// Default value of the IndriIndex constructor's `queryProportion` parameter.
00060 #define INDRI_DEFAULT_QUERY_PROPORTION (0.5)
// Default value of the IndriIndex constructor's `memorySize` parameter
// (128*1024*1024; presumably a byte count, i.e. 128 MiB -- TODO confirm the unit).
00061 #define INDRI_DEFAULT_MEMORY_SIZE (128*1024*1024)
// 1000*1000*1000. Not referenced anywhere else in this header; presumably a
// sentinel / upper bound for document lengths -- confirm against the implementation.
00062 #define INDRI_OUTRAGEOUS_DOCLENGTH (1000*1000*1000)
00063
/// Ordering functor for C strings: compares the characters the pointers refer to
/// (via strcmp) instead of the pointer values themselves. Used as the key
/// comparator of IndriIndex::_fieldLookup.
///
/// Both arguments are handed straight to strcmp, so they must be valid,
/// NUL-terminated strings.
struct less_string {
  /// @return true when `one` sorts strictly before `two` in strcmp order.
  bool operator () ( const char* one, const char* two ) const {
    const int ordering = strcmp( one, two );
    return ordering < 0;
  }
};
00069
/// Declaration of the IndriIndex class.
///
/// This header only declares the interface. Apart from three inline accessors
/// that read fields of _corpusStatistics, every member function is defined in an
/// implementation file that is not visible here, so the notes below describe the
/// signatures and flag assumptions that still need to be confirmed.
00072 class IndriIndex {
00073 private:
// Record type pointed to by the values of the _cache hash table (which is keyed
// by a C string). Holds a term ID and a pointer to the term's TermData.
// NOTE(review): `char term[0]` is a zero-length array, which is a compiler
// extension and not standard C++. It looks like the term text is meant to be
// stored inline directly after the struct -- confirm against the allocation code.
00074 struct term_cache_entry {
00075 TERMID_T termID;
00076 indri::index::TermData* termData;
00077 char term[0];
00078 };
00079
00080 public:
// Describes one field; a vector of these is accepted by the two-argument
// create() overload below.
//   name    - the field's name.
//   numeric - boolean flag; presumably marks fields holding numeric values
//             (TODO confirm).
00082 struct FieldDescription {
00084 std::string name;
00086 bool numeric;
00087 };
// Constructor. `memorySize` defaults to INDRI_DEFAULT_MEMORY_SIZE and
// `queryProportion` to INDRI_DEFAULT_QUERY_PROPORTION.
// NOTE(review): because both parameters have defaults and the constructor is not
// `explicit`, a size_t converts implicitly to an IndriIndex.
00091 IndriIndex( size_t memorySize = INDRI_DEFAULT_MEMORY_SIZE, float queryProportion = INDRI_DEFAULT_QUERY_PROPORTION);
00092 ~IndriIndex();
00093
// Takes a name prefix; presumably the value kept in _baseName and used by
// _buildFileName() -- TODO confirm.
00095 void setName(const std::string& prefix);
// Takes a pointer to a ParsedDocument and returns a DOCID_T; presumably the ID
// assigned to the newly indexed document -- TODO confirm.
00099 DOCID_T addDocument( struct ParsedDocument* document );
00100
00101
00107
00108
00110
00111
// --- Opening, creating and closing -------------------------------------------
// All open/create variants take an index name and return bool (presumably a
// success flag -- TODO confirm). open() is overloaded for std::string and C
// strings; openRead() presumably opens without write access (see _readOnly).
00113 bool open(const std::string& indexName);
00115 bool open( const char* indexName );
00117 bool openRead(const std::string& indexName);
00118
00119
// create() with only a name, and create() with an explicit list of fields.
00121 bool create(const std::string& indexName );
00122
00124 bool create(const std::string& indexName, const std::vector<FieldDescription>& fields );
00126 void close();
00127
00129
00131
00132
// --- Identifier lookups --------------------------------------------------------
// term(): overloaded to convert in both directions between a term's text (TERM_T)
// and its TERMID_T.
// NOTE(review): the reverse lookups return `const` objects by value.
00134 TERMID_T term(const TERM_T &word) const;
00135
00137 const TERM_T term(TERMID_T termID) const;
00138
// document(): the same two-way conversion between an external document ID
// (EXDOCID_T) and the internal DOCID_T.
00140 DOCID_T document(const EXDOCID_T &docIDStr) const;
00141
00143 const EXDOCID_T document(DOCID_T docID) const;
00144
// field(): converts an int field ID to a C string, and a field name (C string or
// std::string) to an int. Unlike term()/document(), these are not const.
00146 const char* field(int fieldID);
00147
00149 int field( const char* fieldName );
00150
00152 int field( const std::string& fieldName );
00153
00155
00157
00158
// --- Collection statistics -----------------------------------------------------
// Returns _corpusStatistics.totalDocuments.
00160 COUNT_T docCount() const { return _corpusStatistics.totalDocuments; };
00161
// Returns _corpusStatistics.uniqueTerms.
00163 COUNT_T termCountUnique() const { return _corpusStatistics.uniqueTerms; };
00164
// Per-term count for `termID` (declaration only).
00166 INT64 termCount(TERMID_T termID) const;
00167
// Returns _corpusStatistics.totalTerms.
00169 INT64 termCount() const { return _corpusStatistics.totalTerms; };
00170
// Field-restricted counts, with and without a term restriction (declarations
// only; exact semantics are defined in the implementation).
00172 INT64 fieldTermCount(int fieldID, TERMID_T termID) const;
00173
00175 INT64 fieldTermCount(int fieldID) const;
00176
00178 INT64 fieldDocCount(int fieldID) const;
00179
00181 INT64 fieldDocCount(int fieldID, TERMID_T termID) const;
00182
00184 double docLengthAvg() const;
00185
// Per-term document count, and two per-document length accessors. How
// docIndexedLength() and docLength() differ is not visible here -- TODO confirm.
00187 COUNT_T docCount(TERMID_T termID) const;
00189 COUNT_T docIndexedLength( DOCID_T documentID ) const;
00191 COUNT_T docLength( DOCID_T documentID ) const;
00192
// Per-term extrema and the maximum document length (declarations only).
// NOTE(review): unlike the statistics accessors above, these are not const.
00194 int termMaxDocumentFrequency( TERMID_T termID );
00195
00197 int termMinDocumentLength( TERMID_T termID );
00198
00200 double termMaxDocumentFraction( TERMID_T termID );
00201
00203 int maxDocumentLength();
00204
00206
// --- List / iterator accessors ---------------------------------------------------
// Each returns a raw pointer. Ownership of the returned object is not expressed
// in the types -- presumably the caller must delete it; TODO confirm.
00208 DocInfoList* docInfoList(TERMID_T termID) const;
00210 DocPositionInfoList* docPositionInfoList(TERMID_T termID);
00212 indri::index::DocListFrequencyIterator* docFrequencyInfoList(TERMID_T termID);
00213
00215 TermInfoList* termInfoList(DOCID_T docID) const;
00217 TermInfoList* termInfoListSeq(DOCID_T docID) const;
00218
00220 indri::index::TermListBuilder* termPositionList(DOCID_T docID);
00221
00223 indri::index::FieldListIterator* fieldPositionListIterator( int fieldID );
00224
00225 protected:
// Presumably set when the index is opened through openRead() -- TODO confirm.
00226 bool _readOnly;
00227
// --- Internal helpers (declarations only) ----------------------------------------
// The groupings below are inferred from the names alone; nothing in this header
// shows what the helpers do -- verify against the implementation before relying
// on any of it.
//
// Cache write-out and segment merging, with separate "batch" and "incremental"
// variants (compare the _batchBuild flag below).
00228 void _writeCache();
00229 void _writeAndMerge();
00230
00231 void _writeBatchSegment();
00232 void _mergeBatch();
00233 void _mergeBatchSegments( int start, int end, int newNumber, bool finalMerge );
00234 void _mergeBatchTermLists( const std::vector<int>& segmentMapping );
00235
00236 void _writeIncrementalSegment();
00237 void _mergeIncrementalSegments();
00238
00239 void _readTermMapping( greedy_vector<int>& mapping, int segment, int secondSegment );
00240
// Takes a segment range plus eight vectors by non-const reference, so the vectors
// are presumably outputs filled in by the call -- TODO confirm, including who
// releases the pointers stored in them.
00241 void _openMergeFiles( int startSegment,
00242 int endSegment,
00243 std::vector<File*>& listFiles,
00244 std::vector<File*>& statsFiles,
00245 std::vector<File*>& mappingFiles,
00246 std::vector<WriteBuffer*>& mappingBuffers,
00247 std::vector<ReadBuffer*>& statsBuffers,
00248 std::vector<indri::index::DocListFileIterator*>& listIterators,
00249 std::vector<char*>& terms,
00250 std::vector<indri::index::TermData*>& termDatas,
00251 bool finalMerge );
00252
// Opening / creating the backing stores and field structures.
00253 void _openDBs();
00254 void _openReadOnlyDBs();
00255 void _openSegments();
00256 void _createDBs();
00257 void _createFields( const std::vector<FieldDescription>& fieldNames );
00258 void _closeFields();
00259
// Per-document record lookups by integer key.
// NOTE(review): these two lack the leading underscore used by every other
// protected helper in this class.
00260 indri::index::DocumentData fetchDocumentData( int key ) const;
00261 int fetchDocumentLength( int key ) const;
00262
// Term-list updates and memory accounting.
00263 void _updateTermlist( TERMID_T termID, int position );
00264 int _updateTermData( int documentLength );
00265 size_t _cacheSize();
00266 void _computeMemoryBounds( size_t memorySize, float queryProportion );
00267 void _resetEstimatePoint();
00268
00269
00270
// TermData allocation, lookup and release.
00271 indri::index::TermData* _createTermData();
00272 indri::index::TermData* _fetchTermData(TERMID_T termID);
00273 indri::index::TermData* _lookupTermData(TERMID_T termID);
00274 void _cleanCache();
00275 void _deleteTermData( indri::index::TermData* termData );
00276 size_t _sizeTermData();
00277 void _clearTermData();
00278
00279
// Term cache maintenance. _storeTermCache takes `termData` as a reference to a
// pointer, so the call may replace the caller's pointer.
00280 void _clearTermCache();
00281 void _storeTermCache( const char* term, TERMID_T termID, indri::index::TermData*& termData );
00282
00283 void _flushTermStatistics( TERMID_T termID, const indri::index::TermFieldStatistics& statistics );
00284 void _addTermDataToBuilder( indri::index::DocListDiskBuilder& builder, indri::index::DocListFileIterator& iterator, int writingID, int readingID );
00285
00286
// Helpers that look related to adding a document: tag/field extent tracking, term
// lookup, and writing per-document records. `extentIndex`, `termID`, `termData`,
// `offset` and `byteLength` are non-const references, i.e. in/out or output
// parameters.
00287 void _addOpenTags( greedy_vector<indri::index::FieldExtent>& indexedTags,
00288 greedy_vector<indri::index::FieldExtent>& openTags,
00289 const greedy_vector<TagExtent>& extents,
00290 unsigned int& extentIndex,
00291 unsigned int position );
00292 void _removeClosedTags( greedy_vector<indri::index::FieldExtent>& tags, unsigned int position );
00293 void _lookupTerm( const char* term, TERMID_T& termID, indri::index::TermData*& termData );
00294 void _finishDocument( greedy_vector<indri::index::TermFieldStatistics*>& seenStatistics );
00295 void _writeDocumentTermList( File::offset_type& offset, int& byteLength, DOCID_T documentID, int documentLength, indri::index::TermListBuilder& locatedTerms );
00296 void _writeDocumentStatistics( File::offset_type offset, int byteLength, int indexedLength, int totalLength, int uniqueTerms );
00297 void _handleCache();
00298 int _lookupTag( const char* tag );
00299
// TermData (de)serialisation. _readTermData returns bool and writes through
// `termID`, `termBuffer` and `termData`; the required capacity of `termBuffer` is
// not expressed in the signature -- TODO confirm.
00300 bool _readTermData( TERMID_T& termID, char* termBuffer, indri::index::TermData* termData, ReadBuffer* termDataFile );
00301 void _incrementalWriteTermData( TERMID_T termID, indri::index::TermData* termData );
00302 void _batchWriteTermData( TERMID_T termID, indri::index::TermData* termData, WriteBuffer* file );
00303 int _compressTermData( char* buffer, int size, indri::index::TermData* termData );
00304 void _decompressTermData( const char* buffer, int size, indri::index::TermData* termData );
00305
// Parameter file I/O; _readParameters reports a bool result.
00306 void _writeParameters( const std::string& fileName );
00307 bool _readParameters( const std::string& fileName );
00308
00309
// File-name construction from a suffix, optionally with an integer index.
00310 void _openDocumentFiles();
00311 std::string _buildFileName( const char* suffix );
00312 std::string _buildFileName( const char* suffix, int index );
00313
00314
// --- Data members ----------------------------------------------------------------
// Corpus-wide counters; read by the inline docCount()/termCountUnique()/
// termCount() accessors above.
00315 indri::index::CorpusStatistics _corpusStatistics;
// Per-field data, stored as raw pointers.
00316 std::vector<indri::index::FieldData*> _fieldData;
// Maps a C-string key to an int, ordered by string content through less_string.
// The keys are raw pointers, so the pointed-to strings must outlive the map.
00317 std::map<const char*, int, less_string> _fieldLookup;
00318
00319
00320 std::string _baseName;
00321 bool _writingDocTermLists;
00322
00323
00324
00325
00326
// Backing stores and files. They are declared `mutable` so that const member
// functions can operate on them.
00327 mutable Keyfile _termDataStore;
00328 mutable KeyfileWordMap _documentMap;
00329 mutable KeyfileWordMap _termMap;
00330 mutable File* _documentStatisticsFile;
00331 mutable File _documentLengthFile;
00332
// Segment files (raw pointers) and a count of batch segments.
00333 std::vector<File*> _segments;
00334 int _batchSegmentCount;
00335 File* _documentTermLocationsFile;
00336
00337
00338
00339
00340
// Working state; presumably reused across addDocument() calls -- TODO confirm.
00341 indri::index::TermListBuilder _termList;
00342 Buffer _termListBuffer;
00343
00344 greedy_vector<indri::index::TermData*> _seenTerms;
00345
00346
// Hash tables held by pointer: int -> TermData*, and C string -> term_cache_entry*.
00347 HashTable<int, indri::index::TermData*>* _termDataTable;
00348 HashTable<const char*, term_cache_entry*>* _cache;
00349
00350 ReadBuffer* _documentStatisticsBuffer;
00351 ReadBuffer* _documentLengthBuffer;
00352
00353
// Size bookkeeping. Presumably byte counts derived from the constructor's
// memorySize / queryProportion arguments via _computeMemoryBounds() -- TODO confirm.
00354 size_t _listsSize;
00355 size_t _memorySize;
00356 size_t _termDataSize;
00357 size_t _termCacheSize;
00358 size_t _statisticsBufferSize;
00359 size_t _lengthBufferSize;
00360 float _queryProportion;
00361 bool _batchBuild;
00362
00363 INT64 _estimatePoint;
00364 INT64 _lastCacheFlush;
00365 };
00366
00367
00368 #endif // INDRI_INDEX_HPP
00369