00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef LEMUR_KEYFILEDOCLISTDISKBLOCKREADER_HPP
00020 #define LEMUR_KEYFILEDOCLISTDISKBLOCKREADER_HPP
00021
00022 #define INDRI_DOCLIST_BLOCKSIZE (8192)
00023 #include "RVLCompress.hpp"
00024 #include "indri/DocListInfo.hpp"
00025 #include "indri/DocumentCount.hpp"
00026
00027 namespace indri {
00028 namespace index {
00029
00030 class DocListDiskBlockReader {
00031 private:
00032 const char* _block;
00033
00034 const char* _list;
00035 const char* _listEnd;
00036 int _termIndex;
00037 INT16 _termCount;
00038
00039 bool _finished;
00040 INT32 _termID;
00041
00042 int _lastTerm;
00043 int _firstTerm;
00044 int _lastDocument;
00045
00046 struct DocumentData {
00047 int documentID;
00048 int positionCount;
00049 int positions[ INDRI_DOCLIST_BLOCKSIZE ];
00050 } _documentData;
00051
00052 public:
00053 DocListDiskBlockReader() : _finished(true) {
00054 setBlock(0);
00055 }
00056
00057 static int blockSize() {
00058 return INDRI_DOCLIST_BLOCKSIZE;
00059 }
00060
00061 void setBlock( const char* block ) {
00062 _block = block;
00063
00064 _termCount = _block ? *(short*) _block : 0;
00065
00066 _termID = 0;
00067 _list = _block + sizeof(short);
00068 _listEnd = _list;
00069 _documentData.documentID = 0;
00070 _documentData.positionCount = 0;
00071
00072 const char* endBlock = _block + INDRI_DOCLIST_BLOCKSIZE;
00073
00074 _lastTerm = (_termCount > 0) ? * (int*) (endBlock - sizeof(int)) : 0;
00075 _firstTerm = (_termCount > 0) ? * (int*) (endBlock - _termCount*sizeof(int)) : 0;
00076 _lastDocument = 0;
00077
00078 if( _termCount > 0 )
00079 memcpy( &_lastDocument, endBlock - _termCount*(sizeof(int)+sizeof(short)) - sizeof(int), sizeof(int) );
00080
00081 _termIndex = -1;
00082 _finished = (_termCount == 0);
00083
00084 assert( _validateBlock() );
00085 }
00086
00087 int lastTerm() {
00088 return _lastTerm;
00089 }
00090
00091 int firstTerm() {
00092 return _firstTerm;
00093 }
00094
00095 int lastDocument() {
00096 return _lastDocument;
00097 }
00098
00099 bool hasMore() {
00100 return !( (_list == _listEnd) && (_termIndex + 1) == _termCount );
00101 }
00102
00103 private:
00104 bool _trashDocument() {
00105 _documentData.documentID = MAX_INT32;
00106 _documentData.positionCount = MAX_INT32;
00107 _documentData.positions[0] = MAX_INT32;
00108 return true;
00109 }
00110
00111 bool _validateBlock() {
00112 if( ! _block )
00113 return true;
00114
00115
00116 short tc = * (short*) _block;
00117 assert( tc > 0 );
00118 assert( tc < INDRI_DOCLIST_BLOCKSIZE / sizeof(int) );
00119
00120 const char* endBlock = _block + INDRI_DOCLIST_BLOCKSIZE;
00121 UINT32* termIDs = ( (UINT32*) (_block + INDRI_DOCLIST_BLOCKSIZE) ) - _termCount;
00122 UINT16* offsets = ( (UINT16*) termIDs ) - _termCount;
00123 UINT32* lastDoc = ( (UINT32*) offsets ) - 1;
00124 char* beginMetaData = (char*) lastDoc;
00125 size_t maxDataLength = beginMetaData - _block;
00126
00127
00128 for( int i=0; i<_termCount; i++ ) {
00129 assert( offsets[i] > sizeof(short) );
00130 assert( offsets[i] <= maxDataLength );
00131 if( i>0 )
00132 assert( offsets[i] > offsets[i-1] );
00133 }
00134
00135
00136 for( int i=0; i<_termCount; i++ ) {
00137 assert( termIDs[i] >= 0 );
00138 if( i>0 )
00139 assert( termIDs[i] > termIDs[i-1] );
00140 }
00141
00142
00143 assert( lastDocument() > 0 );
00144 return true;
00145 }
00146
00147 void _skipDocumentLocations() {
00148 _list = RVLCompress::skip_ints( _list, _documentData.positionCount );
00149 }
00150
00151 void _fetchDocumentHeader() {
00152 assert( _list < _listEnd );
00153 int lastDocument = _documentData.documentID;
00154 _list = RVLCompress::decompress_int( _list, _documentData.documentID );
00155 _list = RVLCompress::decompress_int( _list, _documentData.positionCount );
00156 _documentData.documentID += lastDocument;
00157 assert( _list + _documentData.positionCount <= _listEnd );
00158 assert( _documentData.documentID > 0 );
00159 assert( _documentData.positionCount >= 0 );
00160 }
00161
00162 void _fetchDocumentPositions() {
00163 assert( _list < _listEnd );
00164 _list = RVLCompress::decompress_int_count( _list, _documentData.positions, _documentData.positionCount );
00165
00166 for( int i=1; i<_documentData.positionCount; i++ ) {
00167 _documentData.positions[i] += _documentData.positions[i-1];
00168 }
00169 assert( _list <= _listEnd );
00170 }
00171
00172 void _fetchDocument() {
00173 _fetchDocumentHeader();
00174 _fetchDocumentPositions();
00175 }
00176
00177 void _skipToTermIndex( int index ) {
00178 assert( index < _termCount );
00179 _termIndex = index;
00180
00181 const char* endBlock = _block + INDRI_DOCLIST_BLOCKSIZE;
00182 const char* endEndings = endBlock - _termCount*sizeof(INT32);
00183 const char* beginEndings = endBlock - _termCount*(sizeof(INT32)+sizeof(INT16));
00184 INT32* termIDs = (INT32*) endEndings;
00185 INT16* offsets = (INT16*) beginEndings;
00186
00187 short endOffset = offsets[ _termIndex ];
00188 short startOffset;
00189 if( _termIndex == 0 ) {
00190 startOffset = sizeof(short);
00191 } else {
00192 startOffset = offsets[ _termIndex - 1 ];
00193 }
00194
00195 _termID = termIDs[ _termIndex ];
00196 _list = _block + startOffset;
00197 _listEnd = _block + endOffset;
00198
00199 _documentData.documentID = 0;
00200 _documentData.positionCount = 0;
00201
00202
00203 assert( _list <= _listEnd );
00204 assert( startOffset > 0 && startOffset < INDRI_DOCLIST_BLOCKSIZE );
00205 assert( endOffset > startOffset && endOffset < INDRI_DOCLIST_BLOCKSIZE );
00206 assert( _termID <= lastTerm() );
00207 assert( _termID >= firstTerm() );
00208 }
00209
00210 bool _skipToTerm( int termID ) {
00211 assert( _termID < termID );
00212
00213 if( termID > lastTerm() ) {
00214 assert( _trashDocument() );
00215 _termIndex = _termCount;
00216 _finished = true;
00217 return true;
00218 }
00219
00220
00221 const char* endBlock = _block + INDRI_DOCLIST_BLOCKSIZE;
00222 const char* beginTermIDs = endBlock - _termCount*sizeof(UINT32);
00223 UINT32* termIDs = (UINT32*) beginTermIDs;
00224 UINT32* start = termIDs;
00225 UINT32* finish = termIDs + _termCount - 1;
00226
00227
00228 while( (finish - start) >= 2 ) {
00229 UINT32* middle = start + (finish-start)/2;
00230
00231 if( *middle > unsigned(termID) )
00232 finish = middle;
00233 else
00234 start = middle;
00235 }
00236
00237 UINT32* termLocation;
00238
00239 if( *start >= unsigned(termID) )
00240 termLocation = start;
00241 else
00242 termLocation = finish;
00243
00244
00245 if( *termLocation != unsigned(termID) )
00246 return false;
00247
00248
00249 assert( (termLocation - termIDs) <= _termCount );
00250 assert( *termLocation == unsigned(termID) );
00251
00252 _skipToTermIndex( int(termLocation - termIDs) );
00253 return true;
00254 }
00255
00256 public:
00257
00258
00259
00260
00261
00262 bool before( int termID, int documentID ) {
00263 int last = lastTerm();
00264 int lastDoc = lastDocument();
00265
00266
00267
00268 if( last < termID )
00269 return true;
00270
00271
00272
00273
00274 if( last == termID && lastDoc < documentID )
00275 return true;
00276
00277 assert( last > termID || (termID == last && lastDoc >= documentID) );
00278 return false;
00279 }
00280
00281
00282
00283
00284
00285
00286 bool contains( int termID, int documentID ) {
00287 int last = lastTerm();
00288 int first = firstTerm();
00289
00290
00291
00292
00293 if( termID != last )
00294 return termID >= first && termID <= last;
00295
00296 return lastDocument() >= documentID;
00297 }
00298
00299
00300
00301
00302
00303 bool skip( int termID, int documentID ) {
00304
00305 if( _termID < termID ) {
00306 if( !_skipToTerm( termID ) ) {
00307 return false;
00308 } else if( _list == _listEnd ) {
00309
00310
00311
00312 return false;
00313 } else {
00314
00315
00316 _fetchDocument();
00317 }
00318 } else if( _list == _listEnd ) {
00319
00320
00321
00322 assert( _trashDocument() );
00323 return false;
00324 }
00325
00326
00327 if( _documentData.documentID >= documentID )
00328 return true;
00329
00330
00331 while( _list != _listEnd ) {
00332 _fetchDocumentHeader();
00333
00334 if( _documentData.documentID >= documentID ) {
00335 _fetchDocumentPositions();
00336 return true;
00337 }
00338
00339 _skipDocumentLocations();
00340 }
00341
00342
00343 assert( _trashDocument() );
00344 return false;
00345 }
00346
00347 void currentDocument( DocListInfo& info ) {
00348 assert( ! _finished );
00349 assert( _block != 0 );
00350
00351 info.setTermID( _termID );
00352 info.setDocID( _documentData.documentID );
00353
00354 info.addPositions((LOC_T*) _documentData.positions, _documentData.positionCount );
00355 }
00356
00357 bool nextDocument() {
00358 if( _finished || _block == 0 ) {
00359 assert( _trashDocument() );
00360 _finished = true;
00361 return false;
00362 }
00363
00364 if( _list == _listEnd ) {
00365
00366 _termIndex++;
00367
00368 if( _termIndex >= _termCount ) {
00369 assert( _trashDocument() );
00370 _finished = true;
00371 return false;
00372 }
00373
00374 _skipToTermIndex( _termIndex );
00375 }
00376
00377 _fetchDocument();
00378 return true;
00379 }
00380
00381 bool readBlock( int termID, greedy_vector<DocumentCount>& entries ) {
00382
00383 if( _termID != termID ) {
00384 if( !_skipToTerm(termID) ) {
00385 return false;
00386 }
00387
00388
00389
00390 if( _list == _listEnd ) {
00391 return true;
00392 }
00393 }
00394
00395
00396 DocumentCount pair;
00397 int lastDocument = 0;
00398 pair.document = 0;
00399 pair.count = 0;
00400
00401
00402 _list = RVLCompress::decompress_int( _list, pair.document );
00403 _list = RVLCompress::decompress_int( _list, pair.count );
00404
00405
00406
00407 if( entries.size() && entries.back().document == pair.document ) {
00408 entries.back().count += pair.count;
00409 } else {
00410 entries.push_back( pair );
00411 }
00412
00413 _list = RVLCompress::skip_ints( _list, pair.count );
00414 lastDocument = pair.document;
00415
00416 while( _list < _listEnd ) {
00417 _list = RVLCompress::decompress_int( _list, pair.document );
00418 _list = RVLCompress::decompress_int( _list, pair.count );
00419 _list = RVLCompress::skip_ints( _list, pair.count );
00420 pair.document += lastDocument;
00421 lastDocument = pair.document;
00422
00423 entries.push_back( pair );
00424 }
00425
00426 return true;
00427 }
00428
00429 bool finished() {
00430 return _finished;
00431 }
00432
00433 int document() const {
00434 return _documentData.documentID;
00435 }
00436
00437 int term() const {
00438 return _termID;
00439 }
00440 };
00441 }
00442 }
00443
00444 #endif // LEMUR_KEYFILEDOCLISTDISKBLOCKREADER_HPP
00445
00446