DocListDiskBlockReader.hpp

/*==========================================================================
 * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
 *
 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
 * is subject to the terms of the software license set forth in the LICENSE
 * file included with this software, and also available at
 * http://www.lemurproject.org/license.html
 *
 *==========================================================================
*/


//
// DocListDiskBlockReader
//
// 9 January 2004 - tds
//

#ifndef LEMUR_KEYFILEDOCLISTDISKBLOCKREADER_HPP
#define LEMUR_KEYFILEDOCLISTDISKBLOCKREADER_HPP

#define INDRI_DOCLIST_BLOCKSIZE (8192)
#include <assert.h>   // assert() is used throughout the reader
#include <string.h>   // memcpy() in setBlock()
#include "RVLCompress.hpp"
#include "indri/DocListInfo.hpp"
#include "indri/DocumentCount.hpp"

namespace indri {
  namespace index {

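    //
    // Block layout, as implied by the reader code below (inferred from the
    // accessors; not an authoritative format description):
    //
    //   bytes [0, 2)   : term count (a short)
    //   bytes [2, ...) : one RVL-compressed, delta-coded document list per
    //                    term, stored back to back
    //
    // Trailer, working backward from byte INDRI_DOCLIST_BLOCKSIZE:
    //
    //   term IDs      : termCount 4-byte values, ascending, at the block end
    //   end offsets   : termCount 2-byte values, one per term's list
    //   last document : one 4-byte value, the final document ID in the block
    //
    // A term's list starts where the previous term's list ends (the first
    // list starts at byte 2), so the stored end offsets delimit every list.
    //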
    class DocListDiskBlockReader {
    private:
      const char* _block;

      const char* _list;
      const char* _listEnd;
      int _termIndex;
      INT16 _termCount;

      bool _finished;
      INT32 _termID;

      int _lastTerm;
      int _firstTerm;
      int _lastDocument;

      struct DocumentData {
        int documentID;
        int positionCount;
        int positions[ INDRI_DOCLIST_BLOCKSIZE ];
      } _documentData;

    public:
      DocListDiskBlockReader() : _finished(true) {
        setBlock(0);
      }

      static int blockSize() {
        return INDRI_DOCLIST_BLOCKSIZE;
      }

      void setBlock( const char* block ) {
        _block = block;

        _termCount = _block ? *(short*) _block : 0;

        _termID = 0;
        _list = _block + sizeof(short);
        _listEnd = _list;
        _documentData.documentID = 0;
        _documentData.positionCount = 0;

        const char* endBlock = _block + INDRI_DOCLIST_BLOCKSIZE;

        _lastTerm = (_termCount > 0) ? * (int*) (endBlock - sizeof(int)) : 0;
        _firstTerm = (_termCount > 0) ? * (int*) (endBlock - _termCount*sizeof(int)) : 0;
        _lastDocument = 0;

        if( _termCount > 0 )
          memcpy( &_lastDocument, endBlock - _termCount*(sizeof(int)+sizeof(short)) - sizeof(int), sizeof(int) );

        _termIndex = -1;
        _finished = (_termCount == 0);

        assert( _validateBlock() );
      }

      int lastTerm() {
        return _lastTerm;
      }

      int firstTerm() {
        return _firstTerm;
      }

      int lastDocument() {
        return _lastDocument;
      }

      bool hasMore() {
        return !( (_list == _listEnd) && (_termIndex + 1) == _termCount );
      }

    private:
      bool _trashDocument() {
        _documentData.documentID = MAX_INT32;
        _documentData.positionCount = MAX_INT32;
        _documentData.positions[0] = MAX_INT32;
        return true;
      }

      bool _validateBlock() {
        if( ! _block )
          return true;

        // validate termCount
        short tc = * (short*) _block;
        assert( tc > 0 );
        assert( tc < INDRI_DOCLIST_BLOCKSIZE / sizeof(int) );

        const char* endBlock = _block + INDRI_DOCLIST_BLOCKSIZE;
        UINT32* termIDs = ( (UINT32*) endBlock ) - _termCount;
        UINT16* offsets = ( (UINT16*) termIDs ) - _termCount;
        UINT32* lastDoc = ( (UINT32*) offsets ) - 1;
        char* beginMetaData = (char*) lastDoc;
        size_t maxDataLength = beginMetaData - _block;

        // validate offsets
        for( int i=0; i<_termCount; i++ ) {
          assert( offsets[i] > sizeof(short) );
          assert( offsets[i] <= maxDataLength );
          if( i>0 )
            assert( offsets[i] > offsets[i-1] );
        }

        // validate termIDs
        for( int i=0; i<_termCount; i++ ) {
          assert( termIDs[i] >= 0 );
          if( i>0 )
            assert( termIDs[i] > termIDs[i-1] );
        }

        // validate lastDoc
        assert( lastDocument() > 0 );
        return true;
      }

      void _skipDocumentLocations() {
        _list = RVLCompress::skip_ints( _list, _documentData.positionCount );
      }

      void _fetchDocumentHeader() {
        assert( _list < _listEnd );
        int lastDocument = _documentData.documentID;
        _list = RVLCompress::decompress_int( _list, _documentData.documentID );
        _list = RVLCompress::decompress_int( _list, _documentData.positionCount );
        _documentData.documentID += lastDocument;
        assert( _list + _documentData.positionCount <= _listEnd );
        assert( _documentData.documentID > 0 );
        assert( _documentData.positionCount >= 0 );
      }

      void _fetchDocumentPositions() {
        assert( _list < _listEnd );
        _list = RVLCompress::decompress_int_count( _list, _documentData.positions, _documentData.positionCount );

        for( int i=1; i<_documentData.positionCount; i++ ) {
          _documentData.positions[i] += _documentData.positions[i-1];
        }
        assert( _list <= _listEnd );
      }
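
      //
      // Example of the delta coding decoded above: if a document's position
      // list is stored as 5, 3, 10 (the first position followed by gaps),
      // the loop in _fetchDocumentPositions() reconstructs the absolute
      // positions 5, 8, 18.  Document IDs are delta-coded the same way;
      // _fetchDocumentHeader() adds the previous document ID to each newly
      // decoded value.
      //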

      void _fetchDocument() {
        _fetchDocumentHeader();
        _fetchDocumentPositions();
      }

      void _skipToTermIndex( int index ) {
        assert( index < _termCount );
        _termIndex = index;

        const char* endBlock = _block + INDRI_DOCLIST_BLOCKSIZE;
        const char* endEndings = endBlock - _termCount*sizeof(INT32);
        const char* beginEndings = endBlock - _termCount*(sizeof(INT32)+sizeof(INT16));
        INT32* termIDs = (INT32*) endEndings;
        INT16* offsets = (INT16*) beginEndings;

        short endOffset = offsets[ _termIndex ];
        short startOffset;
        if( _termIndex == 0 ) {
          startOffset = sizeof(short);
        } else {
          startOffset = offsets[ _termIndex - 1 ];
        }

        _termID = termIDs[ _termIndex ];
        _list = _block + startOffset;
        _listEnd = _block + endOffset;

        _documentData.documentID = 0;
        _documentData.positionCount = 0;

        // validate information
        assert( _list <= _listEnd );
        assert( startOffset > 0 && startOffset < INDRI_DOCLIST_BLOCKSIZE );
        assert( endOffset > startOffset && endOffset < INDRI_DOCLIST_BLOCKSIZE );
        assert( _termID <= lastTerm() );
        assert( _termID >= firstTerm() );
      }

      bool _skipToTerm( int termID ) {
        assert( _termID < termID );

        if( termID > lastTerm() ) {
          assert( _trashDocument() );
          _termIndex = _termCount;
          _finished = true;
          return true;
        }

        // find termID array
        const char* endBlock = _block + INDRI_DOCLIST_BLOCKSIZE;
        const char* beginTermIDs = endBlock - _termCount*sizeof(UINT32);
        UINT32* termIDs = (UINT32*) beginTermIDs;
        UINT32* start = termIDs;
        UINT32* finish = termIDs + _termCount - 1;

        // binary search for the right location
        while( (finish - start) >= 2 ) {
          UINT32* middle = start + (finish-start)/2;

          if( *middle > unsigned(termID) )
            finish = middle;
          else
            start = middle;
        }

        UINT32* termLocation;

        if( *start >= unsigned(termID) )
          termLocation = start;
        else
          termLocation = finish;

        // the term isn't here!
        if( *termLocation != unsigned(termID) )
          return false;

        // now, reset everything
        assert( (termLocation - termIDs) <= _termCount );
        assert( *termLocation == unsigned(termID) );

        _skipToTermIndex( int(termLocation - termIDs) );
        return true;
      }

    public:
      // Returns true if the requested (termID, documentID) pair
      // should be in a block after this one.  This
      // method is used for skipping--if this returns true,
      // you can safely skip to the next block in search of this
      // pair.
      bool before( int termID, int documentID ) {
        int last = lastTerm();
        int lastDoc = lastDocument();

        // the last termID in this block is smaller than the one
        // we want, so this whole block comes before the pair
        if( last < termID )
          return true;

        // the last termID in this block is this termID,
        // but the last documentID is smaller than the one
        // we want, so we're still smaller
        if( last == termID && lastDoc < documentID )
          return true;

        assert( last > termID || (termID == last && lastDoc >= documentID) );
        return false;
      }

      // Returns true if the requested (termID, documentID) pair
      // should be in this block.  This
      // method is used for skipping--if this returns false,
      // you can safely skip to the next block in search of this
      // pair.
      bool contains( int termID, int documentID ) {
        int last = lastTerm();
        int first = firstTerm();

        // if the requested term is not the last term in this block,
        // its postings cannot spill into the next block, so only
        // the term range matters
        if( termID != last )
          return termID >= first && termID <= last;

        // the requested term is the last term here, and its postings
        // may continue in the next block; the pair is only in this
        // block if its document ID does not exceed the last one stored
        return lastDocument() >= documentID;
      }

      // attempt to skip to (termID, documentID)
      // or to a document greater than documentID
      // but that still shares the same termID.
      // return true if successful.
      bool skip( int termID, int documentID ) {
        // attempt to skip to this term
        if( _termID < termID ) {
          if( !_skipToTerm( termID ) ) {
            return false;
          } else if( _list == _listEnd ) {
            // skipping to the term may have run us off
            // the end of the block; in that
            // case, return false without fetching
            return false;
          } else {
            // we moved forward to get here, so
            // we'd better fetch a document
            _fetchDocument();
          }
        } else if( _list == _listEnd ) {
          // if we're out of documents, or if we've
          // come to the next term, tell the caller
          // we were unsuccessful
          assert( _trashDocument() );
          return false;
        }

        // we have a good document, so return success
        if( _documentData.documentID >= documentID )
          return true;

        // keep looking! it has to be here somewhere!
        while( _list != _listEnd ) {
          _fetchDocumentHeader();

          if( _documentData.documentID >= documentID ) {
            _fetchDocumentPositions();
            return true;
          }

          _skipDocumentLocations();
        }

        // oh, well. no luck
        assert( _trashDocument() );
        return false;
      }

      void currentDocument( DocListInfo& info ) {
        assert( ! _finished );
        assert( _block != 0 );

        info.setTermID( _termID );
        info.setDocID( _documentData.documentID );
        // dmf FIXME
        info.addPositions((LOC_T*) _documentData.positions, _documentData.positionCount );
      }

      bool nextDocument() {
        if( _finished || _block == 0 ) {
          assert( _trashDocument() );
          _finished = true;
          return false;
        }

        if( _list == _listEnd ) {
          // go to the next term
          _termIndex++;

          if( _termIndex >= _termCount ) {
            assert( _trashDocument() );
            _finished = true;
            return false;
          }

          _skipToTermIndex( _termIndex );
        }

        _fetchDocument();
        return true;
      }

      bool readBlock( int termID, greedy_vector<DocumentCount>& entries ) {
        // skip to this term ID
        if( _termID != termID ) {
          if( !_skipToTerm(termID) ) {
            return false;
          }

          // we may have skipped to the end of the block;
          // in this case return true without fetching anything
          if( _list == _listEnd ) {
            return true;
          }
        }

        // check to see if we need to update the last entry
        DocumentCount pair;
        int lastDocument = 0;
        pair.document = 0;
        pair.count = 0;

        // fetch first document
        _list = RVLCompress::decompress_int( _list, pair.document );
        _list = RVLCompress::decompress_int( _list, pair.count );

        // special handling for the first document; it may be that the last
        // block had only part of the positions for this document
        if( entries.size() && entries.back().document == pair.document ) {
          entries.back().count += pair.count;
        } else {
          entries.push_back( pair );
        }

        _list = RVLCompress::skip_ints( _list, pair.count );
        lastDocument = pair.document;

        while( _list < _listEnd ) {
          _list = RVLCompress::decompress_int( _list, pair.document );
          _list = RVLCompress::decompress_int( _list, pair.count );
          _list = RVLCompress::skip_ints( _list, pair.count );
          pair.document += lastDocument;
          lastDocument = pair.document;

          entries.push_back( pair );
        }

        return true;
      }

      bool finished() {
        return _finished;
      }

      int document() const {
        return _documentData.documentID;
      }

      int term() const {
        return _termID;
      }
    };
  }
}
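
/*
  Usage sketch (illustrative only; fetchBlock(), termID, and documentID are
  hypothetical stand-ins for whatever supplies 8KB index blocks and the
  target posting; only the DocListDiskBlockReader calls come from the class
  above):

    indri::index::DocListDiskBlockReader reader;
    const char* block = fetchBlock();                 // hypothetical helper
    reader.setBlock( block );

    // Block-level skipping: while the pair lies entirely beyond this block,
    // move on to the next one.
    while( block && reader.before( termID, documentID ) ) {
      block = fetchBlock();                           // hypothetical helper
      if( block )
        reader.setBlock( block );
    }

    // Within-block skipping: land on (termID, documentID) or on the next
    // document for the same term.
    if( block && reader.contains( termID, documentID ) &&
        reader.skip( termID, documentID ) ) {
      // here reader.term() == termID and reader.document() >= documentID
    }

    // Alternatively, scan every posting in a block in order:
    reader.setBlock( block );
    while( reader.nextDocument() ) {
      // reader.term() and reader.document() identify the current posting
    }
*/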

#endif // LEMUR_KEYFILEDOCLISTDISKBLOCKREADER_HPP

