Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

DocListMemoryBuilder.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // DocListMemoryBuilder.hpp
00015 //
00016 // tds - 17 December 2003
00017 //
00018 
00019 #ifndef LEMUR_KEYFILEDOCLISTMEMORYBUILDER_HPP
00020 #define LEMUR_KEYFILEDOCLISTMEMORYBUILDER_HPP
00021 
00022 #include "RVLCompress.hpp"
00023 #include <vector>
00024 #include <assert.h>
00025 #include "indri/greedy_vector"
00026 
00027 namespace indri {
00028   namespace index {
00029     class DocListMemoryBuilderIterator {
00030       const greedy_vector< std::pair<char*, char*>, 4 >& _lists;
00031       greedy_vector< std::pair<char*, char*>, 4 >::const_iterator _current;
00032 
00033       const char* _list;
00034       const char* _listEnd;
00035 
00036       int _currentDocument;
00037       int _currentPosition;
00038       int _positionsLeft;
00039 
00040     public:
00041       DocListMemoryBuilderIterator( const greedy_vector< std::pair<char*,char*>, 4 >& lists ) :
00042         _lists(lists)
00043       {
00044         _current = _lists.begin();
00045         _currentDocument = 0;
00046         _currentPosition = 0;
00047         _positionsLeft = 0;
00048         _list = 0;
00049         _listEnd = 0;
00050 
00051         if( _current != _lists.end() ) {
00052           _list = _current->first;
00053           _listEnd = _current->second;
00054         }
00055       }
00056 
00057       bool next() {
00058         if( _list < _listEnd ) {
00059           if( _positionsLeft > 0 ) {
00060             // we have more positions left in the current document
00061             int deltaPosition;
00062             _list = RVLCompress::decompress_int( _list, deltaPosition );
00063             _currentPosition += deltaPosition;
00064             _positionsLeft--;
00065             return true;
00066           } else {
00067             // no positions left, but we have more documents to read
00068             int deltaDocument;
00069             _list = RVLCompress::decompress_int( _list, deltaDocument );
00070             _list = RVLCompress::decompress_int( _list, _positionsLeft );
00071             _list = RVLCompress::decompress_int( _list, _currentPosition );
00072             _currentDocument += deltaDocument;
00073             _positionsLeft--;
00074             return true;
00075           }
00076         } else {    
00077           assert( _list == _listEnd );
00078 
00079           // no data left, go to the next segment
00080           if( _current != _lists.end() )
00081             _current++;
00082           
00083           if( _current != _lists.end() ) {
00084             _list = _current->first;
00085             _listEnd = _current->second;
00086             return next();
00087           }
00088 
00089           // no more list segments
00090           return false;
00091         }
00092       }
00093 
00094       int document() {
00095         return _currentDocument;
00096       }
00097 
00098       int position() {
00099         return _currentPosition;
00100       }
00101     };
00102 
00103     class DocListMemoryBuilder {
00104     public:
00105       typedef DocListMemoryBuilderIterator iterator;
00106 
00107     private:
00108       int _documentFrequency;
00109       int _termFrequency;
00110 
00111       greedy_vector< std::pair<char*,char*>, 4 > _lists;
00112 
00113       char* _list;
00114       char* _listBegin;
00115       char* _listEnd;
00116       char* _locationCountPointer;
00117 
00118       int _lastLocation;
00119       int _lastDocument;
00120       int _lastTermFrequency;
00121 
00122       void _storeCompressedInt( std::vector<char>& destination, int data, int previous = 0 );
00123       void _createDocument( int docID );
00124       void _writeLocation( int location );
00125       void _terminateDocument();
00126       void _terminateSegment();
00127       void _grow();
00128       void _copy( DocListMemoryBuilder& other );
00129 
00130     public:
00131       DocListMemoryBuilder();
00132       const DocListMemoryBuilder& operator=( DocListMemoryBuilder& other );
00133       void addLocation( int docID, int location );
00134       void clear();
00135       void close();
00136       iterator getIterator();
00137       bool empty();
00138 
00139       int documentFrequency() const;
00140       int termFrequency() const;
00141       size_t memorySize() const;
00142       int curDocID() const;
00143     };
00144   }
00145 }
00146 
00147 #endif // LEMUR_KEYFILEDOCLISTMEMORYBUILDER_HPP
00148 

Generated on Wed Nov 3 12:58:54 2004 for Lemur Toolkit by doxygen1.2.18