Main Page   Namespace List   Class Hierarchy   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

File.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.cs.cmu.edu/~lemur/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // File.hpp
00015 //
00016 // 1 December 2003 - tds
00017 //
00018 
00019 #ifndef LEMUR_FILE_HPP
00020 #define LEMUR_FILE_HPP
00021 
00022 #include <vector>
00023 #include <sstream>
00024 #include <fstream>
00025 #include <cassert>
00026 
// Page granularity used when sizing segments.
//#define FILE_PAGE_SIZE              (4096)
#define FILE_PAGE_SIZE              (8192)
#define FILE_1GB                    (1024 * 1024 * 1024)
// Written as a sum so the value (2^31 - 1) never overflows a 32-bit int.
#define FILE_2GB_MINUS_1            (FILE_1GB + (FILE_1GB - 1))
// Largest number of bytes stored in one segment: 2^31 - FILE_PAGE_SIZE,
// i.e. the largest multiple of the page size below 2GB.
#define FILE_MAXIMUM_SEGMENT_SIZE   (FILE_2GB_MINUS_1 - FILE_PAGE_SIZE + 1)

/// Presents a sequence of on-disk segment files named "<fileName>$0",
/// "<fileName>$1", ... as a single logical file with an fstream-like
/// interface (open/close/read/write/seekg/seekp/tellg/tellp/gcount/rdstate).
/// write() never puts more than FILE_MAXIMUM_SEGMENT_SIZE bytes into one
/// segment; once a segment is full, writing continues in the next one.
///
/// Independent logical read and write positions are kept; the underlying
/// stream is re-seeked lazily whenever the caller switches between reading
/// and writing or seeks explicitly.
///
/// A File owns its open segments and is therefore not copyable.
class File {
public:
#ifdef WIN32
  /// Type used for positions and sizes within the logical file.
  typedef __int64 offset_type;
  /// Type used for offsets handed to the underlying stream of one segment.
  typedef int library_offset_type;
  // NOTE: this macro leaks out of the header; it is kept unchanged because
  // other translation units may already depend on it.
  #define _UNLINK ::unlink
#else
  /// Type used for positions and sizes within the logical file.
  typedef off_t offset_type;
  /// Type used for offsets handed to the underlying stream of one segment.
  typedef off_t library_offset_type;
  // NOTE: this macro leaks out of the header; it is kept unchanged because
  // other translation units may already depend on it.
  #define _UNLINK std::remove
#endif


private:
  /// One physical file plus the half-open range [start, end) of logical
  /// file positions that it stores.
  struct FileSegment {
    std::fstream stream;
    offset_type start;
    offset_type end;

    /// True when position lies inside [start, end).
    bool contains( offset_type position ) const {
      return start <= position && end > position;
    }

    /// True when the whole segment lies before position.
    bool before( offset_type position ) const {
      return end <= position;
    }
  };

  std::string _fileName;
  std::vector<FileSegment*> _segments;   // owned; released in close()

  FileSegment* _readSegment;             // last segment used for reading (search hint)
  FileSegment* _writeSegment;            // last segment used for writing (search hint)
  offset_type _readPosition; // seekg, read
  offset_type _writePosition; // seekp, write
  offset_type _readCount;                // bytes delivered by the last read()
  bool _readPointerValid;                // stream get pointer matches _readPosition
  bool _writePointerValid;               // stream put pointer matches _writePosition
  int _mode;
  int _state;

  // Copying would make two objects delete the same segments, so it is
  // disabled (declared private and intentionally left undefined).
  File( const File& );
  File& operator=( const File& );

  /// Builds the on-disk name of segment number `segment` of `fileName`.
  static std::string segmentName( const std::string& fileName, int segment ) {
    std::stringstream segName;
    segName << fileName << "$" << segment;
    return segName.str();
  }

  /// Creates (or truncates) the next segment file and appends it, empty,
  /// at the current end of the logical file.
  void _appendSegment() {
    FileSegment* segment = new FileSegment();
    int number = (int)_segments.size();
    std::string name = segmentName( _fileName, number );

    segment->start = size();
    segment->end = size();

    // Honor the read bit of the mode given to open(): a segment opened for
    // output only cannot be read back, and a failed read would set failbit
    // on the shared stream and silently break later writes as well.
    std::ios_base::openmode flags =
      std::ios_base::out | std::ios_base::trunc | std::ios_base::binary;
    if( _mode & std::ios_base::in ) {
      flags = flags | std::ios_base::in;
    }
    segment->stream.open( name.c_str(), flags );

    _segments.push_back( segment );
  }

  /// Converts a (relative offset, seek direction) pair into an absolute
  /// position in the logical file.
  /// @param relativePosition offset relative to the origin chosen by direction.
  /// @param currentPosition  the position that std::fstream::cur refers to.
  /// @param direction        std::fstream::beg, cur or end.
  /// @return the absolute position; currentPosition is returned unchanged
  ///         for an unrecognized direction.
  offset_type _absolutePosition( offset_type relativePosition,
    offset_type currentPosition,
    std::fstream::seekdir direction ) const {
    offset_type newPosition = currentPosition;

    switch( direction ) {
      case std::fstream::beg:
        newPosition = relativePosition;
        break;

      case std::fstream::cur:
        newPosition = currentPosition + relativePosition;
        break;

      case std::fstream::end:
        newPosition = size() + relativePosition;
        break;

      default:
        // unknown direction: leave the position where it was
        break;
    }
    // seeking off the end of a file is not currently implemented
    //    assert( newPosition <= size() );

    return newPosition;
  }

  /// Finds the segment that stores absolutePosition.
  /// Callers must make sure at least one segment exists.
  /// @param absolutePosition position in the logical file.
  /// @param guess            segment to try first (may be null).
  /// @return the containing segment; the final segment when the position
  ///         is at or beyond the end of the file.
  FileSegment* _segmentForPosition( offset_type absolutePosition,
                                    FileSegment* guess ) {
    //    assert( absolutePosition <= size() );
    //    assert( absolutePosition >= 0 );
    //    assert( _segments.size() );

    // see if the guess was good
    if( guess && guess->contains( absolutePosition ) ) {
      return guess;
    }

    std::vector<FileSegment*>::iterator low = _segments.begin();
    std::vector<FileSegment*>::iterator high = _segments.end() - 1;

    // if the user wants a position equal to or after the end of the file,
    // pick the final segment
    if( (*high)->end <= absolutePosition ) {
      return *high;
    }

    // binary search over the segments, which are ordered by position
    while( high - low > 1 ) {
      std::vector<FileSegment*>::iterator middle = low + ((high - low) / 2);

      if( (*middle)->before( absolutePosition ) ) {
        low = middle;
      } else {
        high = middle;
      }
    }

    return (*high)->contains( absolutePosition ) ? *high : *low;
  }

  /// Makes the stream get pointer agree with _readPosition. Re-seeking for
  /// a read invalidates the write pointer, so the next write re-seeks too.
  void _validateReadPointer() {
    if( !_readPointerValid ) {
      _readSegment = _segmentForPosition( _readPosition, _readSegment );
      library_offset_type toHere = _readPosition - _readSegment->start;
      _readSegment->stream.seekg( toHere , std::ios::beg );
      _readPointerValid = true;
      _writePointerValid = false;
    }

    //    assert( _readPosition <= _readSegment->end );
    //    assert( _readPosition >= _readSegment->start );
    //    assert( _readPosition == ( library_offset_type(_readSegment->stream.tellg()) + _readSegment->start) );
  }

  /// Makes the stream put pointer agree with _writePosition. Re-seeking for
  /// a write invalidates the read pointer, so the next read re-seeks too.
  void _validateWritePointer() {
    if( !_writePointerValid ) {
      _writeSegment = _segmentForPosition( _writePosition, _writeSegment );
      library_offset_type toHere = _writePosition - _writeSegment->start;
      _writeSegment->stream.seekp( toHere , std::ios::beg );
      _writePointerValid = true;
      _readPointerValid = false;
    }

    //    assert( _writePosition <= _writeSegment->end );
    //    assert( _writePosition >= _writeSegment->start );
    //    assert( _writePosition == ( library_offset_type(_writeSegment->stream.tellp()) + _writeSegment->start) );
  }

public:
  /// Constructs a closed file: no segments, size 0, both positions at 0.
  File() :
    _readSegment( 0 ),
    _writeSegment( 0 ),
    _readPosition( 0 ),
    _writePosition( 0 ),
    _readCount( 0 ),
    _readPointerValid( false ),
    _writePointerValid( false ),
    _mode( 0 ),
    _state( std::fstream::goodbit )
  {
  }

  /// Closes any segments that are still open.
  ~File() {
    close();
  }

  /// Opens the logical file fileName, closing whatever was open before.
  ///
  /// Existing segments "<fileName>$0", "<fileName>$1", ... are opened in
  /// order until one is missing. Binary mode is always forced.
  ///  - trunc: every existing segment is deleted.
  ///  - out:   existing segments are reopened for writing. As with
  ///           std::fstream, plain `out` (without `in` or `app`) discards
  ///           their contents; with `in` or `app` the contents are kept.
  ///           If no segment remains, a new empty one is created.
  ///  - ate / app: both positions start at the end of the file, otherwise
  ///           at the beginning.
  /// If no segment could be opened or created, failbit is reported by
  /// rdstate().
  ///
  /// @param fileName base name of the segment files.
  /// @param mode     combination of std::fstream open mode bits.
  void open( const std::string& fileName, int mode ) {
    close();

    _readPosition = 0;
    _writePosition = 0;
    _readCount = 0;

    _readPointerValid = false;
    _writePointerValid = false;

    _mode = mode | std::fstream::binary; // must be in binary mode
    _state = std::fstream::goodbit;
    _fileName = fileName;

    const bool truncating = ( mode & std::fstream::trunc ) != 0;
    const bool writing = ( mode & std::fstream::out ) != 0;

    // open all existing segments
    for( int i=0; ; i++ ) {
      FileSegment* segment = new FileSegment();
      std::string name = segmentName( _fileName, i );

      // first open read-only, purely to find out whether the segment
      // exists (opening for output would create files that don't exist)
      segment->stream.open( name.c_str(),
                            std::ifstream::in | std::ifstream::binary );

      if( segment->stream.rdstate() & std::fstream::failbit ) {
        // segment didn't open, so assume that we're done opening
        delete segment;
        break;
      }

      if( truncating ) {
        // segment opened properly, but we're doing a trunc, so
        // this segment must go away
        segment->stream.close();
        delete segment;
        _UNLINK( name.c_str() );
        continue;
      }

      // segment opened properly and we'd like to keep it

      // if we want write access, we need to close and reopen the file
      if( writing ) {
        // Use _mode (not mode) so that binary is always set.
        int flags = _mode & ( std::fstream::binary |
                              std::fstream::in |
                              std::fstream::out );

        // app must keep what is already stored; `out` on its own would
        // truncate the segment, so ask for read/write access instead.
        if( mode & std::fstream::app ) {
          flags |= std::fstream::in;
        }

        segment->stream.close();
        segment->stream.open( name.c_str(),
                              static_cast<std::ios_base::openmode>( flags ) );

        if( segment->stream.rdstate() & std::fstream::failbit ) {
          delete segment;
          break;
        }
      }

      // set up segment statistics and add to the segment vector;
      // _readPosition doubles as the running total length here
      segment->stream.seekg( 0, std::fstream::end );
      offset_type length = segment->stream.tellg();

      segment->start = _readPosition;
      segment->end = _readPosition + length;
      _readPosition += length;

      _segments.push_back( segment );
    }

    // if no segments exist but we plan to write something, make a segment
    if( _segments.empty() && writing ) {
      _appendSegment();
    }

    // if there are still no segments, quit and fail, otherwise finish setup
    if( _segments.empty() ) {
      _readPosition = 0;
      _state |= std::fstream::failbit;
      return;
    }

    // initialize these to something reasonable
    _readPosition = 0;
    _writePosition = 0;
    _readSegment = _segments[0];
    _writeSegment = _segments[0];

    // seek to the end of the file if necessary for certain flags
    // otherwise set up the pointers at file begin
    if( mode & (std::fstream::ate|std::fstream::app) ) {
      seekg( 0, std::fstream::end );
      seekp( 0, std::fstream::end );
    } else {
      seekg( 0, std::fstream::beg );
      seekp( 0, std::fstream::beg );
    }
  }

  /// Closes and releases every open segment. Safe to call repeatedly.
  void close() {
    std::vector<FileSegment*>::iterator iter;

    for( iter = _segments.begin(); iter != _segments.end(); ++iter ) {
      (*iter)->stream.close();
      delete (*iter);
    }

    _segments.clear();

    // the cached segments were just deleted; don't keep dangling pointers
    _readSegment = 0;
    _writeSegment = 0;
    _readPointerValid = false;
    _writePointerValid = false;
  }

  /// Reads up to count bytes from the read position into buffer, crossing
  /// segment boundaries as needed and stopping at the end of the file.
  /// The number of bytes obtained is available from gcount(). On a file
  /// with no open segments nothing is read and failbit is set.
  /// @param buffer destination with room for at least count bytes.
  /// @param count  number of bytes requested.
  void read( void* buffer, offset_type count ) {
    _readCount = 0;

    if( _segments.empty() ) {
      _state |= std::fstream::failbit;
      return;
    }

    char* bytes = static_cast<char*>( buffer );
    offset_type readAmount;
    _validateReadPointer();

    // while the request reaches the end of the current segment, drain
    // that segment and step to the following one
    while( ((count - _readCount + _readPosition) >= _readSegment->end) &&
           _readPosition != size() ) {
      readAmount = _readSegment->end - _readPosition;
      _readSegment->stream.read( bytes + _readCount, library_offset_type(readAmount) );

      _readPosition += readAmount;
      _readCount += readAmount;
      _readPointerValid = false;
      _validateReadPointer();
    }

    // whatever is left lies strictly inside the current segment
    if( _readPosition != size() ) {
      _readSegment->stream.read( bytes + _readCount, library_offset_type(count - _readCount) );
      readAmount = library_offset_type(_readSegment->stream.gcount());
      _readPosition += readAmount;
      _readCount += readAmount;
    }

    //    assert((int)_readSegment->stream.tellg() != -1 || size() == _readPosition );
  }

  /// Writes count bytes from buffer at the write position. When a segment
  /// reaches FILE_MAXIMUM_SEGMENT_SIZE, writing continues in the following
  /// segment, which is created if the full segment was the last one. On a
  /// file with no open segments nothing is written and failbit is set.
  /// @param buffer source bytes.
  /// @param count  number of bytes to write.
  void write( const void* buffer, offset_type count ) {
    if( _segments.empty() ) {
      _state |= std::fstream::failbit;
      return;
    }

    const char* bytes = static_cast<const char*>( buffer );
    offset_type bytesWritten = 0;
    offset_type writeAmount = 0;
    _validateWritePointer();

    while( ((count - bytesWritten) + (_writePosition - _writeSegment->start)) >= FILE_MAXIMUM_SEGMENT_SIZE ) {
      // fill the current segment up to its maximum size
      writeAmount = FILE_MAXIMUM_SEGMENT_SIZE - (_writePosition - _writeSegment->start);

      _writeSegment->stream.write( bytes + bytesWritten, library_offset_type(writeAmount) );
      _writePosition += writeAmount;
      bytesWritten += writeAmount;

      if( _writePosition > _writeSegment->end ) {
        _writeSegment->end = _writePosition;
      }

      // Only grow the file when its last segment was filled; when
      // overwriting inside a multi-segment file the next segment already
      // exists and appending another would leave a stray empty segment.
      if( _writeSegment == _segments.back() ) {
        _appendSegment();
      }

      _writePointerValid = false;
      _validateWritePointer();
    }

    writeAmount = count - bytesWritten;
    _writeSegment->stream.write( bytes + bytesWritten, library_offset_type(writeAmount) );
    _writePosition += writeAmount;
    bytesWritten += writeAmount;

    if( _writePosition > _writeSegment->end ) {
      _writeSegment->end = _writePosition;
    }

    //    assert( (int)_writeSegment->stream.tellp() != -1 );
  }

  /// Moves the read position; the stream itself is re-seeked lazily by the
  /// next read().
  void seekg( offset_type relativePosition, std::fstream::seekdir direction ) {
    _readPosition = _absolutePosition( relativePosition, _readPosition, direction );
    _readPointerValid = false;
  }

  /// Moves the write position; the stream itself is re-seeked lazily by
  /// the next write().
  void seekp( offset_type relativePosition, std::fstream::seekdir direction ) {
    _writePosition = _absolutePosition( relativePosition, _writePosition, direction );
    _writePointerValid = false;
  }

  /// @return the current read position.
  offset_type tellg() const {
    return _readPosition;
  }

  /// @return the current write position.
  offset_type tellp() const {
    return _writePosition;
  }

  /// @return the number of bytes obtained by the most recent read().
  offset_type gcount() const {
    return _readCount;
  }

  /// @return the state bits of this file, with eofbit added whenever the
  ///         read position is at the end of the file.
  int rdstate() const {
    if( size() == _readPosition )
      return _state | std::fstream::eofbit;
    else
      return _state;
  }

  /// @return the length of the logical file in bytes (0 when closed).
  offset_type size() const {
    if( _segments.empty() ) {
      return 0;
    } else {
      return _segments.back()->end;
    }
  }

  /// Closes this file and deletes all of its segments from disk.
  /// Does nothing beyond closing when no file name has been set.
  void unlink() {
    close();

    // an empty name would otherwise target files literally named "$0", "$1", ...
    if( !_fileName.empty() ) {
      File::unlink( _fileName );
    }

    _fileName = "";
  }

  /// Deletes the segments "<fileName>$0", "<fileName>$1", ... from disk,
  /// stopping at the first one that cannot be removed.
  static void unlink( const std::string& fileName ) {
    for( int i=0; ; i++ ) {
      std::string segment = segmentName( fileName, i );

      if( _UNLINK( segment.c_str() ) != 0 ) {
        break;
      }
    }
  }

  /// Renames the segments of oldName to be the segments of newName,
  /// stopping at the first segment that cannot be renamed.
  static void rename( const std::string& oldName, const std::string& newName ) {
    for( int i=0; ; i++ ) {
      std::string oldSegment = segmentName( oldName, i );
      std::string newSegment = segmentName( newName, i );

      if( ::rename( oldSegment.c_str(), newSegment.c_str() ) != 0 ) {
        break;
      }
    }
  }
};
00445 
00446 #endif // LEMUR_FILE_HPP

Generated on Fri Feb 6 07:11:46 2004 for LEMUR by doxygen1.2.16