Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

File.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.cs.cmu.edu/~lemur/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // File.hpp
00015 //
00016 // 1 December 2003 - tds
00017 //
00018 
00019 #ifndef LEMUR_FILE_HPP
00020 #define LEMUR_FILE_HPP
00021 
00022 #include <vector>
00023 #include <sstream>
00024 #include <fstream>
00025 #include <cassert>
00026 
00027 //#define FILE_PAGE_SIZE              (4096)
#define FILE_PAGE_SIZE              (8192)
#define FILE_1GB                    (1024 * 1024 * 1024)
#define FILE_2GB_MINUS_1            (FILE_1GB + (FILE_1GB - 1))
#define FILE_MAXIMUM_SEGMENT_SIZE   (FILE_2GB_MINUS_1 - FILE_PAGE_SIZE + 1)

/**
 * A logical file that is stored on disk as a sequence of segment files.
 *
 * A File named "name" is backed by the files "name$0", "name$1", ... and
 * exposes them as one contiguous byte range.  No segment is allowed to grow
 * past FILE_MAXIMUM_SEGMENT_SIZE bytes; write() starts a new segment when the
 * current one is full.  The interface mirrors the parts of std::fstream that
 * are implemented here (open/close/read/write/seekg/seekp/tellg/tellp/
 * gcount/rdstate).
 *
 * A File owns its segments and therefore cannot be copied.
 */
class File {
public:
#ifdef WIN32

  /// type used for positions within the logical (multi-segment) file
  typedef __int64 offset_type;
  /// type used for positions handed to the underlying stream library
  typedef int library_offset_type;
  #define _UNLINK ::unlink
#else

  /// type used for positions within the logical (multi-segment) file
  typedef off_t offset_type;
  /// type used for positions handed to the underlying stream library
  typedef off_t library_offset_type;
  #define _UNLINK std::remove
#endif


private:
  /// One physical segment file and the logical byte range [start, end) it holds.
  struct FileSegment {
    std::fstream stream;
    offset_type start;
    offset_type end;

    /// true if position lies inside [start, end)
    bool contains( offset_type position ) const {
      return start <= position && end > position;
    }

    /// true if the whole segment lies before position
    bool before( offset_type position ) const {
      return end <= position;
    }
  };

  std::string _fileName;
  std::vector<FileSegment*> _segments;

  FileSegment* _readSegment;
  FileSegment* _writeSegment;
  offset_type _readPosition; // seekg, read
  offset_type _writePosition; // seekp, write
  offset_type _readCount;
  bool _readPointerValid;
  bool _writePointerValid;
  int _mode;
  int _state;

  // File owns the FileSegment objects in _segments, so a member-wise copy
  // would delete them twice.  Declared but not defined: copying is disabled.
  File( const File& );
  File& operator=( const File& );

  /// Builds the on-disk name of segment number `segment` of `fileName`.
  static std::string segmentName( const std::string& fileName, int segment ) {
    std::stringstream segName;
    segName << fileName << "$" << segment;
    return segName.str();
  }

  /**
   * Creates a new, empty segment file at the end of the logical file and
   * opens it for reading and writing.
   *
   * @return true if the segment was created; false (with failbit set in
   *         _state and _segments unchanged) if the file could not be opened.
   */
  bool _appendSegment() {
    FileSegment* segment = new FileSegment();
    int number = (int)_segments.size();
    std::string name = segmentName( _fileName, number );
    segment->start = size();
    segment->end = size();

    // opening with out alone creates the file; in|out requires it to exist
    segment->stream.open( name.c_str(), std::ios::out | std::ios::binary );
    segment->stream.close();
    segment->stream.clear();
    segment->stream.open( name.c_str(), std::ios::out | std::ios::binary | std::ios::in );

    if( !segment->stream.is_open() ) {
      delete segment;
      _state |= std::fstream::failbit;
      return false;
    }

    _segments.push_back( segment );
    return true;
  }


  /**
   * Converts a (relativePosition, direction) pair into an absolute position
   * in the logical file.
   *
   * @param relativePosition offset relative to the point named by direction
   * @param currentPosition  the current read or write position (used for cur)
   * @param direction        beg, cur or end
   * @return the absolute position; for an unrecognized direction the current
   *         position is returned unchanged.
   */
  offset_type _absolutePosition( offset_type relativePosition,
    offset_type currentPosition,
    std::fstream::seekdir direction ) const {
    offset_type newPosition = currentPosition;

    switch( direction ) {
      case std::fstream::beg:
        newPosition = relativePosition;
        break;

      case std::fstream::cur:
        newPosition = currentPosition + relativePosition;
        break;

      case std::fstream::end:
        newPosition = size() + relativePosition;
        break;

      default:
        break;
    }

    // seeking off the end of a file is not currently implemented
    assert( newPosition <= size() );

    return newPosition;
  }

  /**
   * Finds the segment that holds absolutePosition.
   *
   * @param absolutePosition position in the logical file, 0 <= p <= size()
   * @param guess            segment to test first (may be null)
   * @return the segment containing the position; the final segment when the
   *         position is at or after the end of the file.
   */
  FileSegment* _segmentForPosition( offset_type absolutePosition,
                                    FileSegment* guess ) {
    assert( absolutePosition <= size() );
    assert( absolutePosition >= 0 );
    assert( _segments.size() );

    // see if the guess was good
    if( guess && guess->contains( absolutePosition ) ) {
      return guess;
    }

    std::vector<FileSegment*>::iterator low = _segments.begin();
    std::vector<FileSegment*>::iterator high = _segments.end() - 1;
    std::vector<FileSegment*>::iterator middle;

    // if the user wants a position equal to or after the end of the file,
    // pick the final segment
    if( (*high)->end <= absolutePosition ) {
      return *high;
    }

    assert( _segments.size() > 1 );

    // binary search over the segments, which are ordered by start offset
    while( high - low > 1 ) {
      middle = low+((high-low)/2);

      if( (*middle)->before(absolutePosition) ) {
        low = middle;
      } else {
        high = middle;
      }
    }

    if( (*high)->contains(absolutePosition) )
      return *high;
    else
      return *low;
  }

  /**
   * Makes sure the stream of the segment holding _readPosition is positioned
   * for reading.  Seeking the get pointer invalidates the write pointer.
   */
  void _validateReadPointer() {
    if( !_readPointerValid ) {
      _readSegment = _segmentForPosition( _readPosition, _readSegment );
      library_offset_type toHere = _readPosition - _readSegment->start;
      _readSegment->stream.seekg( toHere , std::ios::beg );
      _readPointerValid = true;
      _writePointerValid = false;
    }

    assert( _readPosition <= _readSegment->end );
    assert( _readPosition >= _readSegment->start );
    assert( _readPosition == ( library_offset_type(_readSegment->stream.tellg()) + _readSegment->start) );
  }

  /**
   * Makes sure the stream of the segment holding _writePosition is positioned
   * for writing.  Seeking the put pointer invalidates the read pointer.
   */
  void _validateWritePointer() {
    if( !_writePointerValid ) {
      _writeSegment = _segmentForPosition( _writePosition, _writeSegment );
      library_offset_type toHere = _writePosition - _writeSegment->start;
      _writeSegment->stream.seekp( toHere , std::ios::beg );
      _writePointerValid = true;
      _readPointerValid = false;
    }

    assert( _writePosition <= _writeSegment->end );
    assert( _writePosition >= _writeSegment->start );
    assert( _writePosition == ( library_offset_type(_writeSegment->stream.tellp()) + _writeSegment->start) );
  }

public:
  /// Constructs a File that is not attached to any segments.
  File() :
    _readSegment(0),
    _writeSegment(0),
    _readPosition(0),
    _writePosition(0),
    _readCount(0),
    _readPointerValid(false),
    _writePointerValid(false),
    _mode(0),
    _state(std::fstream::goodbit)
  {
  }

  /// Closes every open segment.
  ~File() {
    close();
  }

  /**
   * Opens the segments of fileName, closing anything previously open.
   *
   * @param fileName base name; segments are fileName$0, fileName$1, ...
   * @param mode     std::fstream open mode bits (in, out, trunc, ate, app);
   *                 binary is always added.
   *
   * With trunc, every existing segment is deleted.  With out, a first
   * segment is created when none exists.  If no segment could be opened or
   * created, failbit is reported by rdstate().
   */
  void open( const std::string& fileName, int mode ) {
    close();

    _readPosition = 0;
    _writePosition = 0;
    _readCount = 0;

    _readPointerValid = false;
    _writePointerValid = false;

    _mode = mode | std::fstream::binary; // must be in binary mode
    _state = std::fstream::goodbit;
    _fileName = fileName;

    // open all existing segments
    for( int i=0; ; i++ ) {
      FileSegment* segment = new FileSegment();
      std::string name = segmentName( _fileName, i );

      // Every segment is first opened read-only, whatever mode was asked
      // for: a read-only open fails when the file does not exist, which is
      // how we detect the end of the segment list without creating files.
      segment->stream.open( name.c_str(),
                            std::ifstream::in | std::ifstream::binary );

      if( segment->stream.rdstate() & std::fstream::failbit ) {
        // segment didn't open, so assume that we're done opening
        delete segment;
        break;
      }

      if( mode & std::fstream::trunc ) {
        // segment opened properly, but we're doing a trunc, so
        // this segment must go away
        segment->stream.close();
        delete segment;
        _UNLINK(name.c_str());
        continue;
      }

      // segment opened properly and we'd like to keep it;
      // if we want write access, we need to close and reopen the file
      if( mode & std::fstream::out ) {
        const int reopenBits = _mode & ( std::fstream::binary |
                                         std::fstream::in |
                                         std::fstream::out );
        segment->stream.close();
        segment->stream.open( name.c_str(),
                              static_cast<std::ios_base::openmode>( reopenBits ) );

        if( segment->stream.rdstate() & std::fstream::failbit ) {
          delete segment;
          break;
        }
      }

      // set up segment statistics and add to the segment vector;
      // _readPosition is used as the running total of segment lengths
      offset_type length;

      segment->stream.seekg( 0, std::fstream::end );
      length = segment->stream.tellg();

      segment->start = _readPosition;
      segment->end = _readPosition + length;
      _readPosition += length;

      _segments.push_back( segment );
    }

    // if no segments exist but we plan to write something, make a segment
    if( _segments.size() == 0 && (mode & std::fstream::out) ) {
      _appendSegment();
    }

    // if there are still no segments, quit and fail, otherwise finish setup
    if( _segments.size() == 0 ) {
      _readPosition = 0;
      _state |= std::fstream::failbit;
    } else {
      // initialize these to something reasonable
      _readPosition = 0;
      _writePosition = 0;
      _readSegment = _segments[0];
      _writeSegment = _segments[0];

      // seek to the end of the file if necessary for certain flags
      // otherwise set up the pointers at file begin
      if( mode & (std::fstream::ate|std::fstream::app) ) {
        seekg( 0, std::fstream::end );
        seekp( 0, std::fstream::end );
      } else {
        seekg( 0, std::fstream::beg );
        seekp( 0, std::fstream::beg );
      }
    }
  }

  /// Closes and releases every segment; the File can be reopened afterwards.
  void close() {
    std::vector<FileSegment*>::iterator iter;

    for( iter = _segments.begin(); iter != _segments.end(); ++iter ) {
      (*iter)->stream.close();
      delete (*iter);
    }

    _segments.clear();

    // the segments these pointed at are gone
    _readSegment = 0;
    _writeSegment = 0;
    _readPointerValid = false;
    _writePointerValid = false;
  }

  /**
   * Reads up to count bytes from the current read position into buffer,
   * crossing segment boundaries as needed.  Reading stops at the end of the
   * file; gcount() reports how many bytes were actually read.
   */
  void read( void* buffer, offset_type count ) {
    _readCount = 0;

    if( _segments.empty() ) {
      // nothing is open, so there is nothing to read
      return;
    }

    _validateReadPointer();
    char* out = static_cast<char*>( buffer );
    offset_type readAmount;

    // consume whole segment tails while the request reaches the segment end
    while( ((count - _readCount + _readPosition) >= _readSegment->end) &&
           _readPosition != size() ) {
      readAmount = _readSegment->end - _readPosition;
      _readSegment->stream.read( out + _readCount, library_offset_type(readAmount) );

      _readPosition += readAmount;
      _readCount += readAmount;
      _readPointerValid = false;
      _validateReadPointer();
    }

    // the remainder of the request lies inside the current segment
    if( _readPosition != size() ) {
      _readSegment->stream.read( out + _readCount, library_offset_type(count - _readCount) );
      readAmount = library_offset_type(_readSegment->stream.gcount());
      _readPosition += readAmount;
      _readCount += readAmount;
    }

    assert( library_offset_type(_readSegment->stream.tellg()) != -1 || size() == _readPosition );
  }

  /**
   * Writes count bytes from buffer at the current write position.  When a
   * segment reaches FILE_MAXIMUM_SEGMENT_SIZE the write continues in the
   * next segment, which is created only if the full segment was the last
   * one.  If no segment is open, or a new segment cannot be created,
   * failbit is set and the write stops.
   */
  void write( const void* buffer, offset_type count ) {
    if( _segments.empty() ) {
      _state |= std::fstream::failbit;
      return;
    }

    const char* in = static_cast<const char*>( buffer );
    offset_type bytesWritten = 0;
    offset_type writeAmount = 0;
    _validateWritePointer();

    while( ((count - bytesWritten) + (_writePosition - _writeSegment->start)) >= FILE_MAXIMUM_SEGMENT_SIZE ) {
      // fill the current segment up to its maximum size
      writeAmount = FILE_MAXIMUM_SEGMENT_SIZE - (_writePosition - _writeSegment->start);

      _writeSegment->stream.write( in + bytesWritten, library_offset_type(writeAmount) );
      _writePosition += writeAmount;
      bytesWritten += writeAmount;

      if( _writePosition > _writeSegment->end ) {
        _writeSegment->end = _writePosition;
      }

      // Only grow the file when the segment we just filled is the last one;
      // when overwriting the middle of the file the next segment already
      // exists and _validateWritePointer() will find it.
      if( _writeSegment == _segments.back() && !_appendSegment() ) {
        // could not create the next segment (failbit already set);
        // continuing would loop forever writing zero bytes
        return;
      }

      _writePointerValid = false;
      _validateWritePointer();
    }

    writeAmount = count - bytesWritten;
    _writeSegment->stream.write( in + bytesWritten, library_offset_type(writeAmount) );
    _writePosition += writeAmount;
    bytesWritten += writeAmount;

    if( _writePosition > _writeSegment->end ) {
      _writeSegment->end = _writePosition;
    }

    assert( library_offset_type(_writeSegment->stream.tellp()) != -1 );
  }

  /// Moves the read position; seeking past the end of the file is not supported.
  void seekg( offset_type relativePosition, std::fstream::seekdir direction ) {
    _readPosition = _absolutePosition( relativePosition, _readPosition, direction );
    _readPointerValid = false;
  }

  /// Moves the write position; seeking past the end of the file is not supported.
  void seekp( offset_type relativePosition, std::fstream::seekdir direction ) {
    _writePosition = _absolutePosition( relativePosition, _writePosition, direction );
    _writePointerValid = false;
  }

  /// @return the current read position in the logical file
  offset_type tellg() const {
    return _readPosition;
  }

  /// @return the current write position in the logical file
  offset_type tellp() const {
    return _writePosition;
  }

  /// @return the number of bytes delivered by the most recent read()
  offset_type gcount() const {
    return _readCount;
  }

  /// @return the stream state bits, with eofbit added when the read
  ///         position is at the end of the file
  int rdstate() const {
    if( size() == _readPosition )
      return _state | std::fstream::eofbit;
    else
      return _state;
  }

  /// @return the total length of all segments, or 0 when nothing is open
  offset_type size() const {
    if( _segments.size() == 0 ) {
      return 0;
    } else {
      return _segments[ _segments.size()-1 ]->end;
    }
  }

  /// Closes this File and deletes all of its segment files.
  void unlink() {
    close();
    File::unlink( _fileName );
    _fileName = "";
  }

  /// Deletes fileName$0, fileName$1, ... until a deletion fails.
  static void unlink( const std::string& fileName ) {
    for( int i=0; ; i++ ) {
      std::string segment = segmentName( fileName, i );
      if (_UNLINK( segment.c_str() ) != 0 ) {
        break;
      }
    }
  }

  /// Renames oldName$i to newName$i for i = 0, 1, ... until a rename fails.
  static void rename( const std::string& oldName, const std::string& newName ) {
    for( int i=0; ; i++ ) {
      std::string oldSegment = segmentName( oldName, i );
      std::string newSegment = segmentName( newName, i );

      if( ::rename( oldSegment.c_str(), newSegment.c_str() ) != 0 ) {
        break;
      }
    }
  }
};
00439 
00440 #endif // LEMUR_FILE_HPP

Generated on Fri Jul 2 16:25:36 2004 for Lemur Toolkit by doxygen 1.2.18