00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.cs.cmu.edu/~lemur/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 #ifndef _BASICFILESTREAM_HPP 00014 #define _BASICFILESTREAM_HPP 00015 00017 00055 #include "common_headers.hpp" 00056 #include <cassert> 00057 #include <cstdio> 00058 #include <cstring> 00059 #include "DocStream.hpp" 00060 #include "Exception.hpp" 00061 00062 00063 #define MAXLINE 65536 00064 00065 00067 class BasicTokenTerm : public TokenTerm { 00068 public: 00069 BasicTokenTerm() {} 00070 virtual ~BasicTokenTerm() {} 00071 virtual const char *spelling() { return str;} 00072 friend class BasicTokenDoc; 00073 private: 00074 char *str; 00075 }; 00076 00077 00079 00080 class BasicTokenDoc : public Document { 00081 public: 00082 BasicTokenDoc(ifstream *stream): docStr(stream) { 00083 } 00084 void startTermIteration(); 00085 00086 char *getID() const { return (char *)id;} 00087 00088 bool hasMore() { return (strcmp(curWord, "</DOC>") != 0);} 00089 00090 TokenTerm * nextTerm(); 00091 00092 void skipToEnd(); 00093 friend class BasicDocStream; 00094 private: 00095 void readID(); 00096 char *curWord; 00097 char buf1[20000]; 00098 char buf2[20000]; 00099 char id[2000]; 00100 ifstream *docStr; 00101 streampos startPos; // starting position of the terms in the file 00102 }; 00103 00104 00106 class BasicDocStream : public DocStream 00107 { 00108 public: 00109 BasicDocStream() {} 00110 BasicDocStream (const char * inputFile); 00111 00112 virtual ~BasicDocStream() { delete ifs;} 00113 00114 public: 00115 00116 bool hasMore(); 00117 00118 void startDocIteration(); 00119 00120 Document *nextDoc(); 00121 00122 private: 00123 char file[1024]; 00124 ifstream *ifs; 00125 char buf[2000]; 00126 bool nextTokenRead; 00127 }; 00128 00129 00130 00131 00132 #endif 00133 00134 00135 00136