00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.cs.cmu.edu/~lemur/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 #ifndef _BASICFILESTREAM_HPP 00014 #define _BASICFILESTREAM_HPP 00015 00017 00055 #include "common_headers.hpp" 00056 #include <cassert> 00057 #include <cstdio> 00058 #include <cstring> 00059 #include "DocStream.hpp" 00060 #include "Exception.hpp" 00061 00062 00063 #define MAXLINE 65536 00064 00065 00067 class BasicTokenTerm : public TokenTerm { 00068 public: 00069 BasicTokenTerm() {} 00070 virtual ~BasicTokenTerm() {} 00071 virtual const char *spelling() { return str;} 00072 friend class BasicTokenDoc; 00073 private: 00074 char *str; 00075 }; 00076 00077 00079 00080 class BasicTokenDoc : public Document { 00081 public: 00082 BasicTokenDoc(ifstream *stream): docStr(stream) { 00083 } 00084 void startTermIteration(); 00085 00086 char *getID() const { return (char *)id;} 00087 00088 bool hasMore() { return (strcmp(curWord, "</DOC>") != 0);} 00089 00090 TokenTerm * nextTerm(); 00091 00092 friend class BasicDocStream; 00093 private: 00094 void readID(); 00095 char *curWord; 00096 char buf1[20000]; 00097 char buf2[20000]; 00098 char id[2000]; 00099 ifstream *docStr; 00100 }; 00101 00102 00104 class BasicDocStream : public DocStream 00105 { 00106 public: 00107 BasicDocStream() {} 00108 BasicDocStream (const char * inputFile); 00109 00110 virtual ~BasicDocStream() { delete ifs;} 00111 00112 public: 00113 00114 bool hasMore(); 00115 00116 void startDocIteration(); 00117 00118 Document *nextDoc(); 00119 00120 private: 00121 char file[1024]; 00122 ifstream *ifs; 00123 char buf[2000]; 00124 bool nextTokenRead; 00125 }; 00126 00127 00128 00129 00130 #endif 00131 00132 00133 00134