00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.cs.cmu.edu/~lemur/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 #ifndef _BASICFILESTREAM_HPP 00014 #define _BASICFILESTREAM_HPP 00015 00017 00055 #include "common_headers.hpp" 00056 #include <cassert> 00057 #include <cstdio> 00058 #include <cstring> 00059 #include "DocStream.hpp" 00060 #include "Exception.hpp" 00061 00062 00063 #define MAXLINE 65536 00064 00065 00067 class BasicTokenTerm : public TokenTerm { 00068 public: 00069 BasicTokenTerm() {} 00070 virtual ~BasicTokenTerm() {} 00071 virtual const char *spelling() { return str;} 00072 friend class BasicTokenDoc; 00073 private: 00074 char *str; 00075 }; 00076 00077 00079 00080 class BasicTokenDoc : public Document { 00081 public: 00082 BasicTokenDoc() { 00083 } 00084 BasicTokenDoc(ifstream *stream): docStr(stream) { 00085 } 00086 void startTermIteration(); 00087 00088 char *getID() const { return (char *)id;} 00089 00090 bool hasMore() { return (strcmp(curWord, "</DOC>") != 0);} 00091 00092 TokenTerm * nextTerm(); 00093 00094 void skipToEnd(); 00095 friend class BasicDocStream; 00096 private: 00097 void readID(); 00098 char *curWord; 00099 char buf1[20000]; 00100 char buf2[20000]; 00101 char id[2000]; 00102 ifstream *docStr; 00103 streampos startPos; // starting position of the terms in the file 00104 //replace static BasicTokenTerm t; with attribute 00105 BasicTokenTerm t; 00106 }; 00107 00108 00110 class BasicDocStream : public DocStream 00111 { 00112 public: 00113 BasicDocStream() {} 00114 BasicDocStream (const char * inputFile); 00115 00116 virtual ~BasicDocStream() { delete ifs;} 00117 00118 public: 00119 00120 bool hasMore(); 00121 00122 void startDocIteration(); 00123 00124 Document *nextDoc(); 00125 00126 private: 00127 char file[1024]; 00128 ifstream *ifs; 00129 char buf[2000]; 00130 bool nextTokenRead; 00131 // replace static BasicTokenDoc doc; with attribute 00132 BasicTokenDoc doc; 00133 }; 00134 00135 00136 00137 00138 #endif 00139 00140 00141 00142