Main Page   Namespace List   Class Hierarchy   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

BasicDocStream.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.cs.cmu.edu/~lemur/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 #ifndef _BASICFILESTREAM_HPP
00014 #define _BASICFILESTREAM_HPP
00015 
00017 
00055 #include "common_headers.hpp"
00056 #include <cassert>
00057 #include <cstdio>
00058 #include <cstring>
00059 #include "DocStream.hpp"
00060 #include "Exception.hpp"
00061 
00062 
00063 #define MAXLINE 65536
00064 
00065 
00067 class BasicTokenTerm : public TokenTerm {
00068  public:
00069   BasicTokenTerm() {}
00070   virtual ~BasicTokenTerm() {}
00071   virtual const char *spelling() { return str;}
00072   friend class BasicTokenDoc;
00073  private:
00074   char *str;
00075 };
00076 
00077 
00079 
00080 class BasicTokenDoc : public Document {
00081  public:
00082   BasicTokenDoc() {
00083   }
00084   BasicTokenDoc(ifstream *stream): docStr(stream) {
00085   }
00086    void startTermIteration(); 
00087   
00088   char *getID() const { return (char *)id;}
00089 
00090   bool hasMore() { return (strcmp(curWord, "</DOC>") != 0);}
00091     
00092   TokenTerm * nextTerm();
00093 
00094   void skipToEnd();
00095   friend class BasicDocStream;
00096  private:
00097   void readID(); 
00098   char *curWord;
00099   char buf1[20000];
00100   char buf2[20000];
00101   char id[2000];
00102   ifstream *docStr;
00103   streampos startPos; // starting position of the terms in the file
00104   //replace  static BasicTokenTerm t; with attribute
00105   BasicTokenTerm t;
00106 };
00107 
00108 
00110 class BasicDocStream : public DocStream
00111 {
00112 public:
00113   BasicDocStream() {}
00114   BasicDocStream (const char * inputFile);
00115 
00116   virtual ~BasicDocStream() {  delete ifs;}
00117 
00118 public:
00119         
00120   bool hasMore(); 
00121 
00122   void startDocIteration();
00123 
00124   Document *nextDoc();
00125 
00126 private:
00127   char file[1024];
00128   ifstream *ifs;
00129   char buf[2000];
00130   bool nextTokenRead;
00131   // replace static BasicTokenDoc doc;  with attribute
00132   BasicTokenDoc doc;
00133 };
00134 
00135 
00136 
00137 
00138 #endif
00139 
00140 
00141 
00142 

Generated on Fri Feb 6 07:11:45 2004 for LEMUR by doxygen 1.2.16