Main Page   Namespace List   Class Hierarchy   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

BasicDocStream.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.cs.cmu.edu/~lemur/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 #ifndef _BASICFILESTREAM_HPP
00014 #define _BASICFILESTREAM_HPP
00015 
00017 
00055 #include "common_headers.hpp"
00056 #include <cassert>
00057 #include <cstdio>
00058 #include <cstring>
00059 #include "DocStream.hpp"
00060 #include "Exception.hpp"
00061 
00062 
00063 #define MAXLINE 65536
00064 
00065 
00067 class BasicTokenTerm : public TokenTerm {
00068  public:
00069   BasicTokenTerm() {}
00070   virtual ~BasicTokenTerm() {}
00071   virtual const char *spelling() { return str;}
00072   friend class BasicTokenDoc;
00073  private:
00074   char *str;
00075 };
00076 
00077 
00079 
00080 class BasicTokenDoc : public Document {
00081  public:
00082   BasicTokenDoc(ifstream *stream): docStr(stream) {
00083   }
00084    void startTermIteration(); 
00085   
00086   char *getID() const { return (char *)id;}
00087 
00088   bool hasMore() { return (strcmp(curWord, "</DOC>") != 0);}
00089     
00090   TokenTerm * nextTerm();
00091 
00092   void skipToEnd();
00093   friend class BasicDocStream;
00094  private:
00095   void readID(); 
00096   char *curWord;
00097   char buf1[20000];
00098   char buf2[20000];
00099   char id[2000];
00100   ifstream *docStr;
00101   streampos startPos; // starting position of the terms in the file
00102 };
00103 
00104 
00106 class BasicDocStream : public DocStream
00107 {
00108 public:
00109   BasicDocStream() {}
00110   BasicDocStream (const char * inputFile);
00111 
00112   virtual ~BasicDocStream() {  delete ifs;}
00113 
00114 public:
00115         
00116   bool hasMore(); 
00117 
00118   void startDocIteration();
00119 
00120   Document *nextDoc();
00121 
00122 private:
00123   char file[1024];
00124   ifstream *ifs;
00125   char buf[2000];
00126   bool nextTokenRead;
00127 };
00128 
00129 
00130 
00131 
00132 #endif
00133 
00134 
00135 
00136 

Generated on Mon Sep 30 14:13:20 2002 for LEMUR by doxygen1.2.18