Main Page   Namespace List   Class Hierarchy   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

FlattextDocMgr.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.cs.cmu.edu/~lemur/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 #ifndef _FLATTEXTDOCMGR_HPP
00014 #define _FLATTEXTDOCMGR_HPP
00015 
00016 #include "common_headers.hpp"
00017 #include "TrecParser.hpp"
00018 #include "WebParser.hpp"
00019 #include "ReutersParser.hpp"
00020 #include "Exception.hpp"
00021 #include "DocumentManager.hpp"
00022 
// String constants whose values all start with '.'.
// NOTE(review): presumably file-name suffixes appended to the manager's name
// for its on-disk files; their use is not visible in this header -- confirm
// against the implementation.
00023 #define FT_SUFFIX ".flat"
00024 #define FT_LOOKUP ".lookup"
00025 #define FT_FID    ".fid"
00026 
// FlattextDocMgr is both a DocumentManager and a TextHandler. From the
// declarations below it tracks a list of source files and a table mapping a
// C-string key to a (fid, offset, bytes) record.
// NOTE(review): the implementation is not part of this listing; comments
// marked "presumably" are assumptions to confirm against the .cpp file.
00027 class FlattextDocMgr : public DocumentManager, public TextHandler {
00028 public:
        // Parser selection accepted by the constructor and createParser().
        // NOTE(review): ReutersParser.hpp is included above, but there is no
        // corresponding mode in this enum.
00029   enum ParseModes {TREC=0, WEB=1};
00030 
        // Value type stored in `table`.
        // NOTE(review): presumably fid = id of the source file, offset =
        // position of the document in that file, bytes = document length --
        // confirm.
00031   struct lookup_e {
00032     int fid;
00033     int offset;
00034     int bytes;
00035   };
00036 
        // Comparator used by `table`: orders keys by strcmp(), i.e. by string
        // content rather than by pointer value.
        // NOTE(review): the parameters are non-const char* although strcmp()
        // does not modify its arguments.
00037   struct abc {
00038     bool operator() (char* s1, char* s2) const {
00039       return strcmp(s1, s2) < 0;
00040     }
00041   };
00042 
        // Constructs a manager from a name, a parse mode and a source.
        // NOTE(review): presumably `source` names a file listing the source
        // documents (cf. readinSources) -- confirm.
00047   FlattextDocMgr(char* name, ParseModes mode, char* source);
00048   
        // Constructs a manager from a name only.
        // NOTE(review): presumably loads a previously built manager (cf.
        // open()) -- confirm.
00051   FlattextDocMgr(const char* name);
00052 
00053   ~FlattextDocMgr();
00054 
        // NOTE(review): presumably opens the manager named `manname` and
        // returns true on success -- confirm.
00056   bool open(const char*manname);
00057 
        // NOTE(review): presumably returns this manager's name (cf. IDname)
        // -- confirm.
00059   const char* getMyID();
00060 
        // Returns a char* for the given document ID.
        // NOTE(review): ownership of the returned buffer and the result for
        // an unknown ID are not visible here.
00062   char* getDoc(const char* docID);
00063 
        // NOTE(review): presumably processes the source files and builds the
        // lookup data -- confirm.
00064   void buildMgr();
00065 
        // NOTE(review): presumably the TextHandler callback invoked for each
        // document number encountered while parsing -- confirm.
00066   char* handleDoc(char * docno);
00067 
        // Static factory returning a Parser* for the given mode.
        // NOTE(review): ownership of the returned pointer is not visible here.
00069   static Parser* createParser(ParseModes mode);
00070 
00071 private:
        // NOTE(review): presumably fills `sources` from the file `fn`;
        // meaning of the bool result to be confirmed.
00073   bool readinSources(char* fn);
00074 
        // NOTE(review): presumably writes the table-of-contents file read back
        // by loadTOC() -- confirm.
00076   void writeTOC();
00077 
        // Loaders returning bool.
        // NOTE(review): presumably each reads one of the manager's on-disk
        // files (table of contents, lookup table, file-id list) and `num` is
        // the expected number of entries -- confirm.
00078   bool loadTOC(char* fn);
00079   bool loadFTLookup(const char* fn);
00080   bool loadFTFiles(const char* fn, int num);
00081 
00082   int numdocs;              // how many docs we have
00083   ParseModes pm;           // what type of parser we have
00084   long prevpos;              // pos of previous doc beginning
00085   string IDname;            // my name
00086   vector<string> sources;   // list of all source files
00087   int fileid, lastid;       // fileid of current/last file being processed
00088   ofstream writefpos;       // stream for writing out file positions
00089   map<char*, lookup_e, abc> table; // C-string key -> lookup_e, ordered by strcmp (abc); NOTE(review): key ownership not visible here
00090 };
00091 
00092 #endif

Generated at Fri Jul 26 18:26:22 2002 for LEMUR by doxygen 1.2.4, written by Dimitri van Heesch, © 1997-2000