Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

FlattextDocMgr.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  *
00003  *  Original source copyright (c) 2001, Carnegie Mellon University.
00004  *  See copyright.cmu for details.
00005  *  Modifications copyright (c) 2002, University of Massachusetts.
00006  *  See copyright.umass for details.
00007  *
00008  *==========================================================================
00009 */
00010 
00011 #ifndef _FLATTEXTDOCMGR_HPP
00012 #define _FLATTEXTDOCMGR_HPP
00013 
00014 #include "common_headers.hpp"
00015 #include "TextHandlerManager.hpp"
00016 #include "Exception.hpp"
00017 #include "DocumentManager.hpp"
00018 
00019 #define FT_SUFFIX ".flat"  // file-name suffix; NOTE(review): presumably for the manager's main flat-text file - confirm in the .cpp
00020 #define FT_LOOKUP ".lookup"  // file-name suffix; NOTE(review): presumably for the per-document lookup table (cf. loadFTLookup) - confirm
00021 #define FT_FID    ".fid"  // file-name suffix; NOTE(review): presumably for the file-id list (cf. loadFTFiles) - confirm
00022 
00023 class FlattextDocMgr : public DocumentManager, public TextHandler {  // a DocumentManager that is also a TextHandler; all member definitions live outside this header
00024 public:
00025   enum ParseModes {TREC=0, WEB=1, CHINESE=2, CHINESECHAR=3, ARABIC=4};  // parser kinds; consumed by the constructor and createParser()
00026 
00027   struct lookup_e {  // one record of the lookup table (pointed to by `table`, stored via `entries`)
00028     int fid;  // NOTE(review): presumably an index into `sources` - confirm
00029     int offset;  // NOTE(review): presumably the document's start position within that file - confirm
00030     int bytes;  // NOTE(review): presumably the document's length in bytes - confirm
00031   };
00032 
00033   struct abc {  // ordering functor for C strings; the comparator type of `table`
00034     bool operator() (char* s1, char* s2) const {  // compares string contents, not pointer values
00035       return strcmp(s1, s2) < 0;  // true when s1 sorts strictly before s2
00036     }
00037   };
00038 
00043   FlattextDocMgr(char* name, ParseModes mode, char* source);  // construct from a name, a parse mode and a source; NOTE(review): `source` is presumably a file listing the source files (cf. readinSources) - confirm
00044   FlattextDocMgr(string name, string mode, string source);  // same three parameters as strings; how the mode string maps to ParseModes is not visible here
00045 
00048   FlattextDocMgr(const char* name);  // construct from a name only; NOTE(review): presumably attaches to an already-built manager - confirm
00049 
00050   ~FlattextDocMgr();  // NOTE(review): ownership of `entries` and of the char* keys in `table` is not visible here - confirm what is released
00051 
00053   bool open(const char*manname);  // returns a success flag; NOTE(review): presumably loads the manager named manname (cf. loadTOC) - confirm
00054 
00056   const char* getMyID();  // NOTE(review): presumably exposes `IDname` as a C string - confirm
00057 
00059   char* getDoc(const char* docID);  // fetch a document by its ID; ownership of the returned buffer is not visible here
00060 
00061   void buildMgr();  // NOTE(review): presumably processes `sources` and records document positions - confirm
00062 
00063   char* handleDoc(char * docno);  // NOTE(review): presumably the TextHandler per-document hook - confirm against TextHandler.hpp
00064 
00066   static Parser* createParser(ParseModes mode);  // static factory keyed by parse mode; ownership of the returned Parser is not visible here
00067 
00068 private:
00070   bool readinSources(const char* fn);  // returns a success flag; NOTE(review): presumably fills `sources` from file fn - confirm
00071 
00073   void writeTOC();  // NOTE(review): presumably writes the table-of-contents file read back by loadTOC - confirm
00074 
00075   bool loadTOC(const char* fn);  // loaders below each take a file name and return a success flag
00076   bool loadFTLookup(const char* fn);  // NOTE(review): presumably populates `entries`/`table` - confirm
00077   bool loadFTFiles(const char* fn, int num);  // NOTE(review): `num` is presumably the expected number of source files - confirm
00078 
00079   int numdocs;              // how many docs we have
00080   ParseModes pm;           // what type of parser we have
00081   long prevpos;              // pos of previous doc beginning
00082   string IDname;            // my name
00083   vector<string> sources;   // list of all source files
00084   int fileid, lastid;       // fileid of current/last file being processed
00085   ofstream writefpos;       // stream for writing out file positions
00086   map<char*, lookup_e*, abc> table; // C-string key -> lookup record, ordered by strcmp; NOTE(review): keys are presumably document IDs - confirm
00087   lookup_e* entries;        // array of lookup entries
00088 };
00089 
00090 #endif

Generated on Tue Nov 25 11:26:43 2003 for Lemur Toolkit by doxygen 1.2.18