Main Page   Namespace List   Class Hierarchy   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

FlattextDocMgr.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  *
00003  *  Original source copyright (c) 2001, Carnegie Mellon University.
00004  *  See copyright.cmu for details.
00005  *  Modifications copyright (c) 2002, University of Massachusetts.
00006  *  See copyright.umass for details.
00007  *
00008  *==========================================================================
00009 */
00010 
00011 #ifndef _FLATTEXTDOCMGR_HPP
00012 #define _FLATTEXTDOCMGR_HPP
00013 
00014 #include "common_headers.hpp"
00015 #include "TrecParser.hpp"
00016 #include "WebParser.hpp"
00017 #include "ReutersParser.hpp"
00018 #include "ChineseParser.hpp"
00019 #include "ChineseCharParser.hpp"
00020 #include "ArabicParser.hpp"
00021 #include "Exception.hpp"
00022 #include "DocumentManager.hpp"
00023 
00024 #define FT_SUFFIX ".flat"     // NOTE(review): presumably the file-name suffix of the manager's main file -- usage is not in this header, confirm in the .cpp
00025 #define FT_LOOKUP ".lookup"   // NOTE(review): presumably the suffix of the file read by loadFTLookup() -- confirm
00026 #define FT_FID    ".fid"      // NOTE(review): presumably the suffix of the file read by loadFTFiles() -- confirm
00027 
00028 class FlattextDocMgr : public DocumentManager, public TextHandler {
      // FlattextDocMgr derives from both DocumentManager and TextHandler.
      // NOTE(review): judging by its members it keeps a per-document lookup
      // table (file id, offset, byte count) over a list of source files; the
      // member definitions are not in this header -- confirm in the .cpp.
00029 public:
        // Parser selection accepted by the three-argument constructor and by
        // createParser().
        // NOTE(review): ReutersParser.hpp is included by this header but no
        // corresponding mode is listed here -- confirm that is intentional.
00030   enum ParseModes {TREC=0, WEB=1, CHINESE=2, CHINESECHAR=3, ARABIC=4};
00031 
        // Value type stored in `table`, one entry per key.
        // NOTE(review): presumably fid indexes `sources`, and offset/bytes give
        // the document's position and length inside that file -- confirm.
00032   struct lookup_e {
00033     int fid;
00034     int offset;
00035     int bytes;
00036   };
00037 
        // Comparator used by `table`: orders C strings by their contents
        // (strcmp) rather than by pointer value. Neither argument may be null,
        // since both are passed straight to strcmp.
00038   struct abc {
00039     bool operator() (char* s1, char* s2) const {
00040       return strcmp(s1, s2) < 0;
00041     }
00042   };
00043 
        // NOTE(review): presumably creates a new manager called `name` that
        // parses the files listed in `source` with the parser chosen by `mode`
        // -- confirm against the definition.
00048   FlattextDocMgr(char* name, ParseModes mode, char* source);
00049   
        // NOTE(review): presumably attaches to an already-built manager
        // identified by `name` -- confirm. Not declared explicit, so a
        // const char* converts implicitly to FlattextDocMgr.
00052   FlattextDocMgr(const char* name);
00053 
00054   ~FlattextDocMgr();
00055 
        // Returns a bool status. NOTE(review): presumably loads the manager
        // named `manname` and reports success -- confirm.
00057   bool open(const char*manname);
00058 
        // NOTE(review): presumably returns the contents of IDname -- confirm.
00060   const char* getMyID();
00061 
        // Returns a char* for the given document id.
        // NOTE(review): who owns / frees the returned buffer is not visible
        // here -- confirm before using.
00063   char* getDoc(const char* docID);
00064 
        // NOTE(review): presumably runs the parser over all `sources` and
        // writes the lookup files -- confirm.
00065   void buildMgr();
00066 
        // NOTE(review): presumably the TextHandler hook invoked for each
        // document number seen while parsing -- confirm it overrides the base.
00067   char* handleDoc(char * docno);
00068 
        // Static factory: returns a Parser* for the requested mode.
        // NOTE(review): ownership of the returned pointer, and the result for
        // an unrecognised mode, are not visible here -- confirm.
00070   static Parser* createParser(ParseModes mode);
00071 
00072 private:
        // NOTE(review): presumably fills `sources` from the file named `fn`
        // and returns false on failure -- confirm.
00074   bool readinSources(char* fn);
00075 
        // NOTE(review): presumably writes the manager's table-of-contents
        // file -- confirm.
00077   void writeTOC();
00078 
        // Loaders, each taking a file name and returning a bool status.
        // NOTE(review): presumably read back what buildMgr()/writeTOC()
        // produced; the meaning of `num` is not visible here -- confirm.
00079   bool loadTOC(char* fn);
00080   bool loadFTLookup(const char* fn);
00081   bool loadFTFiles(const char* fn, int num);
00082 
00083   int numdocs;              // how many docs we have
00084   ParseModes pm;           // what type of parser we have
00085   long prevpos;              // pos of previous doc beginning
00086   string IDname;            // my name
00087   vector<string> sources;   // list of all source files
00088   int fileid, lastid;       // fileid of current/last file being processed
00089   ofstream writefpos;       // stream for writing out file positions
        // Lookup entries keyed by C string and ordered by string contents via
        // `abc`. The keys are raw char* pointers, so the map does not own the
        // key storage. NOTE(review): presumably keyed by document id; confirm
        // where the key strings are allocated and released.
00090   map<char*, lookup_e, abc> table; 
00091 };
00092 
00093 #endif

Generated on Mon Sep 30 14:13:21 2002 for LEMUR by doxygen 1.2.18