Main Page   Namespace List   Class Hierarchy   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

KeyfileDocMgr.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  *
00003  *  Original source copyright (c) 2001, Carnegie Mellon University.
00004  *  See copyright.cmu for details.
00005  *  Modifications copyright (c) 2002, University of Massachusetts.
00006  *  See copyright.umass for details.
00007  *
00008  *==========================================================================
00009 */
00010 
00011 #ifndef _LEMUR_KEYFILE_DOCMGR_HPP
00012 #define _LEMUR_KEYFILE_DOCMGR_HPP
00013 
00014 #include "common_headers.hpp"
00015 #include "DocumentManager.hpp"
00016 #include "RVLCompress.hpp"
00017 #include "TextHandlerManager.hpp"
00018 #include "Match.hpp"
00019 #include "Keyfile.hpp"
00020 
00021 // Suffix for the array of byte offsets, indexed by token, kept for each doc.
00022 #define BT_POSITIONS ".btp"
00023 // Suffix for the lookup data: source file start, length
00024 // (NOTE(review): presumably one struct btl record per document -- confirm).
00025 #define BT_LOOKUP ".btl"
00026 // Suffix for the TOC, i.e. the table of contents (see writeTOC()/loadTOC()).
00027 #define BT_TOC ".bdm"
00028 // Suffix for the list of source files.
00029 #define BT_FID ".bfi"
00029 
00030 
// KeyfileDocMgr is both a DocumentManager and a TextHandler. While it receives
// parser callbacks (handleDoc / handleWord / handleEndDoc) it records, for each
// word, a Match holding the word's start/end position relative to the current
// document's offset. It keeps two Keyfile members for looking up positions
// lists and document starts.
// NOTE(review): most member functions are only declared here; their behaviour
// is defined in the implementation file and is not described in this header.
00039 class KeyfileDocMgr : public DocumentManager, public TextHandler {
00040 public:
        // Default constructor: only myDoc and numdocs are initialised here.
        // NOTE(review): dbcache, doclen, fileid and docEntry are left unset.
00042   KeyfileDocMgr() {  myDoc = NULL;  numdocs = 0; }
00043 
        // Construct from a name (defined elsewhere).
        // NOTE(review): presumably opens an existing manager -- confirm.
00046   KeyfileDocMgr(const char *name);
00047 
        // Construct from a name, a mode and a source (defined elsewhere).
        // NOTE(review): presumably `mode` is the parse mode (cf. member pm) and
        // `source` names the list of source files -- confirm against the .cpp.
00052   KeyfileDocMgr(string name, string mode, string source);  
00053 
00054   virtual ~KeyfileDocMgr();
00055 
        // Returns a char* for the given document ID (defined elsewhere).
        // NOTE(review): ownership of the returned buffer is not visible here.
00057   char* getDoc(const char* docID);
        // Callback taking a document number (defined elsewhere).
00059   virtual char* handleDoc(char * docno);
        // Callback taking no arguments (defined elsewhere).
        // NOTE(review): presumably signals the end of the current document.
00061   virtual void handleEndDoc();
        // Records the position of `word` and returns `word` unchanged.
        // A NULL word is passed through without recording anything.
        // NOTE(review): myparser is not declared in this class (presumably
        // inherited) and must be non-NULL here; see setParser().
00063   virtual char *handleWord(char * word) {
00064     if (word != NULL) {
            // Assumes fileTell() is the position just past the word, so that
            // `end` is the word's last byte -- TODO confirm with Parser.
00065       int end = myparser->fileTell() - 1;
            // NOTE(review): mixes int with the size_t returned by strlen().
00066       int start = (end - strlen(word)) + 1;
00067       Match m;
            // Store positions relative to docEntry.offset, not the file start.
00068       m.start = start - docEntry.offset;
00069       m.end = end - docEntry.offset;    
00070       offsets.push_back(m);
00071     }
00072     return word;
00073   }
        // Stores the parser whose fileTell() is used by handleWord().
00075   virtual void setParser(Parser *p) {
00076     myparser = p;
00077   }
00078 
        // Defined elsewhere. NOTE(review): presumably builds the manager's
        // data from the source files -- confirm against the .cpp.
00081   virtual void buildMgr();
        // Returns IDnameext as a C string; the pointer refers to the member's
        // internal buffer and is only valid while IDnameext is unmodified.
00083   virtual const char *getMyID() {
00084     return IDnameext.c_str();
00085   }
00086 
        // Returns, by value, a vector of Match for the given document ID
        // (defined elsewhere).
00090   vector<Match> getOffsets(char *docID);
00091 
        // Sets IDname to manname minus its last 4 characters and returns the
        // result of loadTOC().
        // NOTE(review): the 4 characters are presumably an extension such as
        // BT_TOC (".bdm"). If manname is shorter than 4 characters the unsigned
        // subtraction wraps and substr() keeps the whole name. manname must not
        // be NULL. IDnameext is not assigned here.
00093   virtual bool open(const char* manname) {
00094     string tmp(manname);
00095     IDname = tmp.substr(0, tmp.length() - 4);
00096     return loadTOC();
00097   }
00098 
00099 protected:
        // Record for one document (see docEntry).
        // NOTE(review): presumably fid identifies the source file, offset is
        // the start position in it and bytes the length -- confirm.
00100   struct btl {
00101     int fid;
00102     long offset;
00103     long bytes;
00104   };
00105 
        // TOC writer/loader and file loader, all defined elsewhere; open()
        // returns the result of loadTOC().
00106   virtual void writeTOC();
00107   virtual bool loadTOC();
00108   bool loadFTFiles(const char* fn, int num);
00109   vector <Match> offsets;   // Match entries appended by handleWord()
00110   int numdocs;              // how many docs we have
00111   string pm;  // parse mode
00112 
00113   Keyfile poslookup; // btree for lookup to positions list.
00114   Keyfile doclookup; // btree for lookup to doc start.
00115   int dbcache;       // NOTE(review): presumably a cache size for the Keyfiles -- confirm
00116   
00117   btl docEntry;      // current entry; its offset is the base for handleWord() positions
00118   char *myDoc;       // set to NULL by the default constructor
00119   int doclen;
00120   string IDname;            // my name
00121   string IDnameext;                     // my name w/ extension
00122   vector<string> sources;   // list of all source files
00123   int fileid;       // fileid of current/last file being processed
00124 };
00125 
00126 #endif // _LEMUR_KEYFILE_DOCMGR_HPP

Generated on Fri Feb 6 07:11:47 2004 for LEMUR by doxygen 1.2.16