00001 /*========================================================================== 00002 * 00003 * Original source copyright (c) 2001, Carnegie Mellon University. 00004 * See copyright.cmu for details. 00005 * Modifications copyright (c) 2002, University of Massachusetts. 00006 * See copyright.umass for details. 00007 * 00008 *========================================================================== 00009 */ 00010 00011 #ifndef _LEMUR_KEYFILE_DOCMGR_HPP 00012 #define _LEMUR_KEYFILE_DOCMGR_HPP 00013 00014 #include "common_headers.hpp" 00015 #include "DocumentManager.hpp" 00016 #include "RVLCompress.hpp" 00017 #include "TextHandlerManager.hpp" 00018 #include "Match.hpp" 00019 #include "Keyfile.hpp" 00020 00021 // array of byte offsets, indexed by token for each doc. 00022 #define BT_POSITIONS ".btp" 00023 // source file start, length 00024 #define BT_LOOKUP ".btl" 00025 // TOC 00026 #define BT_TOC ".bdm" 00027 // source files. 00028 #define BT_FID ".bfi" 00029 00030 00039 class KeyfileDocMgr : public DocumentManager, public TextHandler { 00040 public: 00042 KeyfileDocMgr() { myDoc = NULL; numdocs = 0; } 00043 00046 KeyfileDocMgr(const char *name); 00047 00052 KeyfileDocMgr(string name, string mode, string source); 00053 00054 virtual ~KeyfileDocMgr(); 00055 00057 char* getDoc(const char* docID); 00059 virtual char* handleDoc(char * docno); 00061 virtual void handleEndDoc(); 00063 virtual char *handleWord(char * word) { 00064 if (word != NULL) { 00065 int end = myparser->fileTell() - 1; 00066 int start = (end - strlen(word)) + 1; 00067 Match m; 00068 m.start = start - docEntry.offset; 00069 m.end = end - docEntry.offset; 00070 offsets.push_back(m); 00071 } 00072 return word; 00073 } 00075 virtual void setParser(Parser *p) { 00076 myparser = p; 00077 } 00078 00081 virtual void buildMgr(); 00083 virtual const char *getMyID() { 00084 return IDnameext.c_str(); 00085 } 00086 00090 vector<Match> getOffsets(char *docID); 00091 00093 virtual bool open(const char* manname) { 00094 string tmp(manname); 00095 IDname = tmp.substr(0, tmp.length() - 4); 00096 return loadTOC(); 00097 } 00098 00099 protected: 00100 struct btl { 00101 int fid; 00102 long offset; 00103 long bytes; 00104 }; 00105 00106 virtual void writeTOC(); 00107 virtual bool loadTOC(); 00108 bool loadFTFiles(const char* fn, int num); 00109 vector <Match> offsets; 00110 int numdocs; // how many docs we have 00111 string pm; // parse mode 00112 00113 Keyfile poslookup; // btree for lookup to positions list. 00114 Keyfile doclookup; // btree for lookup to doc start. 00115 int dbcache; 00116 00117 btl docEntry; 00118 char *myDoc; 00119 int doclen; 00120 string IDname; // my name 00121 string IDnameext; // my name w/ extension 00122 vector<string> sources; // list of all source files 00123 int fileid; // fileid of current/last file being processed 00124 }; 00125 00126 #endif // _LEMUR_KEYFILE_DOCMGR_HPP