00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.cs.cmu.edu/~lemur/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 #ifndef _INVPUSHINDEX_HPP 00014 #define _INVPUSHINDEX_HPP 00015 00017 00024 /* 00025 * NAME DATE - COMMENTS 00026 * tnt 01/02 - created 00027 ======================================================================*/ 00028 #include "common_headers.hpp" 00029 #include "PushIndex.hpp" 00030 #include "MemCache.hpp" 00031 #include "InvFPTypes.hpp" 00032 #include "InvDocList.hpp" 00033 #include "InvFPTerm.hpp" 00034 #include "InvIndexMerge.hpp" 00035 00036 00037 typedef map<char*, InvDocList*, ltstr> TABLE_T; 00038 00039 class InvPushIndex : public PushIndex { 00040 public: 00041 InvPushIndex(){ }; 00042 InvPushIndex(char* prefix, int cachesize=128000000, long maxfilesize=2100000000, DOCID_T startdocid=1); 00043 ~InvPushIndex(); 00044 00046 void setName(char* prefix); 00047 00049 bool beginDoc(DocumentProps* dp); 00050 00052 bool addTerm(Term& t); 00053 00055 void endDoc(DocumentProps* dp); 00056 00058 virtual void endDoc(DocumentProps* dp, const char* mgr); 00059 00061 void endCollection(CollectionProps* cp); 00062 00064 void setDocManager(const char* mgrID); 00065 00066 protected: 00067 void writeTOC(int numinv); 00068 void writeDocIDs(); 00069 void writeCache(); 00070 void lastWriteCache(); 00071 void writeDTIDs(); 00072 void writeDocMgrIDs(); 00075 int docMgrID(const char* mgr); 00076 virtual void doendDoc(DocumentProps* dp, int mgrid); 00077 00078 long maxfile; 00079 MemCache* cache; 00080 vector<char*> docIDs; 00081 vector<char*> termIDs; 00082 vector<char*> tempfiles; 00083 vector<char*> dtfiles; 00084 vector<char*> docmgrs; // the list of doc managers we have (index = id) 00085 00086 FILE* writetlookup; 00087 ofstream writetlist; 00088 00089 int tcount; 00090 int tidcount ; 00091 int dtidcount; 00092 char* name; 00093 int namelen; 00094 TABLE_T wordtable; 00095 map<int, int> termlist; 00096 int* membuf; 00097 int membufsize; // how much memory we have 00098 int curdocmgr; // the current docmanager to use 00099 }; 00100 00101 #endif