00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 #ifndef _INVPUSHINDEX_HPP 00014 #define _INVPUSHINDEX_HPP 00015 00017 00024 /* 00025 * NAME DATE - COMMENTS 00026 * tnt 01/02 - created 00027 ======================================================================*/ 00028 #include "common_headers.hpp" 00029 #include "PushIndex.hpp" 00030 #include "MemCache.hpp" 00031 #include "InvFPTypes.hpp" 00032 #include "InvDocList.hpp" 00033 #include "InvFPTerm.hpp" 00034 #include "InvIndexMerge.hpp" 00035 00036 00037 typedef map<string, InvDocList*, less<string> > TABLE_T; 00038 00039 class InvPushIndex : public PushIndex { 00040 public: 00041 InvPushIndex(){ }; 00042 InvPushIndex(const string &prefix, int cachesize=128000000, long maxfilesize=2100000000, DOCID_T startdocid=1); 00043 ~InvPushIndex(); 00044 00046 void setName(const string &prefix); 00047 00049 bool beginDoc(const DocumentProps* dp); 00050 00052 bool addTerm(const Term& t); 00053 00055 void endDoc(const DocumentProps* dp); 00056 00058 virtual void endDoc(const DocumentProps* dp, const string &mgr); 00059 00061 void endCollection(const CollectionProps* cp); 00062 00064 void setDocManager(const string &mgrID); 00065 00066 protected: 00067 void writeTOC(int numinv); 00068 void writeDocIDs(); 00069 void writeCache(); 00070 void lastWriteCache(); 00071 void writeDTIDs(); 00072 void writeDocMgrIDs(); 00075 int docMgrID(const string &mgr); 00076 virtual void doendDoc(const DocumentProps* dp, int mgrid); 00077 00078 long maxfile; 00079 MemCache* cache; 00080 vector<EXDOCID_T> docIDs; 00081 vector<TERM_T> termIDs; 00082 vector<string> tempfiles; 00083 vector<string> dtfiles; 00084 vector<string> docmgrs; // the list of doc managers we have (index = id) 00085 00086 FILE* writetlookup; 00087 ofstream writetlist; 00088 00089 COUNT_T tcount; 00090 COUNT_T tidcount ; 00091 COUNT_T dtidcount; 00092 string name; 00093 TABLE_T wordtable; 00094 map<TERMID_T, COUNT_T> termlist; 00095 int* membuf; 00096 int membufsize; // how much memory we have 00097 int curdocmgr; // the current docmanager to use 00098 }; 00099 00100 #endif