00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.cs.cmu.edu/~lemur/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 #ifndef _INVPUSHINDEX_HPP 00014 #define _INVPUSHINDEX_HPP 00015 00017 00024 /* 00025 * NAME DATE - COMMENTS 00026 * tnt 01/02 - created 00027 ======================================================================*/ 00028 #include "common_headers.hpp" 00029 #include "PushIndex.hpp" 00030 #include "MemCache.hpp" 00031 #include "InvFPTypes.hpp" 00032 #include "InvDocList.hpp" 00033 #include "InvFPTerm.hpp" 00034 #include "InvIndexMerge.hpp" 00035 00036 00037 typedef map<char*, InvDocList*, ltstr> TABLE_T; 00038 00039 class InvPushIndex : public PushIndex { 00040 public: 00041 InvPushIndex(char* prefix="DefaultIndex", int cachesize=128000000, long maxfilesize=2100000000, DOCID_T startdocid=1); 00042 ~InvPushIndex(); 00043 00045 void setName(char* prefix); 00046 00048 bool beginDoc(DocumentProps* dp); 00049 00051 bool addTerm(Term& t); 00052 00054 void endDoc(DocumentProps* dp); 00055 00057 virtual void endDoc(DocumentProps* dp, const char* mgr); 00058 00060 void endCollection(CollectionProps* cp); 00061 00063 void setDocManager(const char* mgrID); 00064 00065 protected: 00066 void writeTOC(int numinv); 00067 void writeDocIDs(); 00068 void writeCache(); 00069 void lastWriteCache(); 00070 void writeDTIDs(); 00071 void writeDocMgrIDs(); 00074 int docMgrID(const char* mgr); 00075 virtual void doendDoc(DocumentProps* dp, int mgrid); 00076 00077 long maxfile; 00078 MemCache* cache; 00079 vector<char*> docIDs; 00080 
vector<char*> termIDs; 00081 vector<char*> tempfiles; 00082 vector<char*> dtfiles; 00083 vector<char*> docmgrs; // the list of doc managers we have (index = id) 00084 00085 FILE* writetlookup; 00086 ofstream writetlist; 00087 00088 int tcount; 00089 int tidcount ; 00090 int dtidcount; 00091 char* name; 00092 int namelen; 00093 TABLE_T wordtable; 00094 map<int, int> termlist; 00095 int* membuf; 00096 int membufsize; // how much memory we have 00097 int curdocmgr; // the current docmanager to use 00098 }; 00099 00100 #endif
// Generated by doxygen 1.2.4, written by Dimitri van Heesch, © 1997-2000