00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.cs.cmu.edu/~lemur/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 #ifndef _INVFPPUSHINDEX_HPP 00014 #define _INVFPPUSHINDEX_HPP 00015 00017 00024 /* 00025 * NAME DATE - COMMENTS 00026 * tnt 03/01 - created 00027 ======================================================================*/ 00028 #include "common_headers.hpp" 00029 #include "PushIndex.hpp" 00030 #include "MemCache.hpp" 00031 #include "InvFPTypes.hpp" 00032 #include "InvFPDocList.hpp" 00033 #include "InvFPTerm.hpp" 00034 #include "InvFPIndexMerge.hpp" 00035 00036 00037 typedef map<char*, InvFPDocList*, ltstr> TABLE_T; 00038 00039 class InvFPPushIndex : public PushIndex { 00040 public: 00041 InvFPPushIndex(char* prefix="DefaultIndex", int cachesize=128000000, long maxfilesize=2100000000, DOCID_T startdocid=1); 00042 ~InvFPPushIndex(); 00043 00045 void setName(char* prefix); 00046 00048 bool beginDoc(DocumentProps* dp); 00049 00051 bool addTerm(Term& t); 00052 00054 void endDoc(DocumentProps* dp); 00055 00057 void endCollection(CollectionProps* cp); 00058 00059 00060 private: 00061 void writeTOC(int numinv); 00062 void writeDocIDs(); 00063 void writeDTIDs(); 00064 void writeCache(); 00065 void lastWriteCache(); 00066 00067 long maxfile; 00068 MemCache* cache; 00069 // FILE* writetlist; /// filestream for writing the list of located terms for each document 00070 ofstream writetlist; 00071 FILE* writetlookup; 00072 vector<LocatedTerm> termlist; 00073 vector<char*> docIDs; 00074 vector<char*> termIDs; 00075 vector<char*> tempfiles; 00076 vector<char*> dtfiles; 00077 00078 int tcount; 00079 int tidcount ; 00080 int dtidcount; 00081 char* name; 00082 int namelen; 00083 TABLE_T wordtable; 00084 00085 int* membuf; 00086 int membufsize; // how much memory we have 00087 }; 00088 00089 #endif