00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.cs.cmu.edu/~lemur/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 /* 00012 10/22/2002 -- dmf Add arrays dtfstreams and invfstreams to hold open 00013 ifstreams for the inverted list files so that each is opened and closed 00014 only once. 00015 */ 00016 00017 #ifndef _INVINDEX_HPP 00018 #define _INVINDEX_HPP 00019 00021 #include "common_headers.hpp" 00022 #include "Index.hpp" 00023 #include "InvDocList.hpp" 00024 #include "InvTermList.hpp" 00025 #include "InvFPTypes.hpp" 00026 #include "Param.hpp" 00027 #include "DocMgrManager.hpp" 00028 00029 // for counts array 00030 #define UNIQUE_TERMS 0 00031 #define TOTAL_TERMS 1 00032 #define DOCS 2 00033 #define DT_FILES 3 00034 #define INV_FILES 4 00035 00036 #define NAMES_SIZE 8 00037 // for names array 00038 #define DOC_INDEX 0 00039 #define DOC_LOOKUP 1 00040 #define TERM_INDEX 2 00041 #define TERM_LOOKUP 3 00042 #define TERM_IDS 4 00043 #define DOC_IDS 5 00044 #define DOCMGR_IDS 6 00045 #define VERSION_NUM 7 00046 00047 class InvIndex : public Index { 00048 public: 00049 InvIndex(); 00050 InvIndex(const char* indexName); 00051 ~InvIndex(); 00052 00054 00055 00057 bool open(const char* indexName); 00059 00061 00062 00064 int term(const char* word); 00065 00067 const char* term(int termID); 00068 00070 int document(const char* docIDStr); 00071 00073 const char* document(int docID); 00074 00075 // const char* docManager(int docID); 00076 DocumentManager* docManager(int docID); 00077 00079 00081 00082 00084 int docCount() { return counts[DOCS]; }; 00085 00087 int termCountUnique() { return counts[UNIQUE_TERMS]; }; 00088 00090 int termCount(int termID) const; 00091 00093 int termCount() const { return counts[TOTAL_TERMS]; }; 00094 00096 float docLengthAvg(); 00097 00099 int docCount(int termID); 00100 00102 int docLength(DOCID_T docID) const; 00103 00105 virtual int docLengthCounted(int docID); 00106 00108 00110 00111 00112 DocInfoList* docInfoList(int termID); 00113 00115 TermInfoList* termInfoList(int docID); 00116 00118 00120 void setMesgStream(ostream * lemStream); 00121 00122 protected: 00124 bool fullToc(const char* fileName); 00126 bool indexLookup(); 00128 bool invFileIDs(); 00130 bool docMgrIDs(); 00132 bool dtLookup(); 00134 bool dtLookup_ver1(); 00136 bool dtFileIDs(); 00138 bool termIDs(); 00140 bool docIDs(); 00141 00142 00143 int* counts; // array to hold all the overall count stats of this db 00144 char** names; // array to hold all the names for files we need for this db 00145 float aveDocLen; // the average document length in this index 00146 inv_entry* lookup; // the array holding entries (index is termid) 00147 dt_entry* dtlookup; // the array holding entries to dt index (index of array is docid) 00148 int dtloaded; // indicate load status of the dt index (loaded or not) 00149 TERM_T* terms; // array of the term spellings (index is termid) 00150 EXDOCID_T* docnames; // array of the external docids (index is docid) 00151 char** dtfiles; // array of dt index filenames 00152 ifstream *dtfstreams; // array of dt index input streams 00153 char** invfiles; // array of inv index filenames 00154 ifstream *invfstreams; // array of inv index input streams 00155 // vector<char*> docmgrs; // list of document managers 00156 vector<DocumentManager*> docmgrs; // list of document managers 00157 map<TERM_T, TERMID_T, ltstr> termtable; // table of terms to termid 00158 map<EXDOCID_T, DOCID_T, ltstr> doctable; // table of exdocids to docid 00159 ostream* msgstream; // Lemur code messages stream 00160 }; 00161 00162 #endif