00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.cs.cmu.edu/~lemur/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 #ifndef _INVINDEX_HPP 00014 #define _INVINDEX_HPP 00015 00017 #include "common_headers.hpp" 00018 #include "Index.hpp" 00019 #include "InvDocList.hpp" 00020 #include "InvTermList.hpp" 00021 #include "InvFPTypes.hpp" 00022 #include "Param.hpp" 00023 00024 // for counts array 00025 #define UNIQUE_TERMS 0 00026 #define TOTAL_TERMS 1 00027 #define DOCS 2 00028 #define DT_FILES 3 00029 #define INV_FILES 4 00030 00031 #define NAMES_SIZE 8 00032 // for names array 00033 #define DOC_INDEX 0 00034 #define DOC_LOOKUP 1 00035 #define TERM_INDEX 2 00036 #define TERM_LOOKUP 3 00037 #define TERM_IDS 4 00038 #define DOC_IDS 5 00039 #define DOCMGR_IDS 6 00040 #define VERSION_NUM 7 00041 00042 class InvIndex : public Index { 00043 public: 00044 InvIndex(); 00045 InvIndex(const char* indexName); 00046 ~InvIndex(); 00047 00049 00050 00052 bool open(const char* indexName); 00054 00056 00057 00059 int term(const char* word); 00060 00062 const char* term(int termID); 00063 00065 int document(const char* docIDStr); 00066 00068 const char* document(int docID); 00069 00070 const char* docManager(int docID); 00071 00073 00075 00076 00078 int docCount() { return counts[DOCS]; }; 00079 00081 int termCountUnique() { return counts[UNIQUE_TERMS]; }; 00082 00084 int termCount(int termID) const; 00085 00087 int termCount() const { return counts[TOTAL_TERMS]; }; 00088 00090 float docLengthAvg(); 00091 00093 int docCount(int termID); 00094 00096 int docLength(DOCID_T docID) const; 00097 00099 int docLengthCounted(int docID); 00100 00102 00104 00105 00106 DocInfoList* docInfoList(int termID); 00107 00109 TermInfoList* termInfoList(int docID); 00110 00112 00114 void setMesgStream(ostream * lemStream); 00115 00116 protected: 00118 bool fullToc(const char* fileName); 00120 bool indexLookup(); 00122 bool invFileIDs(); 00124 bool docMgrIDs(); 00126 bool dtLookup(); 00128 bool dtLookup_ver1(); 00130 bool dtFileIDs(); 00132 bool termIDs(); 00134 bool docIDs(); 00135 00136 00137 int* counts; // array to hold all the overall count stats of this db 00138 char** names; // array to hold all the names for files we need for this db 00139 float aveDocLen; // the average document length in this index 00140 inv_entry* lookup; // the array holding entries (index is termid) 00141 dt_entry* dtlookup; // the array holding entries to dt index (index of array is docid) 00142 int dtloaded; // indicate load status of the dt index (loaded or not) 00143 TERM_T* terms; // array of the term spellings (index is termid) 00144 EXDOCID_T* docnames; // array of the external docids (index is docid) 00145 char** dtfiles; // array of dt index filenames 00146 char** invfiles; // array of inv index filenames 00147 vector<char*> docmgrs; // list of document managers 00148 map<TERM_T, TERMID_T, ltstr> termtable; // table of terms to termid 00149 map<EXDOCID_T, DOCID_T, ltstr> doctable; // table of exdocids to docid 00150 ostream* msgstream; // Lemur code messages stream 00151 }; 00152 00153 #endif