InvIndex.hpp Source File

00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.cs.cmu.edu/~lemur/license.html
00008  *
00009  *==========================================================================
00010 */
00011 /*
00012   10/22/2002 -- dmf Add arrays dtfstreams and invfstreams to hold open
00013   ifstreams for the inverted list files so that each is opened and closed
00014   only once.
00015  */
00016 
00017 #ifndef _INVINDEX_HPP
00018 #define _INVINDEX_HPP
00019 
00021 #include "common_headers.hpp"
00022 #include "Index.hpp"
00023 #include "InvDocList.hpp"
00024 #include "InvTermList.hpp"
00025 #include "InvFPTypes.hpp"
00026 #include "Param.hpp"
00027 #include "DocMgrManager.hpp"
00028 
00029 // Indices into the counts array (InvIndex::counts, the overall count stats of the db)
00030 #define UNIQUE_TERMS 0  // read by InvIndex::termCountUnique()
00031 #define TOTAL_TERMS  1  // read by InvIndex::termCount()
00032 #define DOCS         2  // read by InvIndex::docCount()
00033 #define DT_FILES     3  // presumably the number of dt index files -- confirm in the implementation
00034 #define INV_FILES    4  // presumably the number of inv index files -- confirm in the implementation
00035 
00036 #define NAMES_SIZE   8  // eight slots (0..7) are defined below; presumably the length of the names array -- confirm
00037 // Indices into the names array (InvIndex::names, the file names needed for the db).
00038 #define DOC_INDEX    0
00039 #define DOC_LOOKUP   1
00040 #define TERM_INDEX   2
00041 #define TERM_LOOKUP  3
00042 #define TERM_IDS     4
00043 #define DOC_IDS      5
00044 #define DOCMGR_IDS   6
00045 #define VERSION_NUM  7  // NOTE(review): what each slot holds is assigned outside this header -- verify there
00046 /// Index subclass for an inverted index: declares id/spelling conversion, collection counts, and per-term / per-document list access.
00047 class InvIndex : public Index {
00048 public:
00049    InvIndex();  ///< Default constructor; takes no index name.
00050    InvIndex(const char* indexName);  ///< Construct from an index name (presumably opens it as open() does -- confirm in the implementation).
00051   ~InvIndex();  ///< Destructor; defined outside this header.
00052   // NOTE(review): raw-pointer members and a destructor are declared but no copy constructor/assignment; confirm instances are never copied.
00054   // ---- Opening an index ----
00055   /// Open the index named by indexName; returns a bool status (presumably true on success -- confirm).
00057   bool open(const char* indexName);
00059 
00061   // ---- Spelling and id conversion ----
00062   /// Return the integer term id for the spelling `word` (see termtable); the result for an unknown word is not visible here.
00064   int term(const char* word);
00065   /// Return the spelling for termID (see terms, which is indexed by term id).
00067   const char* term(int termID);
00068   /// Return the internal document id for the external id string docIDStr (see doctable).
00070   int document(const char* docIDStr);
00071   /// Return the external id string for docID (see docnames, which is indexed by doc id).
00073   const char* document(int docID); 
00074   /// Return a DocumentManager pointer for docID (presumably taken from docmgrs -- confirm); an alternative signature is left commented out below.
00075   //  const char* docManager(int docID);
00076   DocumentManager* docManager(int docID);
00077 
00079 
00081   // ---- Summary counts ----
00082   /// Number of documents: returns counts[DOCS].
00084   int docCount() { return counts[DOCS]; };
00085   /// Number of unique terms: returns counts[UNIQUE_TERMS].
00087   int termCountUnique() { return counts[UNIQUE_TERMS]; };
00088   /// Count for the single term termID (presumably its occurrences over the whole collection -- confirm).
00090   int termCount(int termID) const;
00091   /// Total number of terms: returns counts[TOTAL_TERMS].
00093   int termCount() const { return counts[TOTAL_TERMS]; };
00094   /// Average document length as a float (see aveDocLen).
00096   float docLengthAvg();
00097   /// Document count for term termID (presumably the number of documents containing it -- confirm).
00099   int docCount(int termID);
00100   /// Length of document docID.
00102   int docLength(DOCID_T docID) const;
00103   /// Virtual document-length query for docID; how it differs from docLength() is not visible in this header.
00105   virtual int docLengthCounted(int docID);
00106 
00108 
00110   // ---- Index list access ----
00111   /// Return a DocInfoList pointer for termID. NOTE(review): ownership of the returned pointer is not stated here -- confirm.
00112   DocInfoList* docInfoList(int termID);
00113   /// Return a TermInfoList pointer for docID. NOTE(review): ownership of the returned pointer is not stated here -- confirm.
00115   TermInfoList* termInfoList(int docID);
00116 
00118   /// Supply the ostream to use for messages (presumably stored in msgstream -- confirm).
00120  void setMesgStream(ostream * lemStream);
00121 
00122 protected:  // The loaders below all return bool; their bodies are not in this header, so the notes on them are assumptions.
00124   bool fullToc(const char* fileName);  // presumably reads the table-of-contents file fileName -- confirm
00126   bool indexLookup();  // presumably fills lookup -- confirm
00128   bool invFileIDs();  // presumably fills invfiles -- confirm
00130   bool docMgrIDs();  // presumably fills docmgrs -- confirm
00132   bool dtLookup();  // presumably fills dtlookup -- confirm
00134   bool dtLookup_ver1();  // presumably dtLookup() for an older on-disk format (going by the name) -- confirm
00136   bool dtFileIDs();  // presumably fills dtfiles -- confirm
00138   bool termIDs();  // presumably fills terms / termtable -- confirm
00140   bool docIDs();  // presumably fills docnames / doctable -- confirm
00141 
00142   // ---- Data members ----
00143   int* counts;    // array to hold all the overall count stats of this db (indexed by UNIQUE_TERMS, TOTAL_TERMS, DOCS, ...)
00144   char** names;   // array to hold all the names for files we need for this db (see the "names array" macros)
00145   float aveDocLen; // the average document length in this index
00146   inv_entry* lookup;  // the array holding entries (index is termid)
00147   dt_entry* dtlookup; // the array holding entries to dt index (index of array is docid)
00148   int dtloaded; // indicate load status of the dt index (loaded or not)
00149   TERM_T* terms;   // array of the term spellings (index is termid)
00150   EXDOCID_T* docnames; // array of the external docids (index is docid)
00151   char** dtfiles; // array of dt index filenames
00152   ifstream *dtfstreams; // array of dt index input streams, kept open so each file is opened and closed only once
00153   char** invfiles; // array of inv index filenames
00154   ifstream *invfstreams; // array of inv index input streams, kept open so each file is opened and closed only once
00155   //  vector<char*> docmgrs; // list of document managers
00156   vector<DocumentManager*> docmgrs; // list of document managers
00157   map<TERM_T, TERMID_T, ltstr> termtable; // table of terms to termid
00158   map<EXDOCID_T, DOCID_T, ltstr> doctable; // table of exdocids to docid
00159   ostream* msgstream; // Lemur code messages stream             
00160 };
00161 
00162 #endif