Main Page   Namespace List   Class Hierarchy   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

InvIndex.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.cs.cmu.edu/~lemur/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 #ifndef _INVINDEX_HPP
00014 #define _INVINDEX_HPP
00015 
00017 #include "common_headers.hpp"
00018 #include "Index.hpp"
00019 #include "InvDocList.hpp"
00020 #include "InvTermList.hpp"
00021 #include "InvFPTypes.hpp"
00022 #include "Param.hpp"
00023 
00024 // for counts array: each macro below is an index into InvIndex::counts
00025 #define UNIQUE_TERMS 0  // slot read by InvIndex::termCountUnique()
00026 #define TOTAL_TERMS  1  // slot read by InvIndex::termCount()
00027 #define DOCS         2  // slot read by InvIndex::docCount()
00028 #define DT_FILES     3  // presumably the number of dt index files (cf. InvIndex::dtfiles) -- TODO confirm
00029 #define INV_FILES    4  // presumably the number of inv index files (cf. InvIndex::invfiles) -- TODO confirm
00030 
00031 #define NAMES_SIZE   8  // matches the count of names-array indices defined below (0..7); presumably the array length -- TODO confirm
00032 // for names array: each macro below is an index into InvIndex::names, which
00032 // holds the file names needed for this db. What each slot names is only
00032 // suggested by the macro identifiers -- verify against the .cpp before relying on it.
00033 #define DOC_INDEX    0
00034 #define DOC_LOOKUP   1
00035 #define TERM_INDEX   2
00036 #define TERM_LOOKUP  3
00037 #define TERM_IDS     4
00038 #define DOC_IDS      5
00039 #define DOCMGR_IDS   6
00040 #define VERSION_NUM  7
00041 
00042 class InvIndex : public Index {  // Index subclass backed by in-memory lookup arrays/tables (see protected members); implementation lives outside this header
00043 public:
00044    InvIndex();  // default constructor; does not take an index name (cf. open())
00045    InvIndex(const char* indexName);  // presumably constructs and opens indexName in one step -- TODO confirm in the .cpp
00046   ~InvIndex(); 
00047 
00049 // ---- opening an index ----
00050 
00052   bool open(const char* indexName);  // open the index identified by indexName; presumably returns true on success -- TODO confirm
00054 
00056 // ---- spelling <-> ID lookups ----
00057 
00059   int term(const char* word);  // look up a term by spelling; presumably returns its term ID (cf. termtable) -- value for unknown words not visible here
00060 
00062   const char* term(int termID);  // inverse lookup; presumably returns the spelling stored in terms[termID] -- TODO confirm
00063 
00065   int document(const char* docIDStr);  // look up a document by external ID string; presumably returns its internal doc ID (cf. doctable)
00066 
00068   const char* document(int docID);  // inverse lookup; presumably returns the external ID from docnames[docID] -- TODO confirm
00069 
00070   const char* docManager(int docID);  // presumably names the document manager for docID (cf. docmgrs) -- TODO confirm
00071 
00073 // ---- collection statistics ----
00075 
00076 
00078   int docCount() { return counts[DOCS]; };  // returns counts[DOCS]; NOTE(review): not const, unlike termCount() below
00079 
00081   int termCountUnique() { return counts[UNIQUE_TERMS]; };  // returns counts[UNIQUE_TERMS]; NOTE(review): not const, unlike termCount() below
00082 
00084   int termCount(int termID) const;  // per-term count; presumably total occurrences of termID in the collection -- TODO confirm
00085 
00087   int termCount() const { return counts[TOTAL_TERMS]; };  // returns counts[TOTAL_TERMS]
00088 
00090   float docLengthAvg();  // presumably returns aveDocLen (computing it if needed) -- TODO confirm
00091 
00093   int docCount(int termID);  // per-term overload; presumably the number of documents containing termID -- TODO confirm
00094 
00096   int docLength(DOCID_T docID) const;  // length of document docID; NOTE(review): takes DOCID_T where sibling methods take int
00097 
00099   int docLengthCounted(int docID);  // NOTE(review): how this differs from docLength() is not visible in this header
00100 
00102 // ---- posting / term-vector access ----
00104 
00105 
00106   DocInfoList* docInfoList(int termID);  // per-term document list; ownership of the returned pointer is not visible here -- confirm who deletes it
00107 
00109   TermInfoList* termInfoList(int docID);  // per-document term list; ownership of the returned pointer is not visible here -- confirm who deletes it
00110 
00112 
00114  void setMesgStream(ostream * lemStream);  // presumably stores lemStream in msgstream for later messages -- TODO confirm
00115 
00116 protected:
        // The bool-returning helpers below presumably load one piece of index
        // state each and return true on success; their bodies are not visible
        // in this header -- TODO confirm against the .cpp.
00118   bool fullToc(const char* fileName);
00120   bool indexLookup();
00122   bool invFileIDs();
00124   bool docMgrIDs();
00126   bool dtLookup();
00128   bool dtLookup_ver1();  // NOTE(review): name suggests a loader for an older on-disk format (cf. VERSION_NUM) -- confirm
00130   bool dtFileIDs();
00132   bool termIDs();
00134   bool docIDs();
00135 
00136 
00137   int* counts;    // array to hold all the overall count stats of this db (indexed by UNIQUE_TERMS..INV_FILES)
00138   char** names;   // array to hold all the names for files we need for this db (indexed by DOC_INDEX..VERSION_NUM)
00139   float aveDocLen; // the average document length in this index
00140   inv_entry* lookup;  // the array holding entries (index is termid)
00141   dt_entry* dtlookup; // the array holding entries to dt index (index of array is docid)
00142   int dtloaded; // indicate load status of the dt index (loaded or not)
00143   TERM_T* terms;   // array of the term spellings (index is termid)
00144   EXDOCID_T* docnames; // array of the external docids (index is docid)
00145   char** dtfiles; // array of dt index filenames
00146   char** invfiles; // array of inv index filenames
00147   vector<char*> docmgrs; // list of document managers
00148   map<TERM_T, TERMID_T, ltstr> termtable; // table of terms to termid
00149   map<EXDOCID_T, DOCID_T, ltstr> doctable; // table of exdocids to docid
00150   ostream* msgstream; // Lemur code messages stream             
00151 };
00152 
00153 #endif

Generated on Mon Sep 30 14:13:22 2002 for LEMUR by doxygen 1.2.18