BasicIndex.hpp Source File

00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.cs.cmu.edu/~lemur/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 #ifndef _BASICINDEX_HPP
00014 #define _BASICINDEX_HPP
00015 
00017 
00025 //--------------------------------------------------------
00026 // originally written by J. Lafferty, 11/00
00027 //-----------------------------------------------------------------
00028 
00029 
00030 #include <cstdio>
00031 #include <ctime>
00032 #include "common_headers.hpp"
00033 #include "Compress.hpp"
00034 #include "DocStream.hpp"
00035 #include "FastList.hpp"
00036 #include "IndexCount.hpp"
00037 #include "MemList.hpp"
00038 #include "String.hpp"
00039 #include "Terms.hpp"
00040 #include "Index.hpp"
00041 #include "DocStream.hpp"
00042 
/// Concrete Index implementation that answers lookups from two Terms
/// tables (`terms`, `docids`) and two per-id count arrays (`countOfTerm`,
/// `countOfDoc`), and that can also construct an index via build().
///
/// Only the trivial accessors are defined here; constructors, destructor,
/// open(), build() and all private helpers are defined out of line.
///
/// NOTE(review): the virtual members presumably implement the interface
/// declared by Index (Index.hpp is not visible from this file) -- confirm.
/// NOTE(review): some accessors are const (termCount, docLength) while
/// others are not (docCount, termCountUnique, docLengthAvg); presumably
/// this mirrors the base-class signatures -- confirm before changing.
00043 class BasicIndex : public Index {
00044 public:
        /// Default constructor (defined out of line).
00046   BasicIndex();
00047 
        /// Construct with a caller-supplied Compress object.
        /// NOTE(review): presumably stored in pCompressor, with
        /// deleteCompressor recording whether this object must free it --
        /// confirm against the definition. The constructor is not
        /// `explicit`, so a Compress* converts implicitly to a BasicIndex.
00049   BasicIndex (Compress * pc);
00050 
        /// Destructor (defined out of line).
00051   virtual ~BasicIndex();
00052   
        /// Open the index identified by indexName; returns a bool status.
        /// NOTE(review): presumably true means success -- confirm.
00054   virtual bool open(const char * indexName);
00055 
00057 
00058 
        /// Map a word string to its integer term id by indexing `terms`.
00060   virtual int term (const char * word) { return terms[word];}
00061 
        /// Map an integer term id back to its word string by indexing `terms`.
00063   virtual const char * term (int termID) { return terms[termID];} 
00064 
        /// Map an external document id string to its integer id by
        /// indexing `docids`.
00066   virtual int document (const char * docIDStr) {return docids[docIDStr];}
00067 
        /// Map an integer document id back to its external id string by
        /// indexing `docids`.
00069   virtual const char * document (int docID) {return docids[docID];}
00070 
        /// Return the `wordVocabulary` member as a C string (relies on
        /// String being convertible to const char *).
        /// NOTE(review): the pointer presumably refers to storage owned by
        /// wordVocabulary and is only valid while this object lives -- confirm.
00072   virtual const char *termLexiconID() { return wordVocabulary;} 
00074 
00076 
00077 
        /// Number of documents, computed as docids.size() - 1.
        /// NOTE(review): the -1 presumably excludes a reserved entry
        /// (e.g. an out-of-vocabulary slot at id 0) -- confirm in Terms.
00079   virtual int docCount ()  { return docids.size()-1;}
00080 
        /// Number of distinct terms, computed as terms.size() - 1
        /// (same reserved-entry assumption as docCount()).
00082   virtual int termCountUnique () { return terms.size()-1;}
00083 
        /// Count stored for termID, read straight from countOfTerm[termID].
        /// No bounds or null check is performed here.
00085   virtual int termCount (int termID) const { return countOfTerm[termID] ;}
00086 
        /// Total term count of the collection, returned from numWords.
        /// The commented-out line below is an earlier variant that derived
        /// the total from avgDocumentLength and the document count.
00088   //  virtual int termCount () const {return (int)avgDocumentLength*(docids.size()-1) ;}
00089   virtual int termCount () const {return numWords ;}
00090   // XXX Better to have an exact count!
00091 
        /// Average document length, returned from avgDocumentLength.
00093   virtual float docLengthAvg() { return avgDocumentLength;}
00094 
        /// Per-term document count (defined out of line).
        /// NOTE(review): presumably the number of documents containing
        /// termID -- confirm against the definition.
00096   virtual int docCount(int termID);
00097 
        /// Length stored for docID, read straight from countOfDoc[docID].
        /// No bounds or null check is performed here. The `;` after the
        /// body is a redundant empty declaration.
00099   virtual int docLength (int docID) const { return countOfDoc[docID];}  ;
00100 
00102 
00104 
00105 
        /// Return a DocInfoList for termID (defined out of line).
        /// NOTE(review): ownership of the returned pointer is not visible
        /// here; presumably the caller must delete it -- confirm.
00106   virtual DocInfoList *docInfoList(int termID) ;
00107 
        /// Return a TermInfoList for docID (defined out of line).
        /// NOTE(review): same ownership question as docInfoList().
00109   virtual TermInfoList *termInfoList(int docID) ;
00110 
00112 
00113 
        /// Build an index from a document stream (defined out of line).
        ///   collectionStream  source of documents
        ///   file              NOTE(review): presumably the collection text
        ///                     file recorded in `textFile` -- confirm
        ///   outputPrefix      NOTE(review): presumably the prefix for the
        ///                     generated file names (`prefix`) -- confirm
        ///   totalDocs         default 0x1000000 (16,777,216)
        ///   maxMemory         default 0x4000000 (67,108,864); unit is
        ///                     presumably bytes -- confirm
        ///   minimumCount      default 1; passed-through meaning not visible
        ///                     here (see buildVocabulary)
        ///   maxVocSize        default 2000000
00114   // Create basic inverted indices 
00115   void build(DocStream *collectionStream,
00116              const char *file,
00117              const char * outputPrefix, 
00118              int totalDocs=0x1000000, int maxMemory=0x4000000,
00119              int minimumCount=1, int maxVocSize=2000000);
00120 private:
        // Index-construction helpers; all are defined out of line, so their
        // exact behaviour and return-value conventions are not visible here.
00121   void buildVocabulary(int maxVocSize, int minimumCount);
00122   void writeWordIndex(int indexNum, FastList<IndexCount> * dlw);
00123   int indexCollection();
00124   int headDocIndex();
00125   int headWordIndex();
00126   void createKeys();
00127   void mergeIndexFiles();
00128   void createKey(const char * inName, const char * outName, Terms & voc, int * byteOffset);
00129   int mergePair(const char * fn1, const char * fn2, const char * fn3);
00130   void writeIndexFile();
00131 
        // NOTE(review): `ifstream` is used unqualified; presumably a using
        // declaration arrives via common_headers.hpp -- confirm.
00132   ifstream textStream;
00133 
        // File names and vocabularies. `terms` backs term() and
        // termCountUnique(); `docids` backs document() and docCount().
00134   String      prefix;
00135   String      textFile;
00136   String      wordVocabulary;
00137   String      documentVocabulary;
00138   String      wordIndexFile, documentIndexFile;
00139   String      wordKeyFile, documentKeyFile;
00140   Terms       terms;
00141   Terms       docids;
        // Collection statistics. numWords is returned by termCount() and
        // avgDocumentLength by docLengthAvg().
00142   int         numDocuments;
00143   int         numWords;
00144   int         numBytes;
00145   int         maxDocumentLength;
00146   float       avgDocumentLength;
00147   int         totalDocuments;
00148 
00149   // The following are "utility" variables
00150 private:
00151   int         memorySegment;
00152   int         maxSegmentsPerIndex;
00153   time_t      timeToIndex;
00154   int         maximumMemory;
00155   // int *       byteOffsetOfDoc;
        // NOTE(review): raw pointers; who allocates and frees them is not
        // visible here. deleteCompressor presumably records whether
        // pCompressor is owned by this object -- confirm in the destructor.
00156   MemList *   pMemList;  
00157   Compress *  pCompressor;
00158   bool deleteCompressor;
00159   DocStream * pDocStream;
00160 
00161 private:
00162   // fields for managing indices
00163   ifstream  wordIndexStream, documentIndexStream;
        // NOTE(review): woffset/doffset are presumably per-word and
        // per-document offsets into the index files, and tmpdarr/tmpwarr
        // scratch buffers -- confirm against the .cpp.
00164   int * woffset, * doffset; //  *toffset;
00165   int * tmpdarr, * tmpwarr;
        // Indexed by term id in termCount(int).
00166   int * countOfTerm;
        // Indexed by document id in docLength(int).
00167   int * countOfDoc;
00168 
00169 };
00170 
00171 #endif