00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _BASICINDEX_HPP
00014 #define _BASICINDEX_HPP
00015
00017
00025
00026
00027
00028
00029
00030 #include <cstdio>
00031 #include <ctime>
00032 #include "common_headers.hpp"
00033 #include "Compress.hpp"
00034 #include "DocStream.hpp"
00035 #include "FastList.hpp"
00036 #include "IndexCount.hpp"
00037 #include "MemList.hpp"
00038 #include "String.hpp"
00039 #include "Terms.hpp"
00040 #include "Index.hpp"
00041 #include "DocStream.hpp"
00042
00043 class BasicIndex : public Index {
00044 public:
00046 BasicIndex();
00047
00049 BasicIndex (Compress * pc);
00050
00051 virtual ~BasicIndex();
00052
00054 virtual bool open(const string &indexName);
00055
00057
00058
00060 virtual int term (const string &word) const { return terms[word];}
00061
00063 virtual const string term (int termID) const { return terms[termID];}
00064
00066 virtual int document (const string &docIDStr) const {return docids[docIDStr];}
00067
00069 virtual const string document (int docID) const {return docids[docID];}
00070
00072 virtual const string termLexiconID() const { return wordVocabulary;}
00074
00076
00077
00079 virtual int docCount () const { return docids.size()-1;}
00080
00082 virtual int termCountUnique () const { return terms.size()-1;}
00083
00085 virtual int termCount (int termID) const { return countOfTerm[termID] ;}
00086
00088
00089 virtual int termCount () const {return numWords ;}
00090
00091
00093 virtual float docLengthAvg() const { return avgDocumentLength;}
00094
00096 virtual int docCount(int termID) const;
00097
00099 virtual int docLength (int docID) const { return countOfDoc[docID];} ;
00100
00102
00104
00105
00106 virtual DocInfoList *docInfoList(int termID) const ;
00107
00109 virtual TermInfoList *termInfoList(int docID) const ;
00110
00112
00113
00114
00115 void build(DocStream *collectionStream,
00116 const string &file,
00117 const string &outputPrefix,
00118 int totalDocs=0x1000000, int maxMemory=0x4000000,
00119 int minimumCount=1, int maxVocSize=2000000);
00120 private:
00121 void buildVocabulary(int maxVocSize, int minimumCount);
00122 void writeWordIndex(int indexNum, FastList<IndexCount> * dlw);
00123 int indexCollection();
00124 int headDocIndex();
00125 int headWordIndex();
00126 void createKeys();
00127 void mergeIndexFiles();
00128 void createKey(const string &inName, const string &outName,
00129 Terms & voc, int * byteOffset);
00130 int mergePair(const string &fn1, const string &fn2, const string &fn3);
00131 void writeIndexFile();
00132
00133 ifstream textStream;
00134
00135 String prefix;
00136 String textFile;
00137 String wordVocabulary;
00138 String documentVocabulary;
00139 String wordIndexFile, documentIndexFile;
00140 String wordKeyFile, documentKeyFile;
00141 Terms terms;
00142 Terms docids;
00143 int numDocuments;
00144 int numWords;
00145 int numBytes;
00146 int maxDocumentLength;
00147 float avgDocumentLength;
00148 int totalDocuments;
00149
00150
00151 private:
00152 int memorySegment;
00153 int maxSegmentsPerIndex;
00154 time_t timeToIndex;
00155 int maximumMemory;
00156
00157 MemList * pMemList;
00158 Compress * pCompressor;
00159 bool deleteCompressor;
00160 DocStream * pDocStream;
00161
00162 private:
00163
00164 mutable ifstream wordIndexStream, documentIndexStream;
00165 int * woffset, * doffset;
00166 int * tmpdarr, * tmpwarr;
00167 int * countOfTerm;
00168 int * countOfDoc;
00169
00170 };
00171
00172 #endif