#include <KeyfileIncIndex.hpp>
Inheritance diagram for KeyfileIncIndex:

Public Methods | |
| KeyfileIncIndex (const string &prefix, int cachesize=128000000, DOCID_T startdocid=1) | |
| KeyfileIncIndex () | |
| New empty one for index manager to use. | |
| ~KeyfileIncIndex () | |
| Clean up. | |
| void | setName (const string &prefix) |
| sets the name for this index | |
| bool | beginDoc (const DocumentProps *dp) |
| the beginning of a new document | |
| bool | addTerm (const Term &t) |
| adding a term to the current document | |
| void | endDoc (const DocumentProps *dp) |
| signify the end of current document | |
| virtual void | endDoc (const DocumentProps *dp, const string &mgr) |
| signify the end of current document | |
| void | endCollection (const CollectionProps *cp) |
| signify the end of this collection. | |
| void | setDocManager (const string &mgrID) |
| set the document manager to use for succeeding documents | |
| void | setMesgStream (ostream *lemStream) |
| set the message stream | |
| void | addKnownTerm (int termID, int position) |
| update data for an already seen term | |
| int | addUnknownTerm (const InvFPTerm *term) |
| initialize data for a previously unseen term. | |
| int | addUncachedTerm (const InvFPTerm *term) |
| update data for a term that is not cached in the term cache. | |
Open index | |
| bool | open (const string &indexName) |
| Open previously created Index with given prefix. | |
Spelling and index conversion | |
| int | term (const string &word) const |
| Convert a term spelling to a termID. | |
| const string | term (int termID) const |
| Convert a termID to its spelling. | |
| int | document (const string &docIDStr) const |
| Convert a spelling to docID. | |
| const string | document (int docID) const |
| Convert a docID to its spelling. | |
| const DocumentManager * | docManager (int docID) const |
| The document manager for this document. | |
Summary counts | |
| int | docCount () const |
| Total count (i.e., number) of documents in collection. | |
| int | termCountUnique () const |
| Total count of unique terms in collection. | |
| int | termCount (int termID) const |
| Total counts of a term in collection. | |
| int | termCount () const |
| Total counts of all terms in collection. | |
| float | docLengthAvg () const |
| Average document length. | |
| int | docCount (int termID) const |
| Total count of documents containing a given term. | |
| int | docLength (DOCID_T docID) const |
| Total count of terms in a document, possibly including stop words. | |
| virtual int | totaldocLength (int docID) const |
| Total counts of terms in a document including stopwords for sure. | |
| int | docLengthCounted (int docID) const |
| Total count of terms in given document, not including stop words. | |
Index entry access | |
| DocInfoList * | docInfoList (int termID) const |
doc entries in a term index.
| |
| TermInfoList * | termInfoList (int docID) const |
word entries in a document index (bag of words).
| |
| TermInfoList * | termInfoListSeq (int docID) const |
word entries in a document index (sequence of words).
| |
Protected Methods | |
| bool | tryOpen () |
| try to open an existing index | |
| void | writeTOC () |
| write out the table of contents file. | |
| void | writeCache (bool lastRun=false) |
| write out the cache | |
| void | lastWriteCache () |
| final run write out of cache | |
| void | mergeCacheSegments () |
| out-of-tree cache management: combine segments into a single segment | |
| void | writeCacheSegment () |
| write out segments | |
| void | writeDocMgrIDs () |
| write out document manager ids | |
| int | docMgrID (const string &mgr) |
| virtual void | doendDoc (const DocumentProps *dp, int mgrid) |
| handle end of document token. | |
| void | openDBs () |
| open the database files | |
| void | openSegments () |
| open the segment files | |
| void | createDBs () |
| create the database files | |
| void | fullToc () |
| read in the full table of contents | |
| bool | docMgrIDs () |
| read in document manager internal and external ids map | |
| record | fetchDocumentRecord (int key) const |
| retrieve a document record. | |
| void | addDocumentLookup (int documentKey, const char *documentName) |
| store a document record | |
| void | addTermLookup (int termKey, const char *termSpelling) |
| store a term record | |
| void | addGeneralLookup (Keyfile &numberNameIndex, Keyfile &nameNumberIndex, int number, const char *name) |
| store a record | |
| InvFPDocList * | internalDocInfoList (int termID) const |
| retrieve and construct the DocInfoList for a term. | |
| void | _updateTermlist (InvFPDocList *curlist, int position) |
| add a position to a DocInfoList | |
| int | _cacheSize () |
| total memory used by cache | |
| void | _computeMemoryBounds (int memorySize) |
| cache size limits based on cachesize parameter to constructor | |
| void | _resetEstimatePoint () |
| Approximate how many updates to collect before flushing the cache. | |
Protected Attributes | |
| int | listlengths |
| how long all the lists are | |
| int * | counts |
| array to hold all the overall count stats of this db | |
| std::vector< std::string > | names |
| array to hold all the names for files we need for this db | |
| float | aveDocLen |
| the average document length in this index | |
| vector< std::string > | docmgrs |
| list of document managers | |
| ostream * | msgstream |
| Lemur code messages stream. | |
| Keyfile | invlookup |
| termID -> TermData (term statistics and inverted list segment offsets) | |
| Keyfile | dIDs |
| documentName -> documentID | |
| Keyfile | dSTRs |
| documentID -> documentName | |
| Keyfile | tIDs |
| termName -> termID | |
| Keyfile | tSTRs |
| termID -> termName | |
| File | dtlookup |
| document statistics (document length, etc.) | |
| ReadBuffer * | dtlookupReadBuffer |
| read buffer for dtlookup | |
| File | writetlist |
| char | termKey [MAX_TERM_LENGTH] |
| buffers for term() lookup functions | |
| char | docKey [MAX_DOCID_LENGTH] |
| buffers for document() lookup functions | |
| int | _listsSize |
| memory for use by inverted list buffers | |
| int | _memorySize |
| upper bound for memory use | |
| std::string | name |
| the prefix name | |
| vector< InvFPDocList * > | invertlists |
| array of pointers to doclists | |
| vector< LocatedTerm > | termlist |
| list of terms and their locations in this document | |
| int | curdocmgr |
| the current docmanager to use | |
| vector< DocumentManager * > | docMgrs |
| list of document manager objects | |
| TermCache | _cache |
| cache of term entries | |
| std::vector< File * > | _segments |
| out-of-tree segments for data | |
| int | _largestFlushedTermID |
| highest term id flushed to disk. | |
| int | _estimatePoint |
| invertlists point where we should next check on the cache size | |
| bool | ignoreDoc |
| are we in a bad document state? | |
|
||||||||||||||||
|
Instantiate with index name without extension. Optionally pass in cachesize and starting document id number. |
|
|
New empty one for index manager to use.
|
|
|
Clean up.
|
|
|
total memory used by cache
|
|
|
cache size limits based on cachesize parameter to constructor
|
|
|
Approximate how many updates to collect before flushing the cache.
|
|
||||||||||||
|
add a position to a DocInfoList
|
|
||||||||||||
|
store a document record
|
|
||||||||||||||||||||
|
store a record
|
|
||||||||||||
|
update data for an already seen term
|
|
|
adding a term to the current document
Implements PushIndex. |
|
||||||||||||
|
store a term record
|
|
|
update data for a term that is not cached in the term cache.
|
|
|
initialize data for a previously unseen term.
|
|
|
the beginning of a new document
Implements PushIndex. |
|
|
create the database files
|
|
|
Total count of documents containing a given term.
Implements Index. |
|
|
Total count (i.e., number) of documents in collection.
Implements Index. |
|
|
doc entries in a term index.
Implements Index. |
|
|
Total count of terms in a document, possibly including stop words.
|
|
|
Average document length.
Implements Index. |
|
|
Total count of terms in given document, not including stop words.
|
|
|
The document manager for this document.
Reimplemented from Index. |
|
|
Returns the internal id of the given docmgr; if not already registered, mgr will be added. |
|
|
read in document manager internal and external ids map
|
|
|
Convert a docID to its spelling.
Implements Index. |
|
|
Convert a spelling to docID.
Implements Index. |
|
||||||||||||
|
handle end of document token.
|
|
|
signify the end of this collection.
Implements PushIndex. |
|
||||||||||||
|
signify the end of current document
|
|
|
signify the end of current document
Implements PushIndex. |
|
|
retrieve a document record.
|
|
|
read in the full table of contents
|
|
|
retrieve and construct the DocInfoList for a term.
|
|
|
final run write out of cache
|
|
|
out-of-tree cache management: combine segments into a single segment
|
|
|
Open previously created Index with given prefix.
Implements Index. |
|
|
open the database files
|
|
|
open the segment files
|
|
|
set the document manager to use for succeeding documents
Implements PushIndex. |
|
|
set the message stream
|
|
|
sets the name for this index
|
|
|
Convert a termID to its spelling.
Implements Index. |
|
|
Convert a term spelling to a termID.
Implements Index. |
|
|
Total counts of all terms in collection.
Implements Index. |
|
|
Total counts of a term in collection.
Implements Index. |
|
|
Total count of unique terms in collection.
Implements Index. |
|
|
word entries in a document index (bag of words).
Implements Index. |
|
|
word entries in a document index (sequence of words).
Reimplemented from Index. |
|
|
Total counts of terms in a document including stopwords for sure.
|
|
|
try to open an existing index
|
|
|
write out the cache
|
|
|
write out segments
|
|
|
write out document manager ids
|
|
|
write out the table of contents file.
|
|
|
cache of term entries
|
|
|
invertlists point where we should next check on the cache size
|
|
|
highest term id flushed to disk.
|
|
|
memory for use by inverted list buffers
|
|
|
upper bound for memory use
|
|
|
out-of-tree segments for data
|
|
|
the average document length in this index
|
|
|
array to hold all the overall count stats of this db
|
|
|
the current docmanager to use
|
|
|
documentName -> documentID
|
|
|
buffers for document() lookup functions
|
|
|
list of document manager objects
|
|
|
list of document managers
|
|
|
documentID -> documentName
|
|
|
document statistics (document length, etc.)
|
|
|
read buffer for dtlookup
|
|
|
are we in a bad document state?
|
|
|
array of pointers to doclists
|
|
|
termID -> TermData (term statistics and inverted list segment offsets)
|
|
|
how long all the lists are
|
|
|
Lemur code messages stream.
|
|
|
the prefix name
|
|
|
array to hold all the names for files we need for this db
|
|
|
buffers for term() lookup functions
|
|
|
list of terms and their locations in this document
|
|
|
termName -> termID
|
|
|
termID -> termName
|
|
|
filestream for writing the list of located terms; mutable for index access mode of Index API (not PushIndex). |
1.2.18