Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

TermData.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // TermData.hpp
00015 //
00016 // 4 February 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_TERMDATA_HPP
00020 #define INDRI_TERMDATA_HPP
00021 
#include "indri/TermFieldStatistics.hpp"
#include <indri/greedy_vector>
#include "indri/DocListMemoryBuilder.hpp"
#include "File.hpp"
#include "lemur-compat.hpp"
#include "indri/RVLCompressStream.hpp"
#include "indri/RVLDecompressStream.hpp"
#include <cstdlib>   // malloc/free used by termdata_create/termdata_delete
#include <cstring>   // memset used in the TermData constructor
#include <new>       // placement operator new used by termdata_create
00029 
00030 #ifdef WIN32
00031 // remove warning about zero-sized arrays
00032 #pragma warning ( disable: 4200 )
00033 #endif 
00034 
00035 #define INDRI_MAX_SEGMENTS (8)
00036 
namespace indri {
  namespace index {
    // Per-term bookkeeping record used while building an index.
    // NOTE: instances are followed in memory by a variable number of
    // TermFieldStatistics entries (see fields[] below), so they must be
    // created/destroyed with termdata_create()/termdata_delete(), never
    // with plain new/delete.
    struct TermData {
    private:
      // these are private, bogus functions so that this object can never be copied
      // we don't want to be able to copy it, because any real copy operator needs to
      // take into account the data in the fields[] array, and we don't know how long it is.
      TermData( const TermData& other ) {}
      const TermData& operator= ( const TermData& other ) { return *this; }

    public:
      TermData() :
          maxDocumentFrequency(0),
          maxDocumentFraction(0),
          minDocumentLength(MAX_INT32)
      {
        term = 0;
        
        // Fill every segment offset with an all-ones byte pattern to mark it
        // "unused".  termdata_compress() detects unused slots by comparing
        // against MAX_INT64, so this assumes MAX_INT64's representation is
        // all-ones bytes — TODO confirm against lemur-compat.hpp.
        memset( segmentOffsets, 0xFF, sizeof segmentOffsets );
      }

      // file offset of this term's data within each index segment
      File::offset_type segmentOffsets[INDRI_MAX_SEGMENTS];
      TermFieldStatistics corpus;  // whole-corpus occurrence statistics for this term
      DocListMemoryBuilder list;   // in-memory inverted list under construction

      float maxDocumentFraction;         // argmax_documents of (termCount/docLength)
      unsigned int maxDocumentFrequency; // maximum number of times this term appears in any given document
      unsigned int minDocumentLength;    // minimum length of any document that contains this term

      const char* term;                  // name of this term

      // zero-length trailing array (MSVC warning C4200 is disabled above);
      // termdata_create() allocates the real entries just past this struct
      TermFieldStatistics fields[0];
    };
  }
}
00072 
00073 inline indri::index::TermData* termdata_create( int fieldCount ) {
00074   // allocate enough room for the term data, plus enough room for fields
00075   void* buffer = malloc( sizeof(indri::index::TermData) + sizeof(indri::index::TermFieldStatistics)*fieldCount );
00076   
00077   // call the constructor in place
00078   new(buffer) indri::index::TermData();
00079 
00080   // call field data constructors in place
00081   for( int i=0; i<fieldCount; i++ ) {
00082     new((char*)buffer +
00083         sizeof(indri::index::TermData) +
00084         sizeof(indri::index::TermFieldStatistics)*i) indri::index::TermFieldStatistics();
00085   }
00086 
00087   return (indri::index::TermData*) buffer;
00088 }
00089 
00090 inline void termdata_delete( indri::index::TermData* termData, int fieldCount ) {
00091   if( termData ) {
00092     termData->~TermData();
00093 
00094     for( int i=0; i<fieldCount; i++ ) {
00095       termData->fields[i].~TermFieldStatistics();
00096     }
00097 
00098     free(termData);
00099   }
00100 }
00101 
00102 inline int termdata_size( int fieldCount ) {
00103   return sizeof(indri::index::TermData) + fieldCount * sizeof(indri::index::TermFieldStatistics);
00104 }
00105 
00106 inline int termdata_compress( char* buffer, int size, int fieldCount, indri::index::TermData* termData ) {
00107   RVLCompressStream stream( buffer, size );
00108 
00109   // corpus statistics
00110   stream << termData->corpus.totalCount
00111          << termData->corpus.documentCount;
00112 
00113   // max-score statistics
00114   stream << termData->maxDocumentFrequency
00115          << termData->minDocumentLength
00116          << termData->maxDocumentFraction;
00117 
00118   // segment information
00119   
00120   int numSegments = 0;
00121 
00122   // count up the number of segments used here
00123   for( size_t i=0; i<INDRI_MAX_SEGMENTS; i++ ) {
00124     if( termData->segmentOffsets[i] != MAX_INT64 )
00125       numSegments++;
00126   }
00127 
00128   stream << numSegments;
00129 
00130   // stream out only the segment offsets that are used
00131   for( unsigned int i=0; i<INDRI_MAX_SEGMENTS; i++ ) {
00132     if( termData->segmentOffsets[i] != MAX_INT64 ) {
00133       stream << i
00134             << termData->segmentOffsets[i];
00135     }
00136   }
00137 
00138   // field statistics
00139   for( int i=0; i<fieldCount; i++ ) {
00140     stream << termData->fields[i].totalCount
00141           << termData->fields[i].documentCount;
00142   }
00143 
00144   return stream.dataSize();
00145 }
00146 
00147 inline void termdata_decompress( const char* buffer, int size, int fieldCount, indri::index::TermData* termData ) {
00148   RVLDecompressStream stream( buffer, size );
00149   
00150   // corpus statistics
00151   stream >> termData->corpus.totalCount
00152         >> termData->corpus.documentCount;
00153 
00154   // max-score statistics
00155   stream >> termData->maxDocumentFrequency
00156         >> termData->minDocumentLength
00157         >> termData->maxDocumentFraction;
00158 
00159   // segment information
00160   int numSegments = 0;
00161   stream >> numSegments;
00162 
00163   for( int i=0; i<numSegments; i++ ) {
00164     int segment;
00165     File::offset_type offset;
00166 
00167     stream >> segment
00168           >> offset;
00169 
00170     termData->segmentOffsets[segment] = offset;
00171   }
00172 
00173   // field statistics
00174   for( int i=0; i<fieldCount; i++ ) {
00175     stream >> termData->fields[i].totalCount
00176           >> termData->fields[i].documentCount;
00177   }
00178 }
00179 
00180 #endif // INDRI_TERMDATA_HPP

Generated on Wed Nov 3 12:59:05 2004 for Lemur Toolkit by doxygen 1.2.18