00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef INDRI_TERMDATA_HPP
00020 #define INDRI_TERMDATA_HPP
00021
#include <cstdlib>   // malloc / free (termdata_create, termdata_delete)
#include <cstring>   // memset (TermData constructor)
#include <new>       // placement new (termdata_create)

#include "indri/TermFieldStatistics.hpp"
#include <indri/greedy_vector>
#include "indri/DocListMemoryBuilder.hpp"
#include "File.hpp"
#include "lemur-compat.hpp"
#include "indri/RVLCompressStream.hpp"
#include "indri/RVLDecompressStream.hpp"
00029
00030 #ifdef WIN32
00031
00032 #pragma warning ( disable: 4200 )
00033 #endif
00034
00035 #define INDRI_MAX_SEGMENTS (8)
00036
00037 namespace indri {
00038 namespace index {
// Per-term statistics and posting-list state for the in-memory index.
// NOTE: this struct is always allocated with trailing storage for the
// fields[] flexible array member -- create/destroy it ONLY through
// termdata_create()/termdata_delete() below, never with new/delete.
struct TermData {
private:

  // Noncopyable (pre-C++11 idiom): a shallow copy would alias the
  // externally-owned <term> string and would not carry the trailing
  // fields[] storage, so copying is disabled by making these private.
  TermData( const TermData& other ) {}
  const TermData& operator= ( const TermData& other ) { return *this; }

public:
  TermData() :
    maxDocumentFrequency(0),
    maxDocumentFraction(0),
    minDocumentLength(MAX_INT32)
  {
    term = 0;

    // Fill every segment offset with 0xFF bytes to mark all slots "unused".
    // NOTE(review): termdata_compress tests slots against MAX_INT64; the
    // 0xFF bit pattern equals MAX_INT64 only if File::offset_type /
    // MAX_INT64 are unsigned in lemur-compat.hpp -- confirm.
    memset( segmentOffsets, 0xFF, sizeof segmentOffsets );
  }

  // File offset of this term's inverted list in each on-disk segment;
  // unused slots hold the all-0xFF sentinel set by the constructor.
  File::offset_type segmentOffsets[INDRI_MAX_SEGMENTS];
  TermFieldStatistics corpus;     // whole-corpus counts for this term
  DocListMemoryBuilder list;      // in-memory posting list under construction

  float maxDocumentFraction;      // max (term count / document length) seen
  unsigned int maxDocumentFrequency; // max within-document term count seen
  unsigned int minDocumentLength;    // shortest document containing the term

  const char* term;               // term string; not owned by this struct

  // Flexible array member: one TermFieldStatistics per indexed field,
  // placement-constructed in the same allocation by termdata_create().
  TermFieldStatistics fields[0];
};
00070 }
00071 }
00072
00073 inline indri::index::TermData* termdata_create( int fieldCount ) {
00074
00075 void* buffer = malloc( sizeof(indri::index::TermData) + sizeof(indri::index::TermFieldStatistics)*fieldCount );
00076
00077
00078 new(buffer) indri::index::TermData();
00079
00080
00081 for( int i=0; i<fieldCount; i++ ) {
00082 new((char*)buffer +
00083 sizeof(indri::index::TermData) +
00084 sizeof(indri::index::TermFieldStatistics)*i) indri::index::TermFieldStatistics();
00085 }
00086
00087 return (indri::index::TermData*) buffer;
00088 }
00089
00090 inline void termdata_delete( indri::index::TermData* termData, int fieldCount ) {
00091 if( termData ) {
00092 termData->~TermData();
00093
00094 for( int i=0; i<fieldCount; i++ ) {
00095 termData->fields[i].~TermFieldStatistics();
00096 }
00097
00098 free(termData);
00099 }
00100 }
00101
00102 inline int termdata_size( int fieldCount ) {
00103 return sizeof(indri::index::TermData) + fieldCount * sizeof(indri::index::TermFieldStatistics);
00104 }
00105
00106 inline int termdata_compress( char* buffer, int size, int fieldCount, indri::index::TermData* termData ) {
00107 RVLCompressStream stream( buffer, size );
00108
00109
00110 stream << termData->corpus.totalCount
00111 << termData->corpus.documentCount;
00112
00113
00114 stream << termData->maxDocumentFrequency
00115 << termData->minDocumentLength
00116 << termData->maxDocumentFraction;
00117
00118
00119
00120 int numSegments = 0;
00121
00122
00123 for( size_t i=0; i<INDRI_MAX_SEGMENTS; i++ ) {
00124 if( termData->segmentOffsets[i] != MAX_INT64 )
00125 numSegments++;
00126 }
00127
00128 stream << numSegments;
00129
00130
00131 for( unsigned int i=0; i<INDRI_MAX_SEGMENTS; i++ ) {
00132 if( termData->segmentOffsets[i] != MAX_INT64 ) {
00133 stream << i
00134 << termData->segmentOffsets[i];
00135 }
00136 }
00137
00138
00139 for( int i=0; i<fieldCount; i++ ) {
00140 stream << termData->fields[i].totalCount
00141 << termData->fields[i].documentCount;
00142 }
00143
00144 return stream.dataSize();
00145 }
00146
00147 inline void termdata_decompress( const char* buffer, int size, int fieldCount, indri::index::TermData* termData ) {
00148 RVLDecompressStream stream( buffer, size );
00149
00150
00151 stream >> termData->corpus.totalCount
00152 >> termData->corpus.documentCount;
00153
00154
00155 stream >> termData->maxDocumentFrequency
00156 >> termData->minDocumentLength
00157 >> termData->maxDocumentFraction;
00158
00159
00160 int numSegments = 0;
00161 stream >> numSegments;
00162
00163 for( int i=0; i<numSegments; i++ ) {
00164 int segment;
00165 File::offset_type offset;
00166
00167 stream >> segment
00168 >> offset;
00169
00170 termData->segmentOffsets[segment] = offset;
00171 }
00172
00173
00174 for( int i=0; i<fieldCount; i++ ) {
00175 stream >> termData->fields[i].totalCount
00176 >> termData->fields[i].documentCount;
00177 }
00178 }
00179
00180 #endif // INDRI_TERMDATA_HPP