00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef INDRI_INDEXENVIRONMENT_HPP
00020 #define INDRI_INDEXENVIRONMENT_HPP
00021
00022 #include <string>
00023 #include "indri/Parameters.hpp"
00024 #include "indri/HTMLParser.hpp"
00025 #include "indri/Repository.hpp"
00026 #include "indri/IndriParser.hpp"
00027 #include "indri/DocumentIterator.hpp"
00028 #include "indri/AnchorTextAnnotator.hpp"
00029 #include "indri/DocumentIteratorFactory.hpp"
00030 #include "indri/ParserFactory.hpp"
00031 #include "indri/FileClassEnvironmentFactory.hpp"
00032 #include <map>
00033
/**
 * Callback interface for receiving status notifications.
 *
 * Clients derive from this struct and override status() (or operator())
 * to observe events.  The default implementation ignores every event.
 * An IndexStatus pointer is accepted by IndexEnvironment::create() and
 * IndexEnvironment::open().
 */
struct IndexStatus {
  /**
   * Event codes passed as the `code` argument of operator() / status().
   * NOTE(review): meanings below are inferred from the enumerator names;
   * confirm against the code that raises them.
   */
  enum action_code {
    FileOpen,      ///< presumably: a file has been opened
    FileSkip,      ///< presumably: a file was skipped
    FileError,     ///< presumably: an error occurred on a file
    FileClose,     ///< presumably: a file has been closed
    DocumentCount  ///< presumably: periodic document-count update
  };

  /**
   * Virtual destructor: this type is a polymorphic base, so derived
   * callbacks must be destroyed correctly when deleted through an
   * IndexStatus pointer.
   */
  virtual ~IndexStatus() {}

  /**
   * Function-call entry point; forwards all arguments unchanged to status().
   *
   * @param code             one of the action_code values (passed as int)
   * @param documentPath     path associated with the event
   * @param error            error text associated with the event
   * @param documentsIndexed count of documents indexed (as reported by the caller)
   * @param documentsSeen    count of documents seen (as reported by the caller)
   */
  virtual void operator () ( int code, const std::string& documentPath, const std::string& error, int documentsIndexed, int documentsSeen ) {
    status( code, documentPath, error, documentsIndexed, documentsSeen );
  }

  /**
   * Event handler hook.  The default implementation does nothing;
   * override it to react to events.  Parameters are the same as for
   * operator().
   */
  virtual void status( int code, const std::string& documentPath, const std::string& error, int documentsIndexed, int documentsSeen ) {}
};
00049
00055 class IndexEnvironment {
00056 private:
00057 IndexStatus* _callback;
00058 Parameters* _options;
00059
00060 std::string _repositoryPath;
00061 Repository _repository;
00062 int _documents;
00063 std::string _error;
00064
00065 std::string _anchorTextRoot;
00066 std::string _documentRoot;
00067
00068 Parameters _parameters;
00069 FileClassEnvironmentFactory _fileClassFactory;
00070
00071 AnchorTextAnnotator _annotator;
00072 std::map<std::string, FileClassEnvironment*> _environments;
00073
00074 int _documentsIndexed;
00075 int _documentsSeen;
00076
00077 void _getParsingContext( indri::Parser** parser,
00078 DocumentIterator** iterator,
00079 const std::string& extension );
00080
00081 public:
00082 IndexEnvironment();
00083 ~IndexEnvironment();
00087 void setAnchorTextPath( const std::string& documentRoot, const std::string& anchorTextRoot );
00101 void addFileClass( const std::string& name,
00102 const std::string& iterator,
00103 const std::string& parser,
00104 const std::string& startDocTag,
00105 const std::string& endDogTag,
00106 const std::string& endMetadataTag,
00107 const std::vector<std::string>& include,
00108 const std::vector<std::string>& exclude,
00109 const std::vector<std::string>& index,
00110 const std::vector<std::string>& metadata,
00111 const std::map<std::string,std::string>& conflations );
00114 void setIndexedFields( const std::vector<std::string>& fieldNames );
00115 void setNumericField( const std::string& fieldName, bool isNumeric );
00118 void setMetadataIndexedFields( const std::vector<std::string>& fieldNames );
00121 void setStopwords( const std::vector<std::string>& stopwords );
00124 void setStemmer( const std::string& stemmer );
00127 void setMemory( UINT64 memory );
00131 void create( const std::string& repositoryPath, IndexStatus* callback = 0 );
00135 void open( const std::string& repositoryPath, IndexStatus* callback = 0 );
00137 void close();
00140 void addFile( const std::string& fileName );
00144 void addFile( const std::string& fileName, const std::string& fileClass );
00149 void addString( const std::string& documentString, const std::string& fileClass, const std::vector<MetadataPair>& metadata );
00152 void addParsedDocument( ParsedDocument* document );
00153 };
00154
00155 #endif // INDRI_INDEXENVIRONMENT_HPP
00156