Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

IndexEnvironment.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // IndexEnvironment
00015 //
00016 // 19 July 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_INDEXENVIRONMENT_HPP
00020 #define INDRI_INDEXENVIRONMENT_HPP
00021 
00022 #include <string>
00023 #include "indri/Parameters.hpp"
00024 #include "indri/HTMLParser.hpp"
00025 #include "indri/Repository.hpp"
00026 #include "indri/IndriParser.hpp"
00027 #include "indri/DocumentIterator.hpp"
00028 #include "indri/AnchorTextAnnotator.hpp"
00029 #include "indri/DocumentIteratorFactory.hpp"
00030 #include "indri/ParserFactory.hpp"
00031 #include "indri/FileClassEnvironmentFactory.hpp"
00032 #include <map>
00033 
/// Callback interface for observing indexing progress.
///
/// Derive from this struct and override status() (or operator() itself) to
/// receive notifications.  IndexEnvironment::create() and
/// IndexEnvironment::open() accept a pointer to an IndexStatus.
struct IndexStatus {
  /// Kinds of event reported through the callback.
  /// NOTE(review): meanings are inferred from the enumerator names only;
  /// confirm against the code that raises them.
  enum action_code {
    FileOpen,       // presumably: a source file was opened
    FileSkip,       // presumably: a source file was skipped
    FileError,      // presumably: a source file could not be processed
    FileClose,      // presumably: a source file was closed
    DocumentCount   // presumably: periodic document-count update
  };

  /// Virtual destructor: this is a polymorphic base that is handled through
  /// IndexStatus* pointers, so destroying a derived callback through the base
  /// pointer must run the derived destructor.
  virtual ~IndexStatus() {}

  /// Function-call entry point; the default implementation forwards all of
  /// its arguments unchanged to status().
  ///
  /// @param code              event code (presumably one of action_code)
  /// @param documentPath      path associated with the event
  /// @param error             error text associated with the event
  /// @param documentsIndexed  count of documents indexed, as reported by the caller
  /// @param documentsSeen     count of documents seen, as reported by the caller
  virtual void operator () ( int code, const std::string& documentPath, const std::string& error, int documentsIndexed, int documentsSeen ) {
    status( code, documentPath, error, documentsIndexed, documentsSeen );
  }

  /// Notification hook; the default implementation does nothing.
  /// Parameters are the same as for operator().
  virtual void status( int code, const std::string& documentPath, const std::string& error, int documentsIndexed, int documentsSeen ) {}
};
00049 
00055 class IndexEnvironment {
00056 private:
00057   IndexStatus* _callback;
00058   Parameters* _options;
00059 
00060   std::string _repositoryPath;
00061   Repository _repository;
00062   int _documents;
00063   std::string _error;
00064 
00065   std::string _anchorTextRoot;
00066   std::string _documentRoot;
00067 
00068   Parameters _parameters;
00069   FileClassEnvironmentFactory _fileClassFactory;
00070 
00071   AnchorTextAnnotator _annotator;
00072   std::map<std::string, FileClassEnvironment*> _environments;
00073 
00074   int _documentsIndexed;
00075   int _documentsSeen;
00076 
00077   void _getParsingContext( indri::Parser** parser,
00078                            DocumentIterator** iterator,
00079                            const std::string& extension );
00080 
00081 public:
00082   IndexEnvironment();
00083   ~IndexEnvironment();
00087   void setAnchorTextPath( const std::string& documentRoot, const std::string& anchorTextRoot );
00101   void addFileClass( const std::string& name, 
00102                      const std::string& iterator,
00103                      const std::string& parser,
00104                      const std::string& startDocTag,
00105                      const std::string& endDogTag,
00106                      const std::string& endMetadataTag,
00107                      const std::vector<std::string>& include,
00108                      const std::vector<std::string>& exclude,
00109                      const std::vector<std::string>& index,
00110                      const std::vector<std::string>& metadata, 
00111                      const std::map<std::string,std::string>& conflations );
00114   void setIndexedFields( const std::vector<std::string>& fieldNames );
00115   void setNumericField( const std::string& fieldName, bool isNumeric );
00118   void setMetadataIndexedFields( const std::vector<std::string>& fieldNames );
00121   void setStopwords( const std::vector<std::string>& stopwords );
00124   void setStemmer( const std::string& stemmer );
00127   void setMemory( UINT64 memory );
00131   void create( const std::string& repositoryPath, IndexStatus* callback = 0 );
00135   void open( const std::string& repositoryPath, IndexStatus* callback = 0 );
00137   void close();
00140   void addFile( const std::string& fileName );
00144   void addFile( const std::string& fileName, const std::string& fileClass );
00149   void addString( const std::string& documentString, const std::string& fileClass, const std::vector<MetadataPair>& metadata );
00152   void addParsedDocument( ParsedDocument* document );
00153 };
00154 
00155 #endif // INDRI_INDEXENVIRONMENT_HPP
00156 

Generated on Wed Nov 3 12:58:57 2004 for Lemur Toolkit by doxygen 1.2.18