00001 /*========================================================================== 00002 * Copyright (c) 2000-2004 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software (and below), and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 #ifndef _INDRITEXTHANDLER_HPP 00013 #define _INDRITEXTHANDLER_HPP 00014 00015 #include "Parser.hpp" 00016 #include "indri/ParsedDocument.hpp" 00017 #include "indri/IndexEnvironment.hpp" 00018 00024 #define DOCIDKEY "docno" 00025 00026 class IndriTextHandler : public TextHandler { 00027 00028 public: 00031 IndriTextHandler(const string &name, int memory, const Parser* p); 00032 ~IndriTextHandler(); 00033 00035 char * handleDoc(char * docno); 00037 void handleEndDoc(); 00039 char * handleWord(char * word, const char* original, PropertyList* list); 00040 char * handleBeginTag(char* tag, const char* orig, PropertyList* props); 00041 char * handleEndTag(char* tag, const char* orig, PropertyList* props); 00042 00043 protected: 00045 IndexEnvironment env; 00047 ParsedDocument document; 00049 MetadataPair docid; 00051 char* curdocno; 00053 char* docsource; 00055 int bufsize; 00057 int docbegin; 00059 const Parser* parser; 00061 }; 00062 00063 #endif 00064