00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 #ifndef INDRI_TAGGEDTEXTPARSER_HPP
00020 #define INDRI_TAGGEDTEXTPARSER_HPP
00021 
00022 #include <stdio.h>
00023 #include <ctype.h>
00024 #include <string.h>
00025 #include <string>
00026 #include <vector>
00027 #include <map>
00028 #include "indri/HashTable.hpp"
00029 #include "indri/TagList.hpp"
00030 #include "indri/IndriParser.hpp"
00031 #include "indri/Buffer.hpp"
00032 #include "string-set.h"
00033 
// Presumably the maximum length of a document-number string; it is not
// referenced anywhere else in this header -- confirm against the .cpp.
#define MAX_DOCNO_LENGTH 128
// Element count of the start_tag and end_tag char arrays declared in
// TaggedTextParser.
#define PARSER_MAX_BUF_SIZE 1024
00036 
/**
 * Hash functor for std::string keys.
 *
 * The hash value is the sum of the key's bytes, each read as an unsigned
 * char. The empty string therefore hashes to 0, and any two strings made
 * of the same bytes in a different order hash to the same value.
 */
class StringHash {
public:
  /**
   * @param key string to hash; taken by const reference so that hashing
   *            does not copy the key on every call.
   * @return the sum of the bytes of key, converted to int.
   */
  int operator() (const std::string& key) const {
    // Accumulate in an unsigned type: a very long key then wraps around
    // instead of triggering signed-overflow undefined behaviour.
    unsigned int hash = 0;
    for(std::string::size_type i = 0; i < key.length(); i++)
      hash += static_cast<unsigned char>(key[i]);
    return static_cast<int>(hash);
  }
};
00046 
/**
 * Three-way comparison functor for std::string keys.
 *
 * Forwards to std::string::compare, so the result is negative, zero or
 * positive when the first string orders before, equal to, or after the
 * second one.
 */
class StringComparator {
public:
  /**
   * @param one left-hand operand.
   * @param two right-hand operand.
   * @return one.compare(two).
   *
   * Both operands are taken by const reference so that a comparison does
   * not copy either string.
   */
  int operator() (const std::string& one, const std::string& two) const {
    return one.compare(two);
  }
};
00053 
00054 class TaggedTextParser : public indri::Parser {
00055 public:
00056   TaggedTextParser();
00057   ~TaggedTextParser();
00058   
00059   void setTags( const std::vector<std::string>& include,
00060                 const std::vector<std::string>& exclude,
00061                 const std::vector<std::string>& index,
00062                 const std::vector<std::string>& metadata, 
00063                 const std::map<std::string,std::string>& conflations );
00064 
00065   ParsedDocument* parse( UnparsedDocument* document );
00066 
00067   void handle( UnparsedDocument* document );
00068   void setHandler( ObjectHandler<ParsedDocument>& h );
00069 
00070 protected:
00071   typedef HashTable<std::string, std::string, StringHash, StringComparator> StrHashTable;
00072 
00073   virtual void handleToken(char *token, int type, long pos);
00074   virtual void initialize( UnparsedDocument* unparsed, ParsedDocument* parsed );
00075   virtual void cleanup( UnparsedDocument* unparsed, ParsedDocument* parsed );
00076 
00077   void addTag(const char *s, const char* c, int pos) { tl->addTag(s, c, pos); }
00078   void endTag(const char *s, const char* c, int pos) { tl->endTag(s, c, pos); }
00079 
00080   void addMetadataTag(const char* s, const char* c, int pos) { _metaList->addTag(s, c, pos); }
00081   void endMetadataTag(const char* s, const char* c, int pos) { _metaList->endTag(s, c, pos); }
00082 
00083   
00084   TagList* tl;
00085   TagList* _metaList;
00086   Buffer _termBuffer;
00087 
00088   void writeToken(char *token);
00089 
00090   struct tag_properties {
00091     const char* name;
00092     const char* conflation;
00093     bool index;
00094     bool exclude;
00095     bool include;
00096     bool metadata;
00097   };
00098   tag_properties* _findTag(const char* name);
00099   tag_properties* _buildTag( const std::string& name, const std::map<std::string,std::string>& conflations );
00100   HashTable<const char*, tag_properties*> _tagTable;
00101   void handleTag(char* token, long pos);
00102 
00103   const tag_properties* _startExcludeRegion;
00104   const tag_properties* _startIncludeRegion;
00105   
00106   bool _exclude;
00107   bool _include;
00108   bool _defaultInclude;
00109   
00110 private:
00111   ObjectHandler<ParsedDocument>* _handler;
00112   ParsedDocument _document;
00113 
00114   void doParse();
00115   void writeToken(char *token, int start, int end);
00116   char start_tag[PARSER_MAX_BUF_SIZE];
00117   char end_tag[PARSER_MAX_BUF_SIZE];
00118 };
00119 
/**
 * Integer codes for token categories, numbered 1 through 7.
 *
 * NOTE(review): presumably these are the values passed as the `type`
 * argument of TaggedTextParser::handleToken -- confirm against the
 * tokenizer that produces them.
 */
namespace TaggedTextTokenType {
  const int tag = 1;
  const int upword = 2;
  const int word = 3;
  const int contraction = 4;
  const int acronym = 5;
  const int acronym2 = 6;
  const int unknown = 7;
}  // namespace TaggedTextTokenType (no trailing ';': it would be an empty declaration)
00129 
00130 #endif // INDRI_TAGGEDTEXTPARSER_HPP
00131 
00132 
00133 
00134 
00135