Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

HTMLParser.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // HTMLParser
00015 //
00016 // March 2004 -- metzler
00017 //
00018 
00019 #include "indri/TaggedTextParser.hpp"
00020 
00021 #ifndef MAX_URL_LENGTH  // only define the default if nothing earlier in the translation unit already defined it
00022 #define MAX_URL_LENGTH 4096  // element count of HTMLParser's url and base_url char arrays
00023 #endif
00024 
00025 class HTMLParser : public TaggedTextParser {  // HTML parser built on TaggedTextParser (presumably declared in indri/TaggedTextParser.hpp -- confirm)
00026 public:
00027   HTMLParser() {  // empty body: url and base_url are not initialized by this constructor
00028   }
00029   
00030   ~HTMLParser() { }  // nothing to release for the members declared here; NOTE(review): not marked virtual in this class -- it is virtual only if the base destructor is, confirm
00031 
00032 protected:
00033   virtual void initialize( UnparsedDocument* unparsed, ParsedDocument* parsed );  // presumably a per-document setup hook inherited from the base class -- confirm against TaggedTextParser
00034   virtual void cleanup( UnparsedDocument* unparsed, ParsedDocument* parsed );  // presumably the matching per-document teardown hook -- confirm against TaggedTextParser
00035   virtual void handleToken(char *token, int type, long pos);  // receives a mutable token buffer plus an int type and a long position; their exact meaning is defined by the implementation, not visible here
00036   char url[MAX_URL_LENGTH];  // fixed-size buffer; presumably holds the URL of the document being parsed -- confirm
00037   char base_url[MAX_URL_LENGTH];  // fixed-size buffer; presumably the base URL used to resolve relative links -- confirm
00038   
00039   bool normalizeURL(char *s);  // takes a mutable C string; what the bool result signals is not visible in this header -- see the implementation
00040 
00041 private:
00042   bool extractURL(char *token);  // private helper; takes a mutable token buffer; meaning of the bool result is not visible here
00043   void parseURL(char *url);  // private helper; NOTE(review): the parameter shares its name with the member url, so a definition using this name would hide the member
00044 };  // NOTE(review): no include guard is visible in this header; including it twice in one translation unit would redefine HTMLParser
00045 
00046 

Generated on Wed Nov 3 12:58:57 2004 for Lemur Toolkit by doxygen 1.2.18