00001 /*========================================================================== 00002 * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // HTMLParser 00015 // 00016 // March 2004 -- metzler 00017 // 00018 00019 #include "indri/TaggedTextParser.hpp" 00020 00021 #ifndef MAX_URL_LENGTH 00022 #define MAX_URL_LENGTH 4096 00023 #endif 00024 00025 class HTMLParser : public TaggedTextParser { 00026 public: 00027 HTMLParser() { 00028 } 00029 00030 ~HTMLParser() { } 00031 00032 protected: 00033 virtual void initialize( UnparsedDocument* unparsed, ParsedDocument* parsed ); 00034 virtual void cleanup( UnparsedDocument* unparsed, ParsedDocument* parsed ); 00035 virtual void handleToken(char *token, int type, long pos); 00036 char url[MAX_URL_LENGTH]; 00037 char base_url[MAX_URL_LENGTH]; 00038 00039 bool normalizeURL(char *s); 00040 00041 private: 00042 bool extractURL(char *token); 00043 void parseURL(char *url); 00044 }; 00045 00046