00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 #ifndef _PARSER_HPP 00013 #define _PARSER_HPP 00014 00015 #include "TextHandler.hpp" 00016 #include "WordSet.hpp" 00017 00025 00026 class Parser : public TextHandler { 00027 public: 00028 static const string category; 00029 static const string identifier; 00030 00031 Parser(); 00032 virtual ~Parser(); 00033 00036 virtual void parse(const string &filename); 00037 00040 virtual void parseFile(const string &filename) = 0; 00041 00043 virtual void parseBuffer(char * buf, int len) = 0; 00044 00048 virtual void setAcroList(const WordSet * acronyms); 00049 00051 virtual void setAcroList(string filename); 00052 00054 virtual long fileTell() const = 0; 00055 00057 virtual long getDocBytePos() const { return docpos; } 00058 00060 virtual const string getParseFile() const { return parsefile; } 00061 00062 protected: 00065 bool isAcronym(const char * word); 00067 void clearAcros(); 00068 00069 long docpos; 00070 00071 string parsefile; 00072 private: 00074 WordSet * myacros; 00075 const WordSet* borrowedacros; 00076 }; 00077 00078 #endif