00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.cs.cmu.edu/~lemur/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 #ifndef _PARSER_HPP 00013 #define _PARSER_HPP 00014 00015 #include "TextHandler.hpp" 00016 #include "WordSet.hpp" 00017 00025 00026 class Parser : public TextHandler { 00027 public: 00028 static const string category; 00029 static const string identifier; 00030 00031 Parser(); 00032 virtual ~Parser(); 00033 00036 virtual void parse(const string &filename) { parseFile(filename); } ; 00037 00039 virtual void parseFile(const string &filename) = 0; 00040 00042 virtual void parseBuffer(char * buf, int len) = 0; 00043 00047 virtual void setAcroList(const WordSet * acronyms); 00048 00050 virtual void setAcroList(string filename); 00051 00053 virtual long fileTell() = 0; 00054 00056 virtual long getDocBytePos() { return docpos; } 00057 00058 protected: 00061 bool isAcronym(const char * word); 00063 void clearAcros(); 00064 00065 long docpos; 00066 00067 private: 00069 WordSet * myacros; 00070 const WordSet* borrowedacros; 00071 }; 00072 00073 #endif