%option noyywrap
%option never-interactive
%option prefix="web"
%{

/*==========================================================================
 * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
 *
 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
 * is subject to the terms of the software license set forth in the LICENSE
 * file included with this software, and also available at
 * http://www.cs.cmu.edu/~lemur/license.html
 *
 *==========================================================================
 */

/*
 * Scanner for SGML-wrapped web documents.
 *
 * The rules below only classify input into the token codes defined here and
 * keep `webloc` equal to the number of characters consumed so far.  All
 * decisions about what is indexed are made by WebParser::doParse() in the
 * user-code section at the bottom of this file.
 *
 * Rule order matters: when two patterns match the same length, flex picks
 * the one listed first, which is why the specific tag rules precede the
 * generic "zap tags" rule and UPWORD precedes WORD.
 *
 * NOTE(review): this copy of the file had every markup-like literal removed
 * (tag strings in the patterns, the whole comment-open rule, and the names
 * of the six system headers).  They have been restored from the token
 * names, from the `webtext + 7` offset used for F_DOCNO, and from the
 * surviving fragments of the patterns.  The `[^-]` in the "other tags" rule
 * and the exact header list are a best reconstruction -- confirm against
 * the upstream Lemur sources.
 */

#include "WebParser.hpp"

#include <cstring>
#include <cctype>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <fstream>

/* Token codes returned by weblex().  0 is reserved for end of input. */
#define B_DOC 1
#define E_DOC 2
#define B_DOCHDR 12
#define E_DOCHDR 13
#define F_DOCNO 3
#define B_DOCOLDNO 4
#define E_DOCOLDNO 5
#define B_SCRIPT 6
#define E_SCRIPT 7
#define B_COMMENT 8
#define E_COMMENT 9
#define WORD 11
#define UPWORD 18
#define CONTRACTION 14
#define ACRONYM 15
#define ACRONYM2 17
#define UNKNOWN 16
#define B_STYLE 19
#define E_STYLE 20

extern FILE * webin;
extern char * webtext;

/* Number of input characters consumed by the scanner so far. */
long webloc;

%}
%%
"<DOC>"                               {webloc += webleng; return B_DOC; }
"</DOC>"                              {webloc += webleng; return E_DOC; }
"<DOCHDR>"                            {webloc += webleng; return B_DOCHDR; }
"</DOCHDR>"                           {webloc += webleng; return E_DOCHDR; }
"<DOCNO>"[^<]*"</DOCNO>"              {webloc += webleng; return F_DOCNO; }
"<DOCOLDNO>"                          {webloc += webleng; return B_DOCOLDNO; }
"</DOCOLDNO>"                         {webloc += webleng; return E_DOCOLDNO; }
"<"[sS][tT][yY][lL][eE][^>]*">"       {webloc += webleng; return B_STYLE; }
"</"[sS][tT][yY][lL][eE]">"           {webloc += webleng; return E_STYLE; }
"<"[sS][cC][rR][iI][pP][tT][^>]*">"   {webloc += webleng; return B_SCRIPT; }
"</"[sS][cC][rR][iI][pP][tT]">"       {webloc += webleng; return E_SCRIPT; }
"<!--"                                {webloc += webleng; return B_COMMENT; }
"-->"                                 {webloc += webleng; return E_COMMENT; }
"<"[/]?[a-zA-Z][^>]*">"               {webloc += webleng; /* zap tags */ }
"<!"[^-][^>]*">"                      {webloc += webleng; /* zap other tags*/}
[&][a-zA-Z]+[;]                       {webloc += webleng; /* zap symbols */ }
[&][#][0-9]*[;]                       {webloc += webleng; /* zap symbols */ }
[A-Z][A-Z]+                           {webloc += webleng; return UPWORD; }
[a-zA-Z0-9]+                          {webloc += webleng; return WORD; }
[A-Z][A-Z]+((\')?[s])?                {webloc += webleng; return ACRONYM2; }
[a-zA-Z0-9]+\'[a-zA-Z]+               {webloc += webleng; return CONTRACTION;}
[A-Z]\.([A-Z]\.)+                     {webloc += webleng; return ACRONYM; }
[\n]                                  {webloc += webleng; /* zap newline */ }
.                                     {webloc += webleng; return UNKNOWN; }
%%

/* Parser states (values of WebParser::state). */
#define OUTER 0
#define DOC 1
#define DOCOLDNO 7
#define DOCHDR 3
#define SCRIPT 4
#define COMMENT 5
#define TAG 6
#define SYMBOL 8
#define STYLE 9

// Lowercases a NUL-terminated string in place.
// The cast to unsigned char keeps tolower() well defined for any byte value.
static void lowercaseInPlace(char * text) {
  for (char * p = text; *p != '\0'; p++) {
    *p = static_cast<char>(tolower(static_cast<unsigned char>(*p)));
  }
}

// Creates a parser that starts outside of any document.
WebParser::WebParser() {
  state = OUTER;
}

// Returns the number of characters the scanner has consumed so far.
long
WebParser::fileTell() {
  return webloc;
}

// Parses the named file, resetting the position counter to zero first.
// If the file cannot be opened, a message is written to stderr and nothing
// is parsed (previously the scanner was run on a NULL stream and
// fclose(NULL) was called).
void
WebParser::parseFile(char * filename) {
  webloc = 0;
  webin = fopen(filename, "r");
  if (webin == NULL) {
    fprintf(stderr, "WebParser: unable to open %s\n", filename);
    return;
  }
  doParse();
  fclose(webin);
}

// Parses `len` bytes starting at `buf`.
// yy_scan_bytes() copies the bytes into a freshly allocated scanner buffer
// and makes it current; that buffer is released once parsing is done and the
// previously active buffer (if any) is reinstated, so the memory is not
// leaked and a later parseFile() is not left looking at an exhausted buffer.
// Note that, unlike parseFile(), this does not reset the position counter.
void
WebParser::parseBuffer (char* buf, int len) {
  YY_BUFFER_STATE previous = YY_CURRENT_BUFFER;
  YY_BUFFER_STATE scratch = yy_scan_bytes(buf, len);
  doParse();
  if (previous != NULL) {
    yy_switch_to_buffer(previous);
  }
  yy_delete_buffer(scratch);
}

// The core loop of the parser.
// The parser is state based.  Encountering a tag can transition the machine
// into another state.  Words are only passed to the text handler while in
// the DOC state; tokens seen in any other state are dropped.
void
WebParser::doParse() {
  int tok;

  // weblex() returns 0 at end of input.
  while ((tok = weblex()) != 0) {
    switch (tok) {

    case E_DOC:
      state = OUTER;
      break;

    case B_DOC:
      // webloc has already been advanced past the tag, so step back by the
      // token length to get the offset at which the document starts.
      docpos = webloc - webleng;
      state = DOC;
      break;

    case F_DOCNO:
      {
        // Extract the document number and pass it on.
        // The token is "<DOCNO>" text "</DOCNO>"; skip the opening tag.
        char * dn = webtext + (sizeof("<DOCNO>") - 1);
        while (isspace(static_cast<unsigned char>(*dn))) dn++;
        // The closing tag guarantees a '<' before the end of the token.
        char * de = dn;
        while (!isspace(static_cast<unsigned char>(*de)) && *de != '<') de++;
        *de = '\0';
        if (textHandler != NULL) textHandler->foundDoc(dn);
        state = DOC;
        break;
      }

    case B_DOCHDR:
      // DOCHDRs are ignored
      state = DOCHDR;
      break;

    case E_DOCHDR:
      state = DOC;
      break;

    case B_DOCOLDNO:
      // DOCOLDNOs are ignored
      state = DOCOLDNO;
      break;

    case E_DOCOLDNO:
      state = DOC;
      break;

    case B_COMMENT:
      // Comments are ignored
      // Can only transition to the COMMENT state if in the DOC state,
      // handling comments within script tags appropriately.
      if (state == DOC)
        state = COMMENT;
      break;

    case E_COMMENT:
      if (state == COMMENT)
        state = DOC;
      break;

    case B_SCRIPT:
      // Script fields are ignored
      // Can only transition to a SCRIPT state if in the DOC state,
      // handling script tags within comment tags appropriately.
      if (state == DOC)
        state = SCRIPT;
      break;

    case E_SCRIPT:
      if (state == SCRIPT)
        state = DOC;
      break;

    case B_STYLE:
      // Style fields are ignored
      // Can only transition to a STYLE state if in the DOC state,
      // handling style tags within comment tags appropriately.
      if (state == DOC)
        state = STYLE;
      break;

    case E_STYLE:
      if (state == STYLE)
        state = DOC;
      break;

    case WORD:
      if (state == DOC) {
        // put the word in lowercase and pass it on
        lowercaseInPlace(webtext);
        if (textHandler != NULL) textHandler->foundWord(webtext);
      }
      break;

    case CONTRACTION:
      if (state == DOC) {
        // strip the suffix, put the word in lowercase, pass it on
        // (the CONTRACTION pattern guarantees an apostrophe is present)
        char * c;
        for (c = webtext; *c != '\''; c++)
          *c = static_cast<char>(tolower(static_cast<unsigned char>(*c)));
        *c = '\0';
        if (textHandler != NULL) textHandler->foundWord(webtext);
      }
      break;

    case UPWORD:
      if (state == DOC) {
        if (! isAcronym(webtext)) {
          // put in lowercase if the word is not in the acronym list
          lowercaseInPlace(webtext);
        }
        if (textHandler != NULL) textHandler->foundWord(webtext);
      }
      break;

    case ACRONYM:
      if (state == DOC) {
        // strip periods, compacting the remaining letters in place
        char * e = webtext;
        for (char * c = webtext; *c != '\0'; c++) {
          if (*c != '.') *(e++) = *c;
        }
        *e = '\0';
        if (!isAcronym(webtext)) {
          // put in lowercase if the word is not in the acronym list
          lowercaseInPlace(webtext);
        }
        if (textHandler != NULL) textHandler->foundWord(webtext);
      }
      break;

    case ACRONYM2:
      if (state == DOC) {
        // strip the suffix: the pattern is upper-case letters followed by
        // an optional "s" or "'s", so the first apostrophe or lower-case
        // 's' marks where the suffix begins
        char * c;
        for (c = webtext; *c != '\'' && *c != '\0' && *c != 's'; c++)
          ;
        *c = '\0';
        if (!isAcronym(webtext)) {
          // put in lowercase if the word is not in the acronym list
          lowercaseInPlace(webtext);
        }
        if (textHandler != NULL) textHandler->foundWord(webtext);
      }
      break;

    default:
      // UNKNOWN and any other token carry no indexable text.
      break;
    }
  }
}