" environment, then we simply include a // newline in the output. If we are in any other environment, we must guess // the correct number of spaces to put in based upon the punctuation of the // previous line. define method add-eol(text ::". This may be a null action, // or may call "break-up" to dump the accumulated text, or may perform any // other arbitrary action. define constant tag-close-table ::) => (result :: ); if (Pre-Count > 0) add!(text, "\n") else let Prev-Str = last(text, default: ""); if (Prev-Str.empty?) text; else let space = select (Prev-Str.last) '.', ':', '!', '?' => " "; '-', ' ' => ""; otherwise => " "; end select; add!(text, space); end if; end if end method add-eol; // The "break-up" routines produce and print appropriate formatted text from // the accumulated data. The action defaults to the #"text" action, which // breaks the text into lines (at word boundaries)according to the defined // margins. "break-up" then clears the accumulated text before returning // control to the main loop. define constant break-up-table :: = make( ); define method break-up(tag :: , text :: , blank :: , want-blank :: ) => (result :: ); let full-text = if (text.empty?) "" else apply(concatenate, text) end; block () break-up-table[tag](full-text, blank, want-blank); cleanup size(text) := 0; exception ( ) break-up-table[#"TEXT"](full-text, blank, want-blank); end block; end method break-up; // Tag close defines the appropriate action to take at the end of an // environment (i.e. when encountering "
". This may be a null action, // or may call "break-up" to dump the accumulated text, or may perform any // other arbitrary action. define constant tag-start-table ::= make( ); define method tag-start(New-Tag :: , Old-Tag :: , Out-Text :: , Current-Text :: , File :: , blank :: ) => (New-Text :: , blank :: ); let fun = block () tag-start-table[New-Tag]; exception ( ) signal("Unknown tag type: <%=>\n", New-Tag); tag-start-table[#"TEXT"]; end block; fun(New-Tag, Old-Tag, Out-Text, Current-Text, File, Blank); end method tag-start; // This routine is called at "load time" to build the tag action tables. Note // that "reasonable" defaults are defined for all actions so that only the // "specialized" actions for any given environment need be specified. define method add-tag(tags :: , #key add-text: AT = identity, break-up: BU = break-up-table[#"TEXT"], tag-close: TC = tag-close-table[#"TEXT"], tag-start: TS = tag-start-table[#"TEXT"]) for (tag in tags) let Tag-Symbol = as( , tag); add-text-table[Tag-Symbol] := AT; break-up-table[Tag-Symbol] := BU; tag-close-table[Tag-Symbol] := TC; tag-start-table[Tag-Symbol] := TS; end for; end method add-tag; //////////////////////////////////////////////////////////////////////// // Main Driver Routines // //////////////////////////////////////////////////////////////////////// // This is the workhorse routines. It reads in new data, searches for tags, // and dispatches the appropriate "add-text", "tag-start", and "tag-close" // routines. It also attempts to unwind gracefully when it encounters the end // of the file, since many HTML data files fail to terminate all environments. define method process-HTML(Tag :: , Out-Text :: , Current-Text :: , File :: , blank :: ) => (Current-Text :: , blank :: ); local method is-space(ch) ch == ' ' | ch == '\t' end method; local method tag-end(ch) ch == ' ' | ch == '\t' | ch == '>' end method; local method not-space(ch) ch ~= ' ' & ch ~= '\t' end method; block (return) while (#t) // keep crunching until EOF causes us to call "return" let Start-Tag = sfind(Current-Text, curry(\==, '<')); if (Start-Tag) // There is a tag on this line, so we accumulate the text which // precedes it and then invoke the appropriate tag actions. Out-Text := add-text(Tag, Out-Text, subsequence(Current-Text, end: Start-Tag)); // If a newline occurs within a tag, we must keep reading until we get // the rest of the tag. Whitespace is simply used as a separator, so // we substitute a space for the newline. let End-Tag = for (index = sfind(Current-Text, curry(\==, '>'), start: Start-Tag) then sfind(Current-Text, curry(\==, '>'), start: Start-Tag), until: index) Current-Text := concatenate(Current-Text, " ", read-line(File)); finally index; end for; // Find the complete tag and figure out whether it is "opening" or // "closing" an environment. let first = sfind(Current-Text, not-space, start: Start-Tag + 1); let Is-Close = Current-Text[first] = '/'; if (Is-Close) first := sfind(Current-Text, not-space, start: first + 1) end if; let New-Tag = as( , copy-sequence(Current-Text, start: first, end: sfind(Current-Text, tag-end, start: first))); // Call the appropriate action for the tag. This may invoke // a recursive call to "process-HTML" for start tags and will exit // this recusive call for closing tags. Current-Text := copy-sequence(Current-Text, start: End-Tag + 1); if (Is-Close) return(Current-Text, tag-close(Tag, New-Tag, Out-Text, blank)); else let (New-Text, NewBlank) = tag-start(New-Tag, Tag, Out-Text, Current-Text, File, blank); Current-Text := New-Text; blank := NewBlank; end if; else // Process newlines. We ignore indentation in the next line unless we // are inside a " " environment. Out-Text := add-eol(add-text(Tag, Out-Text, Current-Text)); let (New-Text, eof) = read-line(File); let First-Real = if (Pre-Count = 0) sfind(New-Text, not-space, failure: 0); else 0 end if; Current-Text := if (First-Real > 0) copy-sequence(New-Text, start: First-Real); else New-Text; end if; end if; end while; exception () // End of file processing. Dump accumulated text and then exit. let blank = break-up(Tag, Out-Text, blank, #f); values("", blank); end block end method process-HTML; // specialized routines to open various sourts of streams and invoke // "process-HTML". define method html2text(fd :: ) => (); process-HTML(#"TEXT", make( ), "", fd, #t); force-output(*standard-output*); end method html2text; define method html2text(file :: ) => (); let stream = make( , locator: file); html2text(stream); end method html2text; define method html2text(file == #t) => (); html2text(make( , fd: 0)); end method html2text; // Trivial main program -- just invokes "html2text" which in turn invokes // "process-HTML". Note that we had to import the generic function "main" // from module "extensions" in library "dylan". This interface is Mindy // specific. define method main (argv0 :: , #rest args) => (); if (empty?(args)) html2text(#t); else map(html2text, args); end if; end method main; //////////////////////////////////////////////////////////////////////// // Specific Environment Routines // //////////////////////////////////////////////////////////////////////// // The anonymous methods here implement the appropriate tag actions for all of // the tags currently supported. Some are quite straightforward, while others // may require a twisted mind to "properly appreciate" them. This // organization does, at least, allow the processing of most tags to be // isolated so that you needn't grok all the code at once. add-tag(#["TEXT"], // Default environment // Performs a "paragraph break" and recursively processes the new // environment tag-start: method (New-Tag :: , Old-Tag :: , Out-Text :: , Current-Text :: , File :: , blank :: ) => (result :: , blank :: ); let blank = break-up(Old-Tag, Out-Text, blank, #t); process-HTML(New-Tag, Out-Text, Current-Text, File, blank); end method, // Performs a "paragraph break" and returns to the enclosing // environment tag-close: method (tag :: , text :: , blank :: ) => (result :: ); break-up(tag, text, blank, #t); end method, // Breaks "text" into lines according to *margin* and *linelen*. // Parameters blank and want-blank say whether there is a blank line // before the current text and whether there should be one after the // current text. The return value tells whether a blank line was // printed. break-up: method (text :: , blank :: , want-blank :: ) => (result :: ); let first = sfind(text, curry(\~=, ' ')); if (~first) if (want-blank & ~blank) write-string("\n") end if; blank | want-blank else let Text-Size = size(text); let Find-Break = method (first, last) if (last >= Text-Size) Text-Size; else let find = rsfind(text, curry(\=, ' '), start: first, end: last); if (find) rsfind(text, curry(\~=, ' '), start: first, end: find) + 1 else sfind(text, curry(\=, ' '), start: first) | size(text) end if end if end method; while (first) let last = Find-Break(first, first + *linelen* - *margin*); print-with-prefix(text, start: first, end: last); first := sfind(text, curry(\~=, ' '), start: last + 1) end while; if (want-blank) write-string("\n") end if; want-blank end if end method); // This tag action is used for many different tags -- it simply invokes // "process-HTML" recursively without doing anything special to the // accumulated text. This is handy for "lightweight" enviromentents like // "". define constant tag-start-recurse = method (New-Tag :: , Old-Tag :: , Out-Text :: , Current-Text :: , File :: , blank :: ) => (result :: , blank :: ); process-HTML(New-Tag, Out-Text, Current-Text, File, blank); end method; // This tag action is a logical partner for "tag-start-recurse". It simply // exits so that control will return to an inclosing "process-HTML" call // without distrubing the accumulated text. define constant tag-close-nothing = method (tag :: , Out-Text :: , blank :: ) blank; end method; // Specialized "add-text" methods provide EMPHASIZED versions of "" or // "" style environments. add-tag(#["I", "EM", "CITE", "VAR", "DFN"], add-text: method(text :: ) => (result :: ); if (*Icap*) as-uppercase(text) else text end end method, tag-start: tag-start-recurse, tag-close: tag-close-nothing); add-tag(#["B", "STRONG"], add-text: method(text :: ) => (result :: ); if (*Bcap*) as-uppercase(text) else text end end method, tag-start: tag-start-recurse, tag-close: tag-close-nothing); // Anchors do nothing at all. add-tag(#["A", "HEAD", "BODY", "UNKNOWN", "TT", "CODE", "SAMP", "KBD"], tag-start: tag-start-recurse, tag-close: tag-close-nothing); // Titles are eliminated entirely -- add-text simply "adds" an empty string. add-tag(#["TITLE"], add-text: method(text :: ) => (res :: ); "" end method, tag-start: tag-start-recurse, tag-close: tag-close-nothing); // For un-bracketed environments like " ", "
", etc. we must make sure // "tag-start" does not start a recursive call to "process-HTML". We may or // may not want to dump accumulated text. add-tag(#["!"], tag-start: method (New-Tag ::, Old-Tag :: , Out-Text :: , Current-Text :: , File :: , blank :: ) => (result :: , blank :: ); values(Current-Text, blank); end method); add-tag(#["P"], tag-start: method (New-Tag :: , Old-Tag :: , Out-Text :: , Current-Text :: , File :: , blank :: ) => (result :: , blank :: ); values(Current-Text, break-up(Old-Tag, Out-Text, blank, #t)); end method); add-tag(#["BR"], tag-start: method (New-Tag :: , Old-Tag :: , Out-Text :: , Current-Text :: , File :: , blank :: ) => (result :: , blank :: ); if (Pre-Count > 0) add-eol(Out-Text); values(Current-Text, blank); else values(Current-Text, break-up(Old-Tag, Out-Text, blank, #f)); end if; end method); add-tag(#["HR"], tag-start: method (New-Tag :: , Old-Tag :: , Out-Text :: , Current-Text :: , File :: , blank :: ) => (result :: , blank :: ); break-up(Old-Tag, Out-Text, blank, #t); write-line(*standard-output*, concatenate('-' * *linelen*, "\n")); values(Current-Text, #t); end method); add-tag(#["IMG"], tag-start: method (New-Tag :: , Old-Tag :: , Out-Text :: , Current-Text :: , File :: , blank :: ) => (result :: , blank :: ); break-up(Old-Tag, Out-Text, blank, #t); write-line(*standard-output*, concatenate(' ' * (*margin* + 4), "*** INLINE IMAGE IGNORED ***\n")); values(Current-Text, #t); end method); // Preformatted text is tricky. First we dump accumulated text. Then we // increment the global variable "Pre-Count" which enables magic behavior in // several standard routines. Finally, when the environment is closed, we // split the output around the newlines and do line-by-line output so that the // left margin will be observed. add-tag(#["PRE"], break-up: method (text :: , blank :: , want-blank :: ) => (result :: ); unless(blank) new-line(*standard-output*) end; let first = sfind(text, curry(\~=, '\n')); let last = rsfind(text, complement(rcurry(member?, "\n "))); if (last) while (first < last) let endline = sfind(text, curry(\=, '\n'), start: first, failure: last + 1); print-with-prefix(text, start: first, end: endline); first := endline + 1; end while; end if; write-string("\n"); #t end method, tag-start: method (New-Tag :: , Old-Tag :: , Out-Text :: , Current-Text :: , File :: , blank :: ) => (result :: , blank :: ); let blank = break-up(Old-Tag, Out-Text, blank, #t); block () Pre-Count := Pre-Count + 1; process-HTML(New-Tag, Out-Text, Current-Text, File, blank); cleanup Pre-Count := Pre-Count - 1; end block; end method); // Since the following methods add nested indentation levels, we create a // stack for the margins. A "document state" record might be cleaner, but is // probably overkill for this particular application. define constant margins :: = make( ); add-tag(#["UL", "OL", "MENU", "DL", "BLOCKQUOTE"], tag-start: method (New-Tag :: , Old-Tag :: , Out-Text :: , Current-Text :: , File :: , blank :: ) => (result :: , blank :: ); break-up(Old-Tag, Out-Text, blank, #t); let OldCounter = counter; block () push(margins, *margin*); *margin* := *margin* + 4; counter := 0; process-HTML(New-Tag, Out-Text, Current-Text, File, blank); cleanup *margin* := pop(margins); counter := OldCounter; end block; end method); // The " " tag causes bullets or numbers to be printed before the // immediately following text. We use a global "prefix" variable to magically // change the behavior of the next call to "print-with-prefix". The precise // choice of prefix depends upon the enclosing environment. add-tag(#["LI"], tag-start: method (New-Tag :: , Old-Tag :: , Out-Text :: , Current-Text :: , File :: , blank :: ) => (result :: , blank :: ); let blank = break-up(Old-Tag, Out-Text, blank, #f); if (Old-Tag = #"OL") counter := counter + 1; prefix := copy-sequence("0. "); prefix[0] := as( , counter + as( , '0')); else prefix := "* "; end if; values(Current-Text, blank); end method); // In " " environments, we must simply switch the left margin back and // forth between "unindented" and "indented" depending on whether we are // currently processing a "term" or a "definition". add-tag(#["DT"], tag-start: method (New-Tag ::
, Old-Tag :: , Out-Text :: , Current-Text :: , File :: , blank :: ) => (result :: , blank :: ); let blank = break-up(Old-Tag, Out-Text, blank, #f); *margin* := first(margins); values(Current-Text, blank); end method); add-tag(#["DD"], tag-start: method (New-Tag :: , Old-Tag :: , Out-Text :: , Current-Text :: , File :: , blank :: ) => (result :: , blank :: ); let blank = break-up(Old-Tag, Out-Text, blank, #f); *margin* := first(margins) + 4; values(Current-Text, blank); end method); // Headers may centered and/or underlined and ignore margins. They must still // be broken up into lines, although we use a shorter line-length. add-tag(#["H1"], break-up: method (text :: , blank :: , want-blank :: ) => (result :: ); unless(blank) new-line(*standard-output*) end; let first = sfind(text, curry(\~=, ' ')); let Text-Size = size(text); let Find-Break = method (first, last) if (last >= Text-Size) Text-Size; else let find = rsfind(text, curry(\=, ' '), start: first, end: last); if (find) rsfind(text, curry(\~=, ' '), start: first, end: find) + 1 else sfind(text, curry(\=, ' '), start: first) | size(text) end if end if end method; let Max-Length = 0; while (first) let last = Find-Break(first, first + *linelen* - 20); Max-Length := max(Max-Length, last - first); write-string(' ' * truncate/(*linelen* + first - last, 2)); write-line(*standard-output*, text, start: first, end: last); first := sfind(text, curry(\~=, ' '), start: last + 1) end while; if (*H1under*) write-string(' ' * truncate/(*linelen* - Max-Length, 2)); write-line(*standard-output*, '=' * Max-Length); end if; if (want-blank) write-string("\n") end if; want-blank end method); add-tag(#["H2"], break-up: method (text :: , blank :: , want-blank :: ) => (result :: ); unless(blank) new-line(*standard-output*) end; let first = sfind(text, curry(\~=, ' ')); let Text-Size = size(text); let Find-Break = method (first, last) if (last >= Text-Size) Text-Size; else let find = rsfind(text, curry(\=, ' '), start: first, end: last); if (find) rsfind(text, curry(\~=, ' '), start: first, end: find) + 1 else sfind(text, curry(\=, ' '), start: first) | size(text) end if end if end method; let Max-Length = 0; while (first) let last = Find-Break(first, first + *linelen* - 20); Max-Length := max(Max-Length, last - first); write-line(*standard-output*, text, start: first, end: last); first := sfind(text, curry(\~=, ' '), start: last + 1) end while; if (*H2under*) write-line(*standard-output*, '-' * Max-Length); #f; else new-line(*standard-output*); #t end if; end method); add-tag(#["H3", "H4", "H5", "H6"], break-up: method (text :: , blank :: , want-blank :: ) => (result :: ); unless(blank) new-line(*standard-output*) end; block () push(margins, *margin*); *margin* := 0; add-text-table[#"TEXT"](text, #t, want-blank); cleanup *margin* := pop(margins); end; end method);