#!/usr/bin/env ruby
#
# Cleans up an HTML document with Hpricot and prints the result on stdout.
#
# The path of the HTML file to clean is taken from the first command-line
# argument. The script strips generator credits and HTML comments, drops
# presentational class/name/nowrap attributes, moves anchor ids onto their
# parent elements, and rewrites figure/table/caption wrappers.

require 'rubygems'
require 'hpricot'

source_path = ARGV[0]
abort "usage: #{File.basename($PROGRAM_NAME)} FILE" unless source_path

# File.open (rather than Kernel#open) so the argument is always treated as a
# plain file path, and the block form so the handle is closed after parsing.
doc = File.open(source_path) { |io| Hpricot(io) }

# Nuke the credits
doc.search("//meta[@name='GENERATOR']").remove
doc.search("//meta[@name='generator']").remove
doc.search("/html/body/blockquote/em/text()").each do |text|
  next unless text.to_s =~ /This document was/

  quote = text.parent.parent
  # The node two positions before the credit blockquote is removed with it.
  # Guard each step: either lookup yields nil when the blockquote has fewer
  # than two preceding nodes.
  before = quote.previous
  before = before.previous if before
  doc.search(before.xpath).remove if before
  doc.search(quote.xpath).remove
end

# and HTML comments
doc.search("//comment()").remove

# clean up the subsections
[%w[h2 section], %w[h3 subsection], %w[h4 subsubsection]].each do |tag, klass|
  doc.search("//#{tag}[@class='#{klass}']").remove_attr(:class)
end

# Clean up the footnotes and references DL lists
doc.search("//dl[@class='thebibliography']").set(:class, "refs")
%w[dt-thefootnotes dd-thefootnotes dt-thebibliography dd-thebibliography].each do |klass|
  tag = klass[0, 2] # "dt" or "dd": the class name is prefixed with its tag
  doc.search("//dl/#{tag}[@class='#{klass}']").remove_attr(:class)
end
(doc/"//dl[@class='thefootnotes']").set(:class, 'footnotes')

# Squish IDs into parents and remove unnecessary a's
["//h2/a", "//h3/a", "//h4/a"].each do |path|
  doc.search(path).each do |anchor|
    heading = anchor.parent
    heading.set_attribute(:id, anchor.get_attribute(:id))
    # Keep the anchor's content in place, but as a text node instead of a link.
    heading.replace_child(anchor, Hpricot::Text.new(anchor.inner_html))
  end
end
["//dl[@class='footnotes']/dt/a", "//dl[@class='refs']/dt/a"].each do |path|
  doc.search(path).each do |anchor|
    term = anchor.parent
    term.set_attribute(:id, anchor.get_attribute(:id))
    term.inner_html = anchor.inner_html
  end
end
(doc/"//a").remove_attr(:name)
(doc/"//td").remove_attr(:nowrap)

# Clean up some of the figures
#
# NOTE(review): the wrapper strings passed to swap below consist only of
# newlines. The later selectors (div[@class='figure'], p[@class='caption'])
# suggest that wrapping tags were intended here -- confirm against the
# original script before relying on this section.
(doc/"//blockquote[@class='table']").each do |e|
  e.swap("\n\n" + e.inner_html + "\n\n\n")
end
(doc/"//blockquote[@class='figure']").each do |e|
  e.swap("\n\n" + e.inner_html + "\n\n\n")
end
(doc/"//div[@class='caption']").each do |e|
  e.swap("\n\n\n" + e.inner_html + "\n\n\n\n")
end
(doc/"//div[@class='figure']/div[@class='center']/img").each do |e|
  e.parent.swap("\n\n" + e.parent.inner_html + "\n\n\n")
end
(doc/"//div[@class='figure']/div[@class='center']").each do |e|
  e.swap(e.inner_html)
end
(doc/"//div[@class='figure']/p[@class='caption']/table/tr/td").each do |e|
  # Replace the whole table (td -> tr -> table) with the cell's content.
  e.parent.parent.swap(e.inner_html)
end
(doc/"//div[@class='figure']/p[@class='caption']/font").each do |e|
  e.swap(e.inner_html)
end

# Move a trailing label into the div
(doc/"//div[@class='figure']/p[@class='caption']").each do |cap|
  label = cap.next_sibling
  # next_sibling is nil when the caption is the last element of its parent.
  next unless label && label.name == "a"

  theid = label.get_attribute(:id)
  next unless theid

  label.parent.children.delete(label)
  cap.parent.set_attribute(:id, theid)
end

doc.search("//dl[@class='refs']/dt/font").each do |font|
  font.parent.inner_html = font.inner_html
end

puts doc.to_html