#!/usr/local/bin/perl

###############################################################################
# This software is being provided to you, the LICENSEE, by the Massachusetts  #
# Institute of Technology (M.I.T.) under the following license.  By           #
# obtaining, using and/or copying this software, you agree that you have      #
# read, understood, and will comply with these terms and conditions:          #
#                                                                             #
# Permission to use, copy, modify and distribute, including the right to      #
# grant others the right to distribute at any tier, this software and its     #
# documentation for any purpose and without fee or royalty is hereby granted, #
# provided that you agree to comply with the following copyright notice and   #
# statements, including the disclaimer, and that the same appear on ALL       #
# copies of the software and documentation, including modifications that you  #
# make for internal use or for distribution:                                  #
#                                                                             #
# Copyright 1991-4 by the Massachusetts Institute of Technology.  All rights  #
# reserved.                                                                   #
#                                                                             #
# THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS OR   #
# WARRANTIES, EXPRESS OR IMPLIED.  By way of example, but not limitation,     #
# M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS #
# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR      #
# DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS,        #
# TRADEMARKS OR OTHER RIGHTS.                                                 #
#                                                                             #
# The name of the Massachusetts Institute of Technology or M.I.T. may NOT be  #
# used in advertising or publicity pertaining to distribution of the          #
# software.  Title to copyright in this software and any associated           #
# documentation shall at all times remain with M.I.T., and USER agrees to     #
# preserve same.                                                              #
###############################################################################

# bugproc
# removes some bugs in WSJ formatting
# checks for proper nesting of paragraphs and sentences
# assumes 1 sentence per line
# eliminates blank paragraphs and sentences
#

# Main filter loop: reads the input one line at a time (files named on the
# command line, or STDIN) and writes the cleaned text to STDOUT.
#
# State carried from line to line (package globals, also read by perr):
#   $pe   - tag of the paragraph that is currently open, "" when none is
#   $par  - the same tag while it is still waiting to be printed; it is
#           written in front of the first text line that survives, so a
#           paragraph without any surviving text produces no output
#   $se, $sent - the same pair for the current sentence
while(<>)
{	chomp;

	if(/<s/)		# start sentence
	{	s/\s//g;		# drop all whitespace from the tag line
		$sent=$_;		# pending: printed with the first text line
		$se=$_;
		if(!$pe) {perr("nesting err 1");}	# no paragraph is open
		next;
	}
	if(/<\/s>/)		# end of sentence
	{	if(!$pe) {perr("nesting err 2");}	# no paragraph is open
		# close the sentence only if its start tag was actually printed
		if($se && !$sent) {print "</s>\n";}
		$sent="";
		$se="";
		next;
	}
	if(/<p/)		# start paragraph
	{	s/\s//g;		# drop all whitespace from the tag line
		$par=$_;		# pending: printed with the first text line
		$pe=$_;
		if($se) {perr("nesting err 3");}	# a sentence is still open
		next;
	}
	if(/<\/p>/)		# end of paragraph
	{	if($se) {perr("nesting err 4");}	# a sentence is still open
		# close the paragraph only if its start tag was actually printed
		if($pe && !$par) {print "</p>\n";}
		$par="";
		$pe="";
		next;
	}

	if(/<.*>/)		# any other tag: report it and drop the line
	{	perr("spurious SGML: $_");
		next;
	}

	s/\t/ /g;			# separate everything by spaces
	s/^/ /;				# pad both ends so that the (\D)/(\W) context
	s/$/ /;				# patterns below also match at the line edges

	if(/^ *@/ || /\.{5,}/) {next;}		# remove tables
	if(/^ *[a-z]\.?-/) {next;}		# remove table notes
	if(/^ *\*/) {next;}			# remove footnotes
	s/(\D)\*\*(\D)/$1 $2/g;			# remove footnote references
	# The single-star rule runs twice: /g matches cannot overlap, so in
	# "a*b*c" the first pass consumes "b" and leaves the second star.
	s/(\D)\*(\D)/$1 $2/g;			# remove footnote references
	s/(\D)\*(\D)/$1 $2/g;			# remove footnote references

				# WSJ bug fixes
	s/(\d\/) +(\d)/$1$2/g;			# remove space from frac
	s/(\d) +(\/\d)/$1$2/g;			# remove space from frac

	s/(\w)\(/$1 (/g;			# eg. x( -> x (
	s/\)(\w)/) $1/g;			# eg. )x -> ) x

	s/]/!/g;				# WSJ89 uses ] for !
	# The two digit rules are already covered by the \w rules above and
	# remain only as a safety net.  The second one used to append a stray
	# ")" to its replacement.
	s/(\d)\((\d)/$1 ($2/g;			# \d(\d
	s/(\d)\)(\d)/$1) $2/g;			# \d)\d
	s/([a-zA-Z]{2,}\.)(\d)/$1 $2/g;		# eg. Sept.30
	s/,([a-zA-Z])/, $1/g;			# eg. 20,Smith
	s/>//g;					# WSJ88: spurious >
	if(/^[\s-]*$/) {next;}			# just -'s
	s/(\W)milion(\W)/$1million$2/g;		# spelling err

	s/(\W&\s*)Co([^\w\.-])/$1Co.$2/g;	# "& Co" -> "& Co."
	s/(\WU\.S)([^\.\w])/$1.$2/g;		# U.S -> U.S.

				# forbidden symbols (reported, line is kept)
	if(/</) {perr("<");}				# <
	if(/>/) {perr(">");}				# > (cannot trigger: all > were stripped above)
	if(/_/) {perr("_");}				# _

	s/\s{2,}/ /g;			# collapse whitespace runs, then trim
	s/^ //;
	s/ $//;
	# Compare against the empty string rather than testing truth: a line
	# that consists of the single character "0" is false in Perl and
	# would otherwise be dropped.
	if($_ ne "")
	{	if($par)		# first text of this paragraph: emit its tag
		{	print "$par\n";
			$par="";
		}
		if($sent)		# first text of this sentence: emit its tag
		{	print "$sent\n";
			$sent="";
		}
		print "$_\n";
	}
}

# perr(MESSAGE)
# Writes one diagnostic line to STDERR: "bugproc: ", then the open sentence
# tag ($se) if there is one, otherwise the open paragraph tag ($pe) if there
# is one, then the current input line number ($.) and MESSAGE.
sub perr
{	my $context = $se ? "sent=$se: "
		    : $pe ? "par=$pe: "
		    :       "";
	print STDERR "bugproc: ${context}line=$.: $_[0]\n";
}
