#!/usr/local/bin/perl
#
# This perl script reads WSJ text data (from the TIPSTER corpus) and
# produces output in which only the <LP>...</LP> (lead paragraph) and
# <TEXT>...</TEXT> (text body) sections are reproduced, and these are
# separated and tagged at paragraph boundaries.  Output has one
# paragraph per line, with <p> and </p> on adjacent lines; each
# article, corresponding to a <DOC>...</DOC> unit, is bounded by
# <art>...</art>; start tags include the TIPSTER document number;
# paragraph start tags also include an index number (starting at 1 at
# the beginning of each article).
#
# Adapted by David Graff, LDC, from Doug Paul's "getsentproc" perl
# script.
#

for($i=0;$i<=$#ARGV;$i++)
{	if($ARGV[$i] =~ /^-/)
	{	if($ARGV[$i] =~ /^-v/) {$vflg=1;}
		else {&perr("illegal flag: $ARGV[$i]");}
	}
	else
	{	if($fileid) {&perr("multiple fileid arg");}
		$fileid=$ARGV[i];
	}
}
@ARGV=();
# if(!$fileid) {&perr("no abbreviation file specified"); }

# Scanner state for the line-by-line pass below:
#   $texton   - set by an <LP> line, cleared by a </TEXT> line; paragraph
#               tagging is only done while it is set
#   $pargon   - set while a <p...> tag has been printed and its </p> has not
#   $getdocno - set by a <DOCNO> line; the following line is then taken
#               as the document number
$texton=0;
$pargon=0;
$getdocno=0;

while(<>)
{
    chomp;	# remove \n at end of line; unlike chop, this leaves a final
		# line that has no trailing newline intact
    if ( /<DOCNO>/ ) {		# always precedes text; next line has docno
	$getdocno = 1;
    }
    elsif ( $getdocno ) {	# store the docno, tag start of article
	$getdocno = 0;
	s/\s+//g;		# strip all white space from the docno line
	$docno = $_;
	print "<art.$_>";
    }
    elsif ( /<LP>/ ) {		# lead paragraph = true start of text
	$texton = 1;		#    will be on next line
	$parno = 0;		# paragraph index restarts with each <LP>
    }
    elsif ( /<\/TEXT>/ ) {	# end of text = effective end of document
	if ( $pargon ) {	# close a paragraph only when one is open;
	    print "\n</p>";	#   a blank line may already have closed it
	}
	print "\n</art>\n";
	$pargon = 0;
	$texton = 0;
    }
    elsif ( $texton ) {		# paragraph tagging happens here

	if ( /<.*>/ || /^\s*-+\s*$/ ) {
	    next;	# ignore lines with tags, or just dashes
	}
	if ( $pargon && /^\s*$/ ) {	# blank line =  end of paragraph, 
	    print "\n</p>";		#  not start of next
	    $pargon = 0;
	    next;
	}
	elsif ( /^   [A-Z"(]/ ) {	# start of paragraph
	    if ( $pargon ) {
		print "\n</p>";		# tag end of previous paragraph
	    }
	    elsif ( /^   [A-Z].* --/ ) {
		# Only reached when no paragraph is open.
		if ( /:$/ ) { next; }   # delete subtitles and datelines
		# NOTE(review): .* is greedy, so everything up to the LAST
		# " --" on the line is removed -- confirm this is intended.
		s/^   [A-Z][A-Z].* --//;
	    }
	    $parno++;
	    $pargon = 1;
	    print "\n<p.$docno.$parno>\n";
	}
	if ( $pargon ) {
	    s/^\s+//;		# eliminate line-initial white space
	    s/\s+/ /g;		# normalize line-internal white space
	    print "$_ ";	# print result with a trailing space
	}
    }
}

# perr(MESSAGE)
# Report a fatal error: write MESSAGE to STDERR, prefixed with "getsent: "
# and followed by a newline, then terminate the script with exit status 1.
# Never returns to the caller.
sub perr
{
	my ($message) = @_;

	print STDERR "getsent: " . $message . "\n";
	exit(1);
}

