#!/usr/local/bin/perl

# This perl script reads San Jose Mercury (SJM) newswire text data
# from the TIPSTER corpus and produces output in which only the
# <LEAD_PARAGRAPH>...</LEAD_PARAGRAPH> (lead paragraph) and
# <TEXT>...</TEXT> (text body) sections are reproduced, and these are
# separated and tagged at paragraph boundaries.  Output has one
# paragraph per line, with the paragraph start and end tags on the
# adjacent lines; each article, corresponding to a <DOC>...</DOC>
# unit, is bounded by <art.DOCNO>...</art>; start tags include the
# TIPSTER document number; paragraph start tags (<p.DOCNO.N>) also
# include an index number (starting at 1 at the beginning of each
# article).

# Peculiarities of the SJM data include:
#    - tags are on the same line with their contents (not alone on a
# line); if contents are short, start & end tag are on the same line
#    - paragraph breaks are marked by a period+semicolon sequence (not
# by line feeds or leading spaces)
#    - the <LEAD_PARAGRAPH> and <TEXT> sections are typically not
# adjacent in the text stream; other tagged elements intervene (but
# <LEAD_PARAGRAPH> always comes first)
#    - as published on the TIPSTER Vol.3 cdrom, some non-alphabetic
# characters have been converted to SGML entities, as follows:
#          char   converted to
#           &      &amp;
#           <      &lanb; (?)
#           >      &ranb; (?)
#           [      &lsqb;
#           ]      &rsqb;
#           +      &plus;
#           -      &minus;
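# (The TIPSTER readme file for SJM data on the Volume 3 disc mentions
# that only the first three conversions listed above are done
# consistently; some occurrences of [ ] + - escaped conversion.)
# This script converts &amp; &lsqb; &rsqb; &plus; and &minus; back to
# the original characters; &lanb; and &ranb; are left untouched.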

use strict;
use warnings;

# Main filter: reads SJM TIPSTER text from the files named on the command
# line (or STDIN) and writes the re-tagged articles to STDOUT.
#
# Output structure per article:
#     <art.DOCNO>
#     <p.DOCNO.1>
#     ...paragraph text on one line...
#     </p>
#     ...
#     </art>
#
# An <art.DOCNO> tag is only printed once a usable <LEAD_PARAGRAPH> or
# <TEXT> section is found, and every opened <art>/<p> tag is closed --
# including the ones belonging to the last article in the input.

# ---- State carried across input lines ----
my $texton   = 0;         # true while inside an open LEAD_PARAGRAPH/TEXT section
my $docno    = "none";    # document number taken from the latest <DOCNO> line
my $parno    = 0;         # number of paragraphs emitted so far in this article
my $thispar  = '';        # text collected for the paragraph currently being built
my $art_open = 0;         # true after <art.DOCNO> was printed and before </art>

# Print the pending paragraph (start tag, text, end tag) and reset the
# accumulator.  Does nothing when no text is pending, so an empty paragraph
# never produces an unbalanced or empty <p> element and never uses up a
# paragraph number.
sub flush_paragraph {
    return if $thispar eq '';
    $parno++;
    print "<p.$docno.$parno>\n$thispar\n</p>\n";
    $thispar = '';
    return;
}

# Finish the current article: emit any pending paragraph, leave text mode,
# and print </art> only if a matching <art.DOCNO> was actually printed.
sub close_article {
    flush_paragraph();
    $texton = 0;
    if ($art_open) {
        print "</art>\n";
        $art_open = 0;
    }
    return;
}

while (<>) {
    # Normalize white space: the line terminator becomes a blank (so that
    # lines can simply be concatenated), runs of white space collapse to a
    # single blank, and a leading blank is dropped.
    s/\n/ /;
    s/\s+/ /g;
    s/^ //;

    # Undo the SGML entity conversions applied on the TIPSTER cdrom.
    s/\&amp;/&/g;
    s/\&lsqb;/[/g;
    s/\&rsqb;/]/g;
    s/\&plus;/+/g;
    s/\&minus;/-/g;

    # A <DOCNO> line starts a new article: close the previous one first.
    if ( /<DOCNO> (.+) <\/DOCNO>/ ) {
        my $new_docno = $1;
        close_article();
        $docno = $new_docno;
        $parno = 0;
        next;
    }

    # Start of a lead-paragraph or text-body section.
    if ( /<LEAD_PARAGRAPH> / || /<TEXT> / ) {
        # Remember what follows the start tag before another match can
        # change the offset variables.
        my $after_tag = substr( $_, $+[0] );

        # NOTE(review): sections whose start and end tag share one line
        # are skipped entirely, exactly as before -- confirm that dropping
        # these short sections is intended.
        next if /<\/[LT].+>/;

        # Normally nothing is pending here; if the previous section was
        # never terminated, keep its text instead of discarding it.
        flush_paragraph();

        if ( !$art_open ) {
            print "<art.$docno>\n";
            $art_open = 1;
        }
        $texton = 1;
        $_      = $after_tag;
    }

    if ($texton) {
        # An end tag closes the section; only the text before it counts.
        if ( /<\/[LT].+>/ ) {
            $_      = substr( $_, 0, $-[0] );
            $texton = 0;
        }

        # Paragraph breaks are marked by sentence-final punctuation,
        # optional closing quotes/parentheses, then "; ".  Each pass peels
        # one paragraph off the front of the line, keeping the punctuation
        # but dropping the semicolon and the blank after it.
        while ( s/\A(.*?[!?.][")]*); //s ) {
            $thispar .= $1;
            flush_paragraph();
        }
        $thispar .= $_;

        # The section just ended: emit whatever is left of its last paragraph.
        flush_paragraph() if !$texton;
    }
}

# End of input: close the final article (it has no following <DOCNO>).
close_article();
