#!/usr/local/bin/perl

# This perl script reads WSJ text data (from the TIPSTER corpus) that
# has already been conditioned by "partag.perl" -- this means that the
# text data has been trimmed to include only the contents of the <LP>
# and <TEXT> tags, and these have been retagged using <art.docno> and
# <p.docno.parno>; the text for each paragraph is all on one line,
# with </p> on the next line.  Material that lies outside of
# paragraphs (i.e. tables and lists) has -- I hope -- all been left
# out.

# This script takes the single line of paragraph content and divides
# it up into sentences, tagging each sentence with
# <s.docno.parno.sentno>; each sentence is written as a single line,
# with </s> on the next line.

# Written by David Graff, LDC, with major help from Doug Paul, MIT-LL
#

$file="addressforms";		# default abbreviation file

# NOTE: command-line handling is disabled.  The commented-out block below
# is kept for reference only; as long as it stays commented out, $file is
# always the default above and $vflg is never set by this section.
#for($i=0,$j=0;$i<=$#ARGV;$i++)
#{	if($ARGV[$i] =~ /^-/)
#	{	if($ARGV[$i] =~ /^-v/) {$vflg=1;}
#		else {&perr("illegal flag: $ARGV[$i]");}
#	}
#	else
#	{ #	if($file) {&perr("multiple file arg");}
#		$file=$ARGV[i];
#	}
#}
#@ARGV=();
#if(!file) {&perr("no abbreviation file specified"); }

# Load the abbreviation table into %abbrev.
#
# Lines starting with '#' and empty lines are skipped.  Every other line
# must be "<word> <value>", where <word> starts in column 1, begins with
# a letter, contains only letters and periods, ends in a period, and has
# at least two characters before that final period.  Anything else is a
# fatal error (reported through perr, which exits).
#
# The table is keyed by the word WITHOUT its final period; the value is
# the remainder of the line.  If the word contains lower-case letters, an
# all-upper-case copy of both key and value is stored as well.
if(!open(FILE,$file)) {&perr("cannot open abbreviation file $file: $!"); }
while(<FILE>)
{	if(/^#/) {next;}	# comment
	s/\n//;
	if(!$_) {next;}		# blank
	$y=$_;			# keep the untouched line for error messages

	# Extract the 1st word.  The result of the substitution is checked
	# because a failed match does not reset $1: taking $1 blindly could
	# pick up an undefined value or a capture left over from an earlier
	# line.  Trailing whitespace is optional here so that a line holding
	# only a word reaches the "no value" diagnostic below.
	$x='';
	if(s/^(\S+)\s*//) {$x=$1;}
	if(!$x) {&perr("no word: $y");}
	if(!$_) {&perr("no value: $y");}

	if($x =~ /\.$/)				# abbreviations
	{	if($x !~ /^[a-zA-Z][a-zA-Z\.]+\.$/)
			{&perr("illegal abbreviation: $x");}
		$x =~ s/\.$//;			# key has no final period
		$abbrev{$x}=$_;
		if($x =~ /[a-z]/)
		{	$x =~ tr/a-z/A-Z/;	#UC version
			tr/a-z/A-Z/;
			$abbrev{$x}=$_;
		}
		# longest key seen; nothing visible in this script reads it
		if(length($x)>$maxabl) {$maxabl=length($x);}
		$n++;				# number of abbreviation lines
	}
	else {&perr("not an abbreviation: $x");}
}
close(FILE);
if($vflg) {print STDERR "$n lines read from file\n";}

# Main loop: read the conditioned text and write it back out with every
# paragraph split into <s.docno.parno.sentno> ... </s> sentences.
#
#  - a line starting with "<p" opens a paragraph: it is copied, and the
#    first sentence tag for that paragraph is written right after it;
#  - any other line starting with "<" is copied unchanged;
#  - every remaining line is paragraph text and is split into sentences.
#
# %abbrev (abbreviation without its final period => value) is consulted
# to avoid splitting after a known abbreviation.
while (<>) {
    if ( /^<p/ ) {   # start of paragraph:
	s/\n$//;     #  remove trailing '\n' (only if there is one)
	print "$_\n";  #  copy to output
	s/>/./;      #  replace close-bracket with '.'
	s/p/s/;      #  replace 'p' with 's' tag
	$stag = $_;  #  store for use as sentence start tag
	$sno = 1;    #  initialize sentence counter
	print "$stag$sno>\n";   # print the first tag
    }
    elsif ( /^</ ) {  # copy all other tags to output
	print $_;
    }
    else         # sentence separation and tagging is done here
    {
	s/\n$//;     # work on the bare text; newlines are added on output
	$held = '';  # text of the current sentence that has already been
		     # scanned and ends in a period that is NOT a boundary

	# A candidate boundary is: one of ? ! . , any closing quotes or
	# parens, one space, any opening quotes or parens, one or more
	# capitals, then an apostrophe, space or lower-case letter.
	while ( /[?!\.]['")]* [("']*[A-Z]+[' a-z]/ ) {
	    $match = $&;
	    $thiss = $`;
	    $nxtend = $';  #';
	    $lwindx = rindex( $thiss, ' ' )+1;
	    $lastw = substr( $thiss, $lwindx );  # word just before the mark
	    if ( $match =~ /^\./ &&  # check for mid-sentence abbrevs
	       ( $abbrev{ $lastw } ||
	       # ... or a one-letter initial (at the start of the text, or
	       # after a space or period) with no quotes/parens around
	       ( $thiss =~ /(^|[ \.])[A-Z]$/ && $match !~ /[("')]/ ))) {
		# Not a boundary: set the text up to and including the
		# period aside and keep scanning after it.  Holding the
		# text aside (rather than masking the period with a
		# stand-in character) leaves any '|' in the input alone.
		$held .= $thiss.'.';
		$_ = substr( $match, 1 ).$nxtend;
	    } else {
		# Boundary: the sentence keeps the end mark together with
		# ALL closing quotes/parens that follow it; the single
		# space is dropped and the rest starts the next sentence.
		$match =~ /^([?!\.]['")]*) /;
		$period = $1;
		$nxtbgn = substr( $match, length( $period )+1 );
		$sno++;
		print "$held$thiss$period\n</s>\n$stag$sno>\n";
		$held = '';
		$_ = $nxtbgn.$nxtend;
	    }
	}
	print "$held$_\n</s>\n";   # last (or only) sentence of the line
    }
}

# perr(MESSAGE): report a fatal error and stop.
# Writes "sentag: MESSAGE" and a newline to STDERR, then terminates the
# program with exit status 1.  Does not return.
sub perr
{
	my ($msg) = @_;
	print STDERR "sentag: $msg\n";
	exit(1);
}
