###############################################################################
# This software is being provided to you, the LICENSEE, by the Massachusetts  #
# Institute of Technology (M.I.T.) under the following license.  By           #
# obtaining, using and/or copying this software, you agree that you have      #
# read, understood, and will comply with these terms and conditions:          #
#                                                                             #
# Permission to use, copy, modify and distribute, including the right to      #
# grant others the right to distribute at any tier, this software and its     #
# documentation for any purpose and without fee or royalty is hereby granted, #
# provided that you agree to comply with the following copyright notice and   #
# statements, including the disclaimer, and that the same appear on ALL       #
# copies of the software and documentation, including modifications that you  #
# make for internal use or for distribution:                                  #
#                                                                             #
# Copyright 1991-4 by the Massachusetts Institute of Technology.  All rights  #
# reserved.                                                                   #
#                                                                             #
# THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS OR   #
# WARRANTIES, EXPRESS OR IMPLIED.  By way of example, but not limitation,     #
# M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS #
# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR      #
# DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS,        #
# TRADEMARKS OR OTHER RIGHTS.                                                 #
#                                                                             #
# The name of the Massachusetts Institute of Technology or M.I.T. may NOT be  #
# used in advertising or publicity pertaining to distribution of the          #
# software.  Title to copyright in this software and any associated           #
# documentation shall at all times remain with M.I.T., and USER agrees to     #
# preserve same.                                                              #
###############################################################################


# WSJ0 text preprocessors:
# For documentation, see:
#
# 	Paul, D. B. and Baker, J. M., "The Design for the Wall Street
#	Journal-based CSR Corpus", Proc Speech and Natural Language
#	Workshop, Harriman, NY, Feb. 1992.
#		postscript file: WSJ-design.ps
#
#	See also viewgraphs from "SLS Mid-Term Meeting" held at CMU
#	in Oct 91.
#		postscript file: WSJ-preproc.vg.ps
#
#			Douglas B. Paul
#			MIT Lincoln Laboratory
#			26 January 1994.

# NOTE: The programs described below are written in perl.  Perl is
#	available via anon FTP from prep.ai.mit.edu:/pub/gnu.

# NOTE:	The filenames used below (e.g. raw-text) are just documentation
#	names.  The actual file names used in the data distribution
#	are of the form <raw-text-filename> (defined below) under an
#	appropriate directory structure.


############################################################################
# general preprocessing:

cat raw-text | getsentproc <raw-text-filename> | bugproc \
	| numproc | abbrevproc > gen-text

# "raw-text" is a WSJ file from the ACL-DCI CDROM.
#	(Tipster CDROM WSJ data will NOT work.)
# Getsentproc extracts and marks the sentences on the CD-ROM.
#	This is VERY data-format sensitive.
#	<raw-text-filename>=<year>.<acl-dci-filenumber>
#	Notes:	<year> is 2 digits: e.g. 89
#		<acl-dci-filenumber> is left padded with zeros to make
#			all filenumbers the same length: e.g. 001
#	Creates the SGML WSJ sentence and paragraph ID marks.
# Bugproc corrects several minor data formatting bugs.
#	Also removes tables.
# Numproc processes the numbers into textual form.
#	uses datafile num-exceptions: (VERY task dependent)
# Abbrevproc processes the abbreviations.
#	uses datafile abbreviations: (VERY task dependent)
############################################################################


############################################################################
# create verbalized punctuation texts:

cat gen-text | punctproc > punct-prompt-text
cat punct-prompt-text | lc2uc > punct-truth-text

# Punctproc performs the punctuation recognition and representation.
# lc2uc: lower-case to upper-case (NOT equivalent to "tr a-z A-Z")
############################################################################


############################################################################
# create non-verbalized punctuation texts:

cat gen-text | punctproc -np | lc2uc > nvp-truth-text
cat gen-text | gp2nvp-pr-proc > nvp-prompt-text

# Punctproc performs the punctuation recognition and representation
#	and then eliminates it.
# Gp2nvp-pr-proc makes a few changes to create the nvp-prompt-text.
# lc2uc: lower-case to upper-case (NOT equivalent to "tr a-z A-Z")
############################################################################
