#!/usr/local/bin/perl5

# Make test set for parsing by cleaning transcription marks from db input lines

my $dbfile = shift || die("You must specify an input file.\n");

open(DB, "$dbfile") || die("$dbfile does not exist.\n");
my @db_lines = <DB>;
close(DB);

my $phxafile = "$dbfile-agent-sdus.phx";
open(TEMPA, ">$phxafile");

my $spkafile = "$dbfile-agent-sdus.spk";
open(TEMPA2, ">$spkafile");

my $dbafile = "$dbfile-agent-sdus.db";
open(TEMPA3, ">$dbafile");

my $phxcfile = "$dbfile-client-sdus.phx";
open(TEMPC, ">$phxcfile");

my $spkcfile = "$dbfile-client-sdus.spk";
open(TEMPC2, ">$spkcfile");

my $dbcfile = "$dbfile-client-sdus.db";
open(TEMPC3, ">$dbcfile");

my $cur_utt;
foreach $db_line (@db_lines) {
    chomp($db_line);
    if ($db_line =~ m/\s+lang ENG\s+Prv\s+\w+/) {
	# Utterance is between 1st 2 quotes
	$db_line =~ s/^(\S*)[^\"]*\"([^\"]*)(\".*)?$/$2/o;	
	$cur_utt = &clean_utterance($db_line);
    }
    elsif ($db_line =~ m/IF\s+Prv\s+\w+\s+\S/) {
	# IF lines
	$db_line =~ s/^\s*//o;
	$db_line =~ s/\s*$//o;
	$db_line =~ s/^(\S*).*Prv \w+\s+(.*)\s*$/$2/o;
	$db_line =~ s/^([ac]:)\s*/$1/o;

	$cur_spk = $1;
	$cur_spk =~ s/([ac]):/$1/;

	$db_line =~ tr/[A-Z]/[a-z]/; # Added 15Aug2001
#	    $db_line =~ lc($db_line); # Added 15Aug2001

#	print "CUR UTT: $cur_utt\n";
#	print "CUR SPK: $cur_spk\n\n";

	# Unless the utterance is undefined or is empty,
	# write the utterance and its corresponding speaker to files.
	unless ( !defined($cur_utt)
		|| $cur_utt =~ m/^\s*$/) {
	    if ( $cur_spk eq "a" ) {

#		print "CUR SPK: $cur_spk\n\n";
#		print "CUR DBLINE: $db_line\n\n";		

		print TEMPA "$cur_utt\n";
		print TEMPA2 "$cur_spk\n";
		print TEMPA3 "$db_line\n";
	    }
	    else {

#		print "CUR SPK: $cur_spk\n\n";
#		print "CUR DBLINE: $db_line\n\n";		

		print TEMPC "$cur_utt\n";
		print TEMPC2 "$cur_spk\n";
		print TEMPC3 "$db_line\n";
	    }
	}
    # Get rid of the old utterance and speaker info before processing next input line
    undef($cur_utt);
    undef($cur_spk);
    }
}

close(TEMPA);
close(TEMPA2);
close(TEMPA3);
close(TEMPC);
close(TEMPC2);
close(TEMPC3);

# Clean up an utterance (remove transcription markers, punctuation, etc.)
sub clean_utterance {
    my $utt = shift(@_);

    # Remove empty utterance markers (from translation)
    $utt =~ s/<empty>/ /go;
    # Remove noise
    $utt =~ s/<noise>/ /go;
    # Remove false start markers
    $utt =~ s/-\// /go;
    $utt =~ s/\/-/ /go;
    # Remove repetition and correction markers
    $utt =~ s/\+\// /go;
    $utt =~ s/\/\+/ /go;
    # Remove pause markers
    $utt =~ s/<P>/ /go;
    # Remove breath markers
    $utt =~ s/<B>/ /go;
    $utt =~ s/<A>/ /go;
    # Remove filled pause markers
    $utt =~ s/<uh>/ /go;
    $utt =~ s/<uhm>/ /go;
    $utt =~ s/<hm>/ /go;
    $utt =~ s/<hes>/ /go;
    #$utt =~ s/<\"ah>/ /go;
    #$utt =~ s/<\"ahm>/ /go;
    # Remove human noise markers
    $utt =~ s/<Smack>/ /go;
    $utt =~ s/<Laugh>/ /go;
    $utt =~ s/<Swallow>/ /go;
    $utt =~ s/<Cough>/ /go;
    $utt =~ s/<Throat>/ /go;
    $utt =~ s/<Noise>/ /go;
    # Remove incomprehensible words markers
    $utt =~ s/<%>/ /go;
    # Remove hard-to-identify/uncertain word markers
    $utt =~ s/%//go;
    # Remove foreign word markers
    $utt =~ s/<\*\w\w\w>//go;
    # Remove aborted articulation (unfinished word) markers
    $utt =~ s/\w*=/ /go;
    # Remove technical interruption markers
    $utt =~ s/<\*T>t/ /go;
    $utt =~ s/<\*T>/ /go;
    $utt =~ s/<T_>\w*/ /go; # Remove interrupted word, too.
    $utt =~ s/\w*<_T>/ /go; # Remove interrupted word, too.
    # Remove neologism markers
    $utt =~ s/\*//go;
    # Remove articulatory interruption (interrupted word) markers
    $utt =~ s/_[^_]*_//go;
    # Remove punctuation
    $utt =~ s/[?.,<>()!_\-\/]/ /go;

    $utt =~ tr/ / /s;
    $utt =~ s/^\s*//o;
    $utt =~ s/\s*$//o;
    $utt = lc($utt);

    return($utt);
}
