#!/usr/local/bin/perl5

# Make test set for parsing by cleaning transcription marks from db input lines
# First input file is database file from which IFs have already been selected.
# Second input file is file of IFs for one speaker side (use grep, see example below)
# > grep "IF  Prv CMU   c" cstar-examples.db > cstar-examples.client-ifs

# this script checks each English utterance in the db input file and prints it to 
# a .phx test set if the utterance number matches a number in the IFs input file

my $dbfile = shift || die("You must specify a database file.\n");

open(DB, "$dbfile") || die("$dbfile does not exist.\n");
my @db_lines = <DB>;
close(DB);

my $iffile = shift || die("You must specify a file of IFs.\n");

open(IFS, "$iffile") || die("$iffile does not exist.\n");
my @if_lines = <IFS>;
close(IFS);

my $phxfile = "$iffile-revised.phx";
open(PHX, ">$phxfile");

my $newiffile = "$iffile-revised.ifs";
open(NEW, ">$newiffile");

%ifs=();
$refnum=0;
$skipped=0;
$totalif=0;

# make a hash of all the IFs for which we want to retrieve an utterance and print to test set

HASH:foreach $if_line (@if_lines) {
    chomp($if_line);
    if ( ($if_line =~ m/\:empty/) ||
	 ($if_line =~ m/\:no\-tag/) ||
	 ($if_line =~ m/\:not\-understandable/) ||
	 ($if_line =~ m/\:descriptive/) ) {
	$skipped++;
	next HASH;
    }
    $if_line =~ m/^(\d+\.\d+\.\d+)/;
    $refnum = $1;

#    print "STORED IF $refnum\n";
    print NEW "$if_line\n";
    $totalif++;
    $ifs{$refnum} = $if_line;
}

print "\nSKIPPED $skipped IFs\n\n";
print "\nTOTAL OF $totalif IFs STORED\n\n";

%utts=();
$refnum=0;
$totalutt=0;

# retrieve utterances from the database if they correspond to a number in the IFs hash

my $cur_utt;
foreach $db_line (@db_lines) {
    chomp($db_line);

# NOTE: CHANGE LANGUAGE IF ENGLISH UTTERANCES ARE NOT DESIRED
    if ($db_line =~ m/^(\d+\.\d+\.\d+).*\s+lang ENG\s+Prv\s+\w+/) {
	$refnum = $1;

	if ($ifs{$refnum}) {

	    # Utterance is between 1st 2 quotes
	    $db_line =~ s/^(\S*)[^\"]*\"([^\"]*)(\".*)?$/$2/o;	
	    $cur_utt = &clean_utterance($db_line);

	    $utts{$refnum} = $cur_utt;
#	    print PHX "$refnum $cur_utt\n";
	    print PHX "$cur_utt\n";
	    $totalutt++;
	}

    }

#	print "CUR UTT: $cur_utt\n";
#	print "CUR SPK: $cur_spk\n\n";

	# Unless the utterance is undefined or is empty,
	# write the utterance and its corresponding speaker to files.
#	unless ( !defined($cur_utt)
#		|| $cur_utt =~ m/^\s*$/) {
    
}

close(PHX);

# fix database numbering errors if you get this warning
# then fix input IF file numbering and try again

if ($totalif ne $totalutt) {
    print "WARNING: MISMATCH\n";
    print "$totalif TOTAL IFs\n";
    print "$totalutt TOTAL UTTs\n\n";
}

#--- END MAIN CODE ---

# Clean up an utterance (remove transcription markers, punctuation, etc.)
sub clean_utterance {
    my $utt = shift(@_);

    # Remove empty utterance markers (from translation)
    $utt =~ s/<empty>/ /go;
    # Remove noise
    $utt =~ s/<noise>/ /go;
    # Remove false start markers
    $utt =~ s/-\// /go;
    $utt =~ s/\/-/ /go;
    # Remove repetition and correction markers
    $utt =~ s/\+\// /go;
    $utt =~ s/\/\+/ /go;
    # Remove pause markers
    $utt =~ s/<P>/ /go;
    # Remove breath markers
    $utt =~ s/<B>/ /go;
    $utt =~ s/<A>/ /go;
    # Remove filled pause markers
    $utt =~ s/<uh>/ /go;
    $utt =~ s/<uhm>/ /go;
    $utt =~ s/<hm>/ /go;
    $utt =~ s/<hes>/ /go;
    #$utt =~ s/<\"ah>/ /go;
    #$utt =~ s/<\"ahm>/ /go;
    # Remove human noise markers
    $utt =~ s/<Smack>/ /go;
    $utt =~ s/<Laugh>/ /go;
    $utt =~ s/<Swallow>/ /go;
    $utt =~ s/<Cough>/ /go;
    $utt =~ s/<Throat>/ /go;
    $utt =~ s/<Noise>/ /go;
    # Remove incomprehensible words markers
    $utt =~ s/<%>/ /go;
    # Remove hard-to-identify/uncertain word markers
    $utt =~ s/%//go;
    # Remove foreign word markers
    $utt =~ s/<\*\w\w\w>//go;
    # Remove aborted articulation (unfinished word) markers
    $utt =~ s/\w*=/ /go;
    # Remove technical interruption markers
    $utt =~ s/<\*T>t/ /go;
    $utt =~ s/<\*T>/ /go;
    $utt =~ s/<T_>\w*/ /go; # Remove interrupted word, too.
    $utt =~ s/\w*<_T>/ /go; # Remove interrupted word, too.
    # Remove neologism markers
    $utt =~ s/\*//go;
    # Remove articulatory interruption (interrupted word) markers
    $utt =~ s/_[^_]*_//go;
    # Remove punctuation
    $utt =~ s/[?.,<>()!_\-\/]/ /go;

    $utt =~ tr/ / /s;
    $utt =~ s/^\s*//o;
    $utt =~ s/\s*$//o;
    $utt = lc($utt);

    return($utt);
}
