#!/usr/local/bin/perl5

# /afs/cs.cmu.edu/project/nnspeech-6/Nespole/Data/cstar2/get-utt4phx.pl
# May 2001 by Dorcas Wallace  <dorcas@cs.cmu.edu>

# Extract utterances and their IFs from the database so that sentences can be
# retagged; output is sorted into client and agent speaker sides.
# Covers only utterances marked "olang E  lang E  Prv CMU" (utterances from a
# different Prv can also be captured, though).

#---------------------------------------------------------------------------
# SAMPLE CALL
#
# > ./get-utt4phx.pl /afs/cs.cmu.edu/project/nnspeech-5/c-star/db/cstar-examples.db
# PRODUCES .phx and .spk files (LATER may add .tagged file)
#
# SORT
# > sort -n +0 +1 +2 all_db.phx.unsorted > ! all_db.phx
#
#---------------------------------------------------------------------------

#------------------------------------------------------------------------------
# PROCESS COMMAND LINE ARGUMENTS
#------------------------------------------------------------------------------

# Input database file, e.g.
# /afs/cs.cmu.edu/project/nnspeech-5/c-star/db/cstar-examples.db
# The argument is validated before anything is derived from it.
my $dbfile = shift;
die("You must specify an input file.\n")
    unless defined $dbfile && length $dbfile;


#------------------------------------------------------------------------------
# OUTPUT FILES
#------------------------------------------------------------------------------

# Extract base file name: drop the directory part and a literal ".db" suffix.
# The dot is escaped so that a name merely ending in "db" is left untouched.
$filename = $dbfile;
$filename =~ s/^.*\///;
$filename =~ s/\.db$//;

# Every output file is written into this fixed directory.
my $out_dir = "/afs/cs.cmu.edu/project/nnspeech-6/Nespole/Data/recorded/English/devtest/scenario-a";

# Output paths (plain paths; the write mode is passed to open() separately).
$outfn  = "$out_dir/$filename.client.phx.unsorted";   # client utterances
$out2fn = "$out_dir/$filename.client.spk.unsorted";   # client speaker tags
$out3fn = "$out_dir/$filename.agent.phx.unsorted";    # agent utterances
$out4fn = "$out_dir/$filename.agent.spk.unsorted";    # agent speaker tags

$out5fn = "$out_dir/$filename.all_db.phx.unsorted";   # all utterances
$out6fn = "$out_dir/$filename.all_db.spk.unsorted";   # all speaker tags

# Read the whole database into memory; report the real OS error on failure.
open(DB, '<', $dbfile) or die("Cannot open $dbfile: $!\n");
my @db_lines = <DB>;
close(DB);

# Open all outputs up front and stop immediately if any of them cannot be
# created, instead of silently printing to an unopened handle later.
open(OUT,  '>', $outfn)  or die("Cannot write $outfn: $!\n");
open(OUT2, '>', $out2fn) or die("Cannot write $out2fn: $!\n");
open(OUT3, '>', $out3fn) or die("Cannot write $out3fn: $!\n");
open(OUT4, '>', $out4fn) or die("Cannot write $out4fn: $!\n");
open(OUT5, '>', $out5fn) or die("Cannot write $out5fn: $!\n");
open(OUT6, '>', $out6fn) or die("Cannot write $out6fn: $!\n");

# Start with all per-utterance scratch globals cleared.
reset_items();

## NEW ##
# Running identifier assigned to every matched utterance, in database order.
$unique_id=0;

# Main pass over the database lines.  Two kinds of lines are handled:
#   * utterance lines ("olang E lang E Prv <provider> "<text>""), which start
#     a new entry in the all_db hashes, and
#   * IF lines ("IF Prv ... c:" / "IF Prv ... a:"), which supply the speaker
#     side for the most recently seen utterance.
#
# $db_line is deliberately a package variable (not "my"): set_client_first
# and set_agent_first read it to store the IF line.
DB: foreach $db_line (@db_lines) {

    next DB if ( $db_line =~ m/^\s*$/ );         # skip line if blank
    next DB if ( $db_line =~ m/^.*comment.*$/ ); # skip line if comment

    if ($db_line =~ m/^(\d+\.\d+\.\d+).*\s+olang\s+E\s+lang\s+E\s+Prv\s+(.*)\s+\"(.*)\"\s*$/) {
        $num = $1;   # database number, e.g. 12.3.1
        $prv = $2;   # provider
        $utt = $3;   # text between the double quotes
#debug#
        print "UTT FOUND AT NUM $num $utt\n";

        $orig_utt=$utt;     # preserve original utterance here, then process below for comparison later

        $utt=~ s/^\s*//;    # remove spaces at beginning of utt
        $utt=~ s/\s*$//;    # remove spaces at end of utt

        # Leading/trailing angle and square brackets must survive the
        # punctuation stripping below, so temporarily shield them with a
        # word character ("a") on the outside.
        $utt=~ s/^\</a\</;  # add char to angle bracket, so it is not removed
        $utt=~ s/\>$/\>a/;  # add char to angle bracket, so it is not removed
        $utt=~ s/^\[/a\[/;  # add char to square bracket, so it is not removed
        $utt=~ s/\]$/\]a/;  # add char to square bracket, so it is not removed

        $utt=~ s/^\W*//;    # remove punctuation at beginning of utt
        $utt=~ s/\W*$//;    # remove punctuation at end of utt

        $utt=~ s/^a\</\</;  # remove char from angle bracket
        $utt=~ s/\>a$/\>/;  # remove char from angle bracket
        $utt=~ s/^a\[/\[/;  # remove char from square bracket
        # The pattern must be "]a" anchored at the end.  Written as "\]$a" it
        # interpolates the variable $a and never strips the shield character.
        $utt=~ s/\]a$/\]/;  # remove char from square bracket

        $utt=~ tr/A-Z/a-z/; # lowercase entire string

## NEW ##
## ASSIGN UNIQUE IDENTIFIER TO EACH UTT apart from db number, which doesn't sort properly ##
        $unique_id++;
        $all_db{$unique_id}=$utt;

        $all_db_prv{$unique_id}=$prv;

#debug#
        print "PROCESSED UTT $unique_id $utt\n";

        ## Duplicate detection (counting repeated utterances and skipping
        ## their IF lines) is intentionally NOT done when making .phx and
        ## .spk files: the entire dialog is kept, including duplicates.
    }

    if ($db_line =~ m/^(\d+\.\d+\.\d+).*\s+IF\s+Prv\s+.*\s+([ac])\:/) {
        $comp_num=$1;   # database number of the IF line
        $spk     =$2;   # speaker side: "c" (client) or "a" (agent)

        # Always matches: $comp_num was captured with the same d.d.d shape.
        $comp_num =~ m/(\d+)\.(\d+)\.(\d+)/;
        $num_split="$1 $2 $3";
#debug#
        print "NUM SPLIT $num_split\n\n";

        $all_db_num_split{$unique_id}=$num_split;

#debug#
        print "SPK FOUND: $spk\n";

        $all_db_spk{$unique_id}=$spk;

        ## Noise lines are intentionally NOT skipped when making .phx and
        ## .spk files, so that the output stays aligned with the database.

        if ( $prv=~/ATR/ ) { ## skip utterances provided by ATR
#debug#     print ">>>>>SKIP ATR UTT<<<<<\n";
            # NOTE(review): the utterance was already stored in the all_db
            # hashes above, so it is only kept out of the client/agent
            # hashes -- confirm that this is intended.
            $count{$unique_id}--;
            reset_items();
            next DB;
        }

        ## No duplicate skipping here either: every IF line is recorded for
        ## its speaker side.
        if ($spk=~/c/) {
            set_client_first();
        }
        elsif ($spk=~/a/) {
            set_agent_first();
        }

        next DB;
    }
}


# Write the collected utterances.  The hash keys are the integer ids assigned
# in database order, so they are sorted numerically; a plain string sort
# would give 1, 10, 11, ..., 2, ... and lose the database order.

foreach my $key (sort { $a <=> $b } keys %client) { ## print only for client utts
    print OUT "$cl_num_split{$key} $client{$key}\n";
    print OUT2 "c\n";
}

foreach my $key (sort { $a <=> $b } keys %agent) { ## print only for agent utts
    print OUT3 "$ag_num_split{$key} $agent{$key}\n";
    print OUT4 "a\n";
}

foreach my $key (sort { $a <=> $b } keys %all_db) { ## print for all utts
    print OUT5 "$all_db_num_split{$key} $all_db{$key}\n";
    print OUT6 "$all_db_num_split{$key} $all_db_spk{$key}\n";
}

# close() takes only the filehandle.  Buffered write errors surface here, so
# each failure is reported; warn (not die) so every handle still gets closed.
close(OUT)  or warn("Error closing client .phx output: $!\n");
close(OUT2) or warn("Error closing client .spk output: $!\n");
close(OUT3) or warn("Error closing agent .phx output: $!\n");
close(OUT4) or warn("Error closing agent .spk output: $!\n");
close(OUT5) or warn("Error closing all_db .phx output: $!\n");
close(OUT6) or warn("Error closing all_db .spk output: $!\n");

#----------------------------------------------------------
# END MAIN
#----------------------------------------------------------

# Clear the per-utterance scratch globals ($num, $prv, $utt, $orig_utt,
# $skip_num, $comp_num, $spk) by setting each of them to the empty string.
# Takes no arguments.
sub reset_items {
    $num = $prv = $utt = $orig_utt = $skip_num = $comp_num = $spk = "";
}

# Copy the current utterance's globals into the all_db hashes, keyed by
# $unique_id: original utterance text, the current database line, the split
# number, the provider and the speaker tag held in $local_spk.
#
# NOTE(review): every call to this sub in the visible file is commented out,
# and it fills %all_db_num whereas the output loop reads %all_db_num_split --
# verify both points before re-enabling it.
sub write_db_items {
    my $id = $unique_id;

    $all_db_prv{$id} = $prv;
    $all_db_num{$id} = $num_split;
    $all_db_if{$id}  = $db_line;
    $all_db{$id}     = $orig_utt;
    $all_db_spk{$id} = $local_spk;
}

## USE THIS SUBROUTINE BUT DO NOT SET $cl_first_seen
#
# Record the current utterance as a client-side utterance.
#
# Reads the globals $unique_id, $orig_utt, $db_line, $num and $prv and stores
# them in the client hashes (%client, %cl_if, %cl_num, %cl_prv,
# %cl_num_split) under $unique_id.  Also sets $num_split and $local_spk, and
# finally clears the per-utterance globals through reset_items().
#
# The sub returns normally instead of leaving through "next DB": jumping out
# of a subroutine with a loop-control statement is flagged by Perl ("Exiting
# subroutine via next"), and the caller inside the DB loop performs its own
# "next DB" immediately after the call, so control flow is unchanged.
sub set_client_first {
#    $cl_first_seen{$unique_id}=1;
    $client{$unique_id}=$orig_utt; ## hash of client utts only
    $cl_if{$unique_id}=$db_line;   ## hash of IFs for each client utt
    $cl_num{$unique_id}=$num;      ## hash of numbers for each client utt
    $cl_prv{$unique_id}=$prv;      ## hash of providers for each client utt

    # Turn "d.t.s" into "d t s".  The captures are taken from the match's
    # return list, so a $num that does not match yields an empty string
    # rather than leftover $1..$3 values from an earlier match.
    my @num_parts = ($num =~ m/(\d+)\.(\d+)\.(\d+)/);
    $num_split = join(" ", @num_parts);
#debug#
    print "NUM SPLIT $num_split\n\n";

    $cl_num_split{$unique_id}=$num_split;

    $local_spk="c";
#    &write_db_items;
    reset_items();
    return;
}

## USE THIS SUBROUTINE BUT DO NOT SET $ag_first_seen
#
# Record the current utterance as an agent-side utterance.
#
# Reads the globals $unique_id, $orig_utt, $db_line, $num and $prv and stores
# them in the agent hashes (%agent, %ag_if, %ag_num, %ag_prv, %ag_num_split)
# under $unique_id.  Also sets $num_split and $local_spk, and finally clears
# the per-utterance globals through reset_items().
#
# The sub returns normally instead of leaving through "next DB": jumping out
# of a subroutine with a loop-control statement is flagged by Perl ("Exiting
# subroutine via next"), and the caller inside the DB loop performs its own
# "next DB" immediately after the call, so control flow is unchanged.
sub set_agent_first {
#    $ag_first_seen{$unique_id}=1;
    $agent{$unique_id}=$orig_utt;  ## hash of agent utts only
    $ag_if{$unique_id}=$db_line;   ## hash of IFs for each agent utt
    $ag_num{$unique_id}=$num;      ## hash of numbers for each agent utt
    $ag_prv{$unique_id}=$prv;      ## hash of providers for each agent utt

    # Turn "d.t.s" into "d t s".  The captures are taken from the match's
    # return list, so a $num that does not match yields an empty string
    # rather than leftover $1..$3 values from an earlier match.
    my @num_parts = ($num =~ m/(\d+)\.(\d+)\.(\d+)/);
    $num_split = join(" ", @num_parts);
#debug#
    print "NUM SPLIT $num_split\n\n";

    $ag_num_split{$unique_id}=$num_split;

    $local_spk="a";
#    &write_db_items;
    reset_items();
    return;
}
