#!/usr/local/bin/perl5
# /afs/cs.cmu.edu/project/nnspeech-6/Nespole/Data/cstar2/get-utt4phx.pl
# May 2001 by Dorcas Wallace
# extract utt's and IFs for retagging sentences from the database
# sorted by client and agent speaker sides
# covers only utterances = "olang E lang E Prv CMU" (also possible to capture different Prv, though)
#---------------------------------------------------------------------------
# SAMPLE CALL
#
# > ./get-utt4phx.pl /afs/cs.cmu.edu/project/nnspeech-5/c-star/db/cstar-examples.db
# PRODUCES .phx and .spk files (LATER may add .tagged file)
#
# An optional second argument overrides the directory the output files are
# written to; without it the historical scenario-a directory is used.
#
# SORT
# > sort -n +0 +1 +2 all_db.phx.unsorted > ! all_db.phx
#
#---------------------------------------------------------------------------

use strict;
use warnings;

#------------------------------------------------------------------------------
# PROCESS COMMAND LINE ARGUMENTS
#------------------------------------------------------------------------------

my $DEFAULT_OUT_DIR =
    '/afs/cs.cmu.edu/project/nnspeech-6/Nespole/Data/recorded/English/devtest/scenario-a';

# use /afs/cs.cmu.edu/project/nnspeech-5/c-star/db/cstar-examples.db as input file
my $dbfile = shift @ARGV or die("You must specify an input file.\n");
my $out_dir = @ARGV ? shift @ARGV : $DEFAULT_OUT_DIR;

#------------------------------------------------------------------------------
# OUTPUT FILES
#------------------------------------------------------------------------------

# Extract base file name: drop the directory part and a trailing ".db".
my $filename = $dbfile;
$filename =~ s/^.*\///;
$filename =~ s/\.db$//;

#------------------------------------------------------------------------------
# SHARED STATE (read and written by the subs at the bottom of the file)
#------------------------------------------------------------------------------

# Fields of the utterance currently waiting for its IF line.
my ( $num, $prv, $utt, $orig_utt, $skip_num, $comp_num, $spk );
my ( $num_split, $local_spk, $db_line );

# Sequential id given to every matched utterance; the db number itself
# does not sort properly, so this preserves database order.
my $unique_id = 0;

# All hashes are keyed by $unique_id.
my ( %all_db, %all_db_prv, %all_db_num_split, %all_db_spk, %all_db_if, %all_db_num );
my ( %client, %cl_if, %cl_num, %cl_prv, %cl_num_split );
my ( %agent,  %ag_if, %ag_num, %ag_prv, %ag_num_split );

#------------------------------------------------------------------------------
# MAIN
#------------------------------------------------------------------------------

open( my $db_fh, '<', $dbfile ) or die("Cannot open $dbfile: $!\n");

# Open all outputs before parsing so an unwritable directory fails early.
my $client_phx = open_output('client.phx');
my $client_spk = open_output('client.spk');
my $agent_phx  = open_output('agent.phx');
my $agent_spk  = open_output('agent.spk');
my $all_phx    = open_output('all_db.phx');
my $all_spk    = open_output('all_db.spk');

reset_items();

# Assign into the file-level $db_line (rather than a foreach alias) so the
# subs below see the current line.
DB: while ( defined( $db_line = <$db_fh> ) ) {
    next DB if $db_line =~ m/^\s*$/;      # skip line if blank
    next DB if $db_line =~ m/comment/;    # skip line if comment

    # Utterance line: <n.n.n> ... olang E lang E Prv <provider> "<utt>"
    if ( $db_line =~ m/^(\d+\.\d+\.\d+).*\s+olang\s+E\s+lang\s+E\s+Prv\s+(.*)\s+\"(.*)\"\s*$/ ) {
        ( $num, $prv, $orig_utt ) = ( $1, $2, $3 );

        # Keep the original utterance for the client/agent files; the
        # normalized form goes into the all_db files.
        $utt = normalize_utt($orig_utt);

        $unique_id++;
        $all_db{$unique_id}     = $utt;
        $all_db_prv{$unique_id} = $prv;

        # Duplicate elimination is intentionally NOT done here: the .phx and
        # .spk files keep the entire dialog including duplicates.
    }

    # IF line: <n.n.n> ... IF Prv <provider> <c|a>:...
    if ( $db_line =~ m/^(\d+\.\d+\.\d+).*\s+IF\s+Prv\s+.*\s+([ac])\:/ ) {
        my ( $if_num, $if_spk ) = ( $1, $2 );

        # An IF line with no pending utterance (none seen yet, or the last
        # one was already consumed) must not overwrite the records of the
        # previous utterance with empty values.
        next DB if $num eq "";

        $comp_num  = $if_num;
        $spk       = $if_spk;
        $num_split = split_db_number($comp_num);

        # Recorded before the provider check below, so ATR utterances still
        # appear in the all_db files.
        $all_db_num_split{$unique_id} = $num_split;
        $all_db_spk{$unique_id}       = $spk;

        # Noise lines are intentionally NOT skipped, so the output stays
        # aligned with the database.

        if ( $prv =~ /ATR/ ) {    # skip utterances provided by ATR
            reset_items();
            next DB;
        }

        if ( $spk eq 'c' ) {
            set_client_first();
        }
        elsif ( $spk eq 'a' ) {
            set_agent_first();
        }
    }
}
close($db_fh);

# Ids are numeric, so sort them numerically to keep database order.
foreach my $key ( sort { $a <=> $b } keys %client ) {    # print only for client utts
    print {$client_phx} "$cl_num_split{$key} $client{$key}\n";
    print {$client_spk} "c\n";
}

foreach my $key ( sort { $a <=> $b } keys %agent ) {     # print only for agent utts
    print {$agent_phx} "$ag_num_split{$key} $agent{$key}\n";
    print {$agent_spk} "a\n";
}

foreach my $key ( sort { $a <=> $b } keys %all_db ) {    # print for all utts
    # An utterance that never got an IF line has no number/speaker recorded;
    # print empty fields for it.
    my $split = defined $all_db_num_split{$key} ? $all_db_num_split{$key} : "";
    my $side  = defined $all_db_spk{$key}       ? $all_db_spk{$key}       : "";
    print {$all_phx} "$split $all_db{$key}\n";
    print {$all_spk} "$split $side\n";
}

# Buffered write errors only surface at close, so check every handle.
foreach my $fh ( $client_phx, $client_spk, $agent_phx, $agent_spk, $all_phx, $all_spk ) {
    close($fh) or die("Error closing output file: $!\n");
}

#----------------------------------------------------------
# END MAIN
#----------------------------------------------------------

# Open "<out_dir>/<filename>.<suffix>.unsorted" for writing and return the
# handle; dies with the system error if the file cannot be created.
sub open_output {
    my ($suffix) = @_;
    my $path = "$out_dir/$filename.$suffix.unsorted";
    open( my $fh, '>', $path ) or die("Cannot write $path: $!\n");
    return $fh;
}

# Turn a db number "n.n.n" into "n n n"; returns "" when the argument does
# not contain such a number.
sub split_db_number {
    my ($db_num) = @_;
    return $db_num =~ m/(\d+)\.(\d+)\.(\d+)/ ? "$1 $2 $3" : "";
}

# Return the utterance trimmed, stripped of leading/trailing punctuation and
# lowercased.  A leading "<" or "[" and a trailing ">" or "]" are kept: a
# temporary "a" is attached to them so the \W stripping stops there, and is
# removed again afterwards.
sub normalize_utt {
    my ($text) = @_;

    $text =~ s/^\s*//;        # remove spaces at beginning of utt
    $text =~ s/\s*$//;        # remove spaces at end of utt

    $text =~ s/^</a</;        # add char to angle bracket, so it is not removed
    $text =~ s/>$/>a/;
    $text =~ s/^\[/a\[/;      # add char to square bracket, so it is not removed
    $text =~ s/\]$/\]a/;

    $text =~ s/^\W*//;        # remove punctuation at beginning of utt
    $text =~ s/\W*$//;        # remove punctuation at end of utt

    $text =~ s/^a</</;        # remove char from angle bracket
    $text =~ s/>a$/>/;
    $text =~ s/^a\[/\[/;      # remove char from square bracket
    $text =~ s/\]a$/\]/;

    $text =~ tr/A-Z/a-z/;     # lowercase entire string
    return $text;
}

# Clear the per-utterance fields so the next IF line cannot pick up values
# from an utterance that has already been handled.
sub reset_items {
    $num      = "";
    $prv      = "";
    $utt      = "";
    $orig_utt = "";
    $skip_num = "";
    $comp_num = "";
    $spk      = "";
    return;
}

# Store the current utterance (original text), IF line, number, provider and
# speaker in the all_db hashes.  Currently not called anywhere.
sub write_db_items {
    $all_db{$unique_id}     = $orig_utt;
    $all_db_if{$unique_id}  = $db_line;
    $all_db_num{$unique_id} = $num_split;
    $all_db_prv{$unique_id} = $prv;
    $all_db_spk{$unique_id} = $local_spk;
    return;
}

## USE THIS SUBROUTINE BUT DO NOT SET $cl_first_seen
# Record the pending utterance as a client utterance, then clear the pending
# fields.  The caller moves on to the next db line.
sub set_client_first {
    $client{$unique_id} = $orig_utt;    ## hash of client utts only
    $cl_if{$unique_id}  = $db_line;     ## hash of IFs for each client utt
    $cl_num{$unique_id} = $num;         ## hash of numbers for each client utt
    $cl_prv{$unique_id} = $prv;         ## hash of providers for each client utt

    $num_split = split_db_number($num);
    $cl_num_split{$unique_id} = $num_split;

    $local_spk = "c";
    reset_items();
    return;
}

## USE THIS SUBROUTINE BUT DO NOT SET $ag_first_seen
# Record the pending utterance as an agent utterance, then clear the pending
# fields.  The caller moves on to the next db line.
sub set_agent_first {
    $agent{$unique_id}  = $orig_utt;    ## hash of agent utts only
    $ag_if{$unique_id}  = $db_line;     ## hash of IFs for each agent utt
    $ag_num{$unique_id} = $num;         ## hash of numbers for each agent utt
    $ag_prv{$unique_id} = $prv;         ## hash of providers for each agent utt

    $num_split = split_db_number($num);
    $ag_num_split{$unique_id} = $num_split;

    $local_spk = "a";
    reset_items();
    return;
}