#!/usr/local/bin/perl5

# INPUT: one database file, followed by two 3-letter lang codes, corresponding to db codes (ENG, SPA, CAT)
#        first lang code is for the olang, second lang code is for the trans lang
#        note: only the "trans lang" may have multiple lines per SDU number


my $dbfile = shift || die("You must specify a db file.\n");

my $olang = shift;
my $trans = shift;

my $a_spk = "a";
my $c_spk = "c";

my $out1fn = ">$dbfile.agent-trans-$trans.goes-with-olang-$olang.txt";
open(OUT1, $out1fn);

my $out2fn = ">$dbfile.client-trans-$trans.goes-with-olang-$olang.txt";
open(OUT2, $out2fn);

my $out3fn = ">$dbfile.agent-olang-$olang.goes-with-trans-$trans.txt";
open(OUT3, $out3fn);

my $out4fn = ">$dbfile.client-olang-$olang.goes-with-trans-$trans.txt";
open(OUT4, $out4fn);

my $out5fn = ">$dbfile.agent-olang-$olang.goes-with-trans-$trans.txt.blank-lines";
open(OUT5, $out5fn);

my $out6fn = ">$dbfile.client-olang-$olang.goes-with-trans-$trans.txt.blank-lines";
open(OUT6, $out6fn);

open (DBF, "$dbfile") || die("$dbfile does not exist.\n");
my @db_lines = <DBF>;

print "\nInput File: \n>$dbfile\n";

print "\nOutput File: \n$out1fn\n";
print "\nOutput File: \n$out2fn\n";
print "\nOutput File: \n$out3fn\n";
print "\nOutput File: \n$out4fn\n\n";
print "\nOutput File: \n$out5fn\n\n";
print "\nOutput File: \n$out6fn\n\n";

close(DBF);

$sdu_num=0;
$prev_sdu_num=0;

my %sdus;

$ifcount=0;
$a_ifcount=0;
$c_ifcount=0;

my @a_line_num;
my @c_line_num;

$a_trans=0;
$c_trans=0;
$a_olang=0;
$c_olang=0;

HASH: foreach $db_line (@db_lines) {
    if ( $db_line =~ m/^(\d+\.\d+\.\d+).*$/ ) {
	$sdu_num = $1;
	if ( $sdu_num eq $prev_sdu_num ||
	    $prev_sdu_num eq "0" ) {
	    push (@sdus, $db_line);
	    $prev_sdu_num = $1;
	}
	else {
	    $sdus{$prev_sdu_num} = [ @sdus ];
	    @sdus=();
	    push (@sdus, $db_line);
	    $prev_sdu_num = $1;
	}
    }
}
$sdus{$prev_sdu_num} = [ @sdus ];
@sdus=();


COUNT: foreach $db_line (@db_lines) {
    if ( $db_line =~ m/^.*\:\s*$/ ||
	$db_line =~ m/^.*empty.*$/ ||
	$db_line =~ m/^.*no\-tag.*$/ ||
	$db_line =~ m/^.*noise.*$/ ||
	$db_line =~ m/^.*descriptive.*$/ ||
	$db_line =~ m/^.*IF2.*$/ ) { # don't count this type of input
	next COUNT;
    }

    if ( $db_line =~ m/^(\d+\.\d+\.\d+).*IF\s\sPrv.*$/ ) {
	$ifcount += 1;
    }

    if ( $db_line =~ m/^(\d+\.\d+\.\d+).*a\:(.*)\s*$/ ) {
	$a_ifcount += 1;
	push (@a_line_num, $1);
    }
    elsif ( $db_line =~ m/^(\d+\.\d+\.\d+).*c\:(.*)\s*$/ ) {
	$c_ifcount += 1;
	push (@c_line_num, $1);
    }
    else {
	next COUNT;
    }

}

SORT_A: foreach $a_line (@a_line_num) {
    sort_lines ($a_spk, $a_line);
}

SORT_C: foreach $c_line (@c_line_num) {
    sort_lines ($c_spk, $c_line);
}

print "\n total IF count: $ifcount\n";
print " agent IF count: $a_ifcount\n";
print "client IF count: $c_ifcount\n\n";

if ( $a_olang ne $a_trans ) {
    print "\nWARNING: $a_olang lines in a:olang file does not match $a_trans lines in a:trans file\n\n";
}

if ( $c_olang ne $c_trans ) {
    print "\nWARNING: $c_olang lines in c:olang file does not match $c_trans lines in c:trans file\n\n";
}


# END MAIN
#---------------------------------------------------

sub member {
  my $item = shift;
  my @list = @_;
  
  for (my $i = 0; $i < $#list + 1; $i ++) {
    if ($item eq $list[$i]) {
      return 1;
    }
  }
  return 0;
}

sub length_of_array {
  my @array = @_;
  
  return $#array + 1;
}


# SORT_A: foreach $a_line (@a_line_num) {
#     sort_lines ($a_spk, $a_line);
#     }

sub sort_lines {
    my $spk = shift;
    my $line = shift;
    $line_count=0;

    foreach $db_line ( @{ $sdus{$line} } ) {

	if ( $db_line =~ m/^.*\s\slang\s$olang.*\"(.*)\".*$/ ) {
	    $olang_line = $1;
	    $olang_line =~ s/\<//g;
	    $olang_line =~ s/\>//g;
	    if ( $spk eq "a" ) {
		print OUT3 "$olang_line\n";
		print OUT5 "\n";
		$a_olang += 1;
	    }
	    elsif ( $spk eq "c" ) {
		print OUT4 "$olang_line\n";
		print OUT6 "\n";
		$c_olang += 1;
	    }
	}
	if ( $db_line =~ m/^.*\s\slang\s$trans.*\"(.*)\".*$/ ) {
	    if ( $spk eq "a" ) {
		print OUT1 "$1\n";
		$a_trans += 1;
	    }
	    elsif ( $spk eq "c" ) {
		print OUT2 "$1\n";
		$c_trans += 1;
	    }
	    $trans_line = $1;
	    if ( $trans_line eq "" ) {
		print "\nWARNING: trans $line is blank line\n\n";
	    }
	    $line_count += 1;
	    if ( $line_count > 1 ) {
		if ( $spk eq "a" ) {
		    print OUT3 "$olang_line\n";
		    print OUT5 "\n";
		    $a_olang += 1;
		}
		elsif ( $spk eq "c" ) {
		    print OUT4 "$olang_line\n";
		    print OUT6 "\n";
		    $c_olang += 1;
		}
	    }
	}
    }
}

