#! /usr/local/bin/perl5 -w
use strict;

# (18May2003, Chad Langley)
# This script extracts random samples of DAs from DB files and counts the
# number of DAs seen in each sample.  The DB files from which to extract
# samples, the sizes of samples from each DB file, and the number of samples
# per size and file can all be specified on the command line.  The seed for
# the random number generator can also be specified in order to allow for
# replication of a particular data set.  If the seed is unspecified, the time
# is used.

#------------------------------------------------------------------------------
# DEFAULTS
#------------------------------------------------------------------------------

# Seed for random number generator
my $seed = time;

# Number of folds for cross validation
my $samples = 10;

# Sizes of samples
my $sample_sizes = 500;

# Input directory
my $input_dir = ".";

# Output directory
my $output_dir = "";

# File containing complete data set
my $dbfiles = "";

# Base file name for sample data
my $sample_file = "";

# Format output for Excel?
my $excel = 0;

#------------------------------------------------------------------------------
# SHOW USAGE INFORMATION
#------------------------------------------------------------------------------
sub show_usage {
    print "
usage: $0 {options}

options:

  -seed <int>            ; Seed for random number generator
  -samples <int>         ; Number of samples per file and sample size
  -sample_sizes <list>   ; Comma-separated list of sample sizes

  -input_dir <path>      ; Directory containing input DB files
  -output_dir <path>     ; Directory in which to store output files
  -dbfiles <strings>     ; Comma-separated list of DB file names

  -excel <0|1>           ; Format sample output for Excel and save to file?
  -sample_file <string>  ; Base file name for output sample files

\n";
}

#------------------------------------------------------------------------------
# PROCESS COMMAND LINE ARGUMENTS
#------------------------------------------------------------------------------
while (@ARGV)
{
    if ($ARGV[0] eq '-h' || $ARGV[0] eq '--help' || $ARGV[0] eq '-help') {
	&show_usage();
	die("Done.\n");
    }
    elsif ($ARGV[0] eq '-seed') {
	shift;
	$seed = $ARGV[0];
	shift;	
    }
    elsif ($ARGV[0] eq '-samples') {
	shift;
	$samples = $ARGV[0];
	shift;	
    }
    elsif ($ARGV[0] eq '-sample_sizes') {
	shift;
	$sample_sizes = $ARGV[0];
	shift;	
    }
    elsif ($ARGV[0] eq '-input_dir') {
	shift;
	$input_dir = $ARGV[0];
	shift;	
    }
    elsif ($ARGV[0] eq '-output_dir') {
	shift;
	$output_dir = $ARGV[0];
	shift;	
    }
    elsif ($ARGV[0] eq '-dbfiles') {
	shift;
	$dbfiles = $ARGV[0];
	shift;	
    }
    elsif ($ARGV[0] eq '-excel') {
	shift;
	$excel = $ARGV[0];
	shift;	
    }
    elsif ($ARGV[0] eq '-sample_file') {
	shift;
	$sample_file = $ARGV[0];
	shift;	
    }
    else {
	warn "\n ... unknown option $ARGV[0] ...\n";
	shift;
    }
}

#------------------------------------------------------------------------------
# SUBROUTINES
#------------------------------------------------------------------------------
# Return an array with a (pseudo-)random sample of size N of elements from
# the input array.
sub pseudo_random_sample {
    my $n = shift(@_);
    my @in_array = @_;
    my @out_array;

    for my $i ( 1 .. $n ) {
	my $index = int(rand(@in_array));
	push(@out_array, splice(@in_array, $index, 1));
    }

    return(@out_array);
}

# Return the mean of values in an array
sub compute_mean {
    my @values = @_;
    my $n = scalar(@values);
    my $mean = 0.0;

    if ($n) {
	foreach my $value (@values) {
	    $mean += $value;
	}
	$mean /= $n;
    }

    return($mean);
}

# Return the standard deviation of numbers in an array
sub compute_std_dev {
    my @values = @_;
    my $n = scalar(@values);
    my $mean = &compute_mean(@values);

    my $variance = 0.0;
    foreach my $value (@values) {
	$variance += ($value - $mean) * ($value - $mean);
    }
    $variance /= ($n - 1);

    my $std_dev = sqrt($variance);

    return($std_dev)
}

#------------------------------------------------------------------------------
# MAIN SCRIPT
#------------------------------------------------------------------------------
# Verify that at least one DB file was specified.
if (!$dbfiles) {
    die("You must specify at least one DB file.\n");
}

# List of DB files
my @dbfiles = split(/,/, $dbfiles);

# List of sample sizes
my @sample_sizes = split(/,/, $sample_sizes);

# Create a name for the sample output file if not specified
if ($excel && !$sample_file) {
    $sample_file = join("+", @dbfiles);
}

# Create an output path if not specified
if (!$output_dir) {
    $output_dir = "$input_dir/samples";
}

# Print settings
print STDOUT "      Seed: $seed\n";
print STDOUT "   Samples: $samples\n";
print STDOUT " SDUs/file: $sample_sizes\n";
print STDOUT " Input Dir: $input_dir\n";
print STDOUT "Output Dir: $output_dir\n";

# Initialize the random number generator
srand($seed);

# Table of DAs in DB files
my %DAs;
my %SAs;
my %CSs;

# Tables of DA counts in DB files
my %DA_counts;
my %DAs_per_db;
my %SA_counts;
my %SAs_per_db;
my %CS_counts;
my %CSs_per_db;

# Read DAs from each DB file
print STDOUT "\nFiles:\n";

foreach my $file (@dbfiles) {
    open(FILE, "$input_dir/$file")
	|| die("ERROR: Unable to open file $input_dir/$file\n  $!\n");
    open(DAFILE, ">$output_dir/$file.DAs")
	|| die("ERROR: Unable to open DA file $output_dir/$file.DAs\n  $!\n");
    open(SAFILE, ">$output_dir/$file.SAs")
	|| die("ERROR: Unable to open SA file $output_dir/$file.SAs\n  $!\n");
    open(CSFILE, ">$output_dir/$file.CSs")
	|| die("ERROR: Unable to open CS file $output_dir/$file.CSs\n  $!\n");
    while (defined(my $line = <FILE>)) {
	next if $line !~ m/IF\s+Prv/o; #Skip non-IF lines
	next if $line =~ m/^[^.]+\.[^.]\.0\s+/o; #Skip comment lines
	next if $line =~ m/^.*\:\s*$/o; #Skip untagged lines
	next if $line =~ m/^.*empty.*$/o; #Skip 'empty' tags
	next if $line =~ m/^.*noise.*$/o; #Skip 'noise' tags
	next if $line =~ m/^.*no\-tag.*$/o; #Skip 'no-tag' tags
	next if $line =~ m/^.*descriptive.*$/o; #Skip 'descriptive' tags

	chomp($line);
	$line =~ s/^.*IF\s+Prv\s+\S+\s+[ac]://o; #Remove anthing before the DA
	$line =~ s/\s.*//o; #Remove anything after the DA

	# Separate DA,SA,CS
	my $da = $line;
	my $sa = $line;
	$sa =~ s/\+.*$//o;
	my $cs = $line;
	if ($cs =~ m/\+/o) {
	    $cs =~ s/^[^+]*//o;
	}
	else {
	    $cs = "NO_CONCEPTS";
	}

	push(@{$DAs{$file}}, $da);
	push(@{$SAs{$file}}, $sa);
	push(@{$CSs{$file}}, $cs);

	print DAFILE "$da\n";
	print SAFILE "$sa\n";
	print CSFILE "$cs\n";

	# Increment DA counts
	if (!defined($DA_counts{$da}{$file})) {
	    $DA_counts{$da}{$file} = 1;
	    $DAs_per_db{$file} += 1.0;
	}
	else {
	    $DA_counts{$da}{$file} += 1;
	}

	# Increment SA counts
	if (!defined($SA_counts{$sa}{$file})) {
	    $SA_counts{$sa}{$file} = 1;
	    $SAs_per_db{$file} += 1.0;
	}
	else {
	    $SA_counts{$sa}{$file} += 1;
	}

	# Increment CS counts
	if (!defined($CS_counts{$cs}{$file})) {
	    $CS_counts{$cs}{$file} = 1;
	    $CSs_per_db{$file} += 1.0;
	}
	else {
	    $CS_counts{$cs}{$file} += 1;
	}
    }
    close(FILE);
    close(DAFILE);
    close(SAFILE);
    close(CSFILE);
    print STDOUT " $file\n";
    print STDOUT sprintf("  SDUs: %d\n", scalar(@{$DAs{$file}}));
    print STDOUT sprintf("   DAs: %d\n", $DAs_per_db{$file});
    print STDOUT sprintf("   SAs: %d\n", $SAs_per_db{$file});
    print STDOUT sprintf("   CSs: %d\n", $CSs_per_db{$file});
}
print STDOUT "\n";

# Compute overlap among DB files
my @all_DAs = sort(keys(%DA_counts));
my @all_SAs = sort(keys(%SA_counts));
my @all_CSs = sort(keys(%CS_counts));

print STDOUT " Combined DB:\n";
print STDOUT sprintf("   DAs: %d\n", scalar(@all_DAs));
print STDOUT sprintf("   SAs: %d\n", scalar(@all_SAs));
print STDOUT sprintf("   CSs: %d\n", scalar(@all_CSs));
print STDOUT "\n";

my @overlap_DAs;
my @overlap_SAs;
my @overlap_CSs;

DA: foreach my $da (@all_DAs) {
    foreach my $file (@dbfiles) {
	if (!defined($DA_counts{$da}{$file})) {
	    next DA;
	}
    }
    push(@overlap_DAs, $da);
}
open(OVERLAP_DAS, ">$output_dir/$sample_file.overlap-DAs")
    || die("ERROR: Unable to open DA overlap file $output_dir/$sample_file.overlap-DAs\n  $!\n");
foreach my $da (@overlap_DAs) {
    print OVERLAP_DAS "$da\n";
}
close(OVERLAP_DAS);

SA: foreach my $sa (@all_SAs) {
    foreach my $file (@dbfiles) {
	if (!defined($SA_counts{$sa}{$file})) {
	    next SA;
	}
    }
    push(@overlap_SAs, $sa);
}
open(OVERLAP_SAS, ">$output_dir/$sample_file.overlap-SAs")
    || die("ERROR: Unable to open SA overlap file $output_dir/$sample_file.overlap-SAs\n  $!\n");
foreach my $sa (@overlap_SAs) {
    print OVERLAP_SAS "$sa\n";
}
close(OVERLAP_SAS);

CS: foreach my $cs (@all_CSs) {
    foreach my $file (@dbfiles) {
	if (!defined($CS_counts{$cs}{$file})) {
	    next CS;
	}
    }
    push(@overlap_CSs, $cs);
}
open(OVERLAP_CSS, ">$output_dir/$sample_file.overlap-CSs")
    || die("ERROR: Unable to open CS overlap file $output_dir/$sample_file.overlap-CSs\n  $!\n");
foreach my $cs (@overlap_CSs) {
    print OVERLAP_CSS "$cs\n";
}
close(OVERLAP_CSS);

print STDOUT "DB Overlap:\n";
print STDOUT sprintf(" DAs: %d\n", scalar(@overlap_DAs));
print STDOUT sprintf(" SAs: %d\n", scalar(@overlap_SAs));
print STDOUT sprintf(" CSs: %d\n", scalar(@overlap_CSs));

print STDOUT "Coverage of Overlap Set:\n";
foreach my $file (@dbfiles) {
    print STDOUT " $file\n";

    my $da_overlap_sdus = 0.0;
    my $sa_overlap_sdus = 0.0;
    my $cs_overlap_sdus = 0.0;

    foreach my $da (@overlap_DAs) {
	$da_overlap_sdus += $DA_counts{$da}{$file};
    }
    foreach my $sa (@overlap_SAs) {
	$sa_overlap_sdus += $SA_counts{$sa}{$file};
    }
    foreach my $cs (@overlap_CSs) {
	$cs_overlap_sdus += $CS_counts{$cs}{$file};
    }

    print STDOUT sprintf("  DAs: %4.2f%% (%d/%d)\n",
			 100.0 * ($da_overlap_sdus / scalar(@{$DAs{$file}})),
			 $da_overlap_sdus,
			 scalar(@{$DAs{$file}})
			);
    print STDOUT sprintf("  SAs: %4.2f%% (%d/%d)\n",
			 100.0 * ($sa_overlap_sdus / scalar(@{$SAs{$file}})),
			 $sa_overlap_sdus,
			 scalar(@{$SAs{$file}})
			);
    print STDOUT sprintf("  CSs: %4.2f%% (%d/%d)\n",
			 100.0 * ($cs_overlap_sdus / scalar(@{$CSs{$file}})),
			 $cs_overlap_sdus,
			 scalar(@{$CSs{$file}})
			);
}
print STDOUT "\n";

# Verify that all files can accommodate all sample sizes.
foreach my $size (@sample_sizes) {
    foreach my $file (@dbfiles) {
	my $file_sdus = scalar(@{$DAs{$file}});
	if ($size > $file_sdus) {
	    die("$file\ncontains only $file_sdus SDUs.  Can't extract sample of size $size.\n");
	}
    }
}


if ($excel) {
    open(DA_SAMPLE_FILE, ">$output_dir/$sample_file.da-samples")
	|| die("ERROR: Unable to open sample output file $output_dir/$sample_file.da-samples\n  $!\n");
    open(SA_SAMPLE_FILE, ">$output_dir/$sample_file.sa-samples")
	|| die("ERROR: Unable to open sample output file $output_dir/$sample_file.sa-samples\n  $!\n");
    open(CS_SAMPLE_FILE, ">$output_dir/$sample_file.cs-samples")
	|| die("ERROR: Unable to open sample output file $output_dir/$sample_file.cs-samples\n  $!\n");

    print DA_SAMPLE_FILE "Sample Size";
    print SA_SAMPLE_FILE "Sample Size";
    print CS_SAMPLE_FILE "Sample Size";

    for my $i ( 1 .. $samples ) {
	print DA_SAMPLE_FILE ";";
	print SA_SAMPLE_FILE ";";
	print CS_SAMPLE_FILE ";";
    }

    print DA_SAMPLE_FILE ";Mean;StdDev";
    print SA_SAMPLE_FILE ";Mean;StdDev";
    print CS_SAMPLE_FILE ";Mean;StdDev";

    print DA_SAMPLE_FILE "\n";
    print SA_SAMPLE_FILE "\n";
    print CS_SAMPLE_FILE "\n";
}

foreach my $size (@sample_sizes) {
    my $sample_size = $size * scalar(@dbfiles);
    if ($excel) {
	print DA_SAMPLE_FILE "$sample_size";
	print SA_SAMPLE_FILE "$sample_size";
	print CS_SAMPLE_FILE "$sample_size";
    }
    else {
	print STDOUT "Sample Size: $sample_size\n";
    }

    my @uniq_DAs = ();
    my @uniq_SAs = ();
    my @uniq_CSs = ();

    for my $sample ( 1 .. $samples ) {
	my %sample_DA_counts = ();
	my %sample_SA_counts = ();
	my %sample_CS_counts = ();
	foreach my $file (@dbfiles) {
	    my @sample_DAs = &pseudo_random_sample($size,@{$DAs{$file}});
	    foreach my $da (@sample_DAs) {
		if (!defined($sample_DA_counts{$da})) {
		    $sample_DA_counts{$da} = 1;
		}
		else {
		    $sample_DA_counts{$da} += 1;
		}
	    }
	    my @sample_SAs = &pseudo_random_sample($size,@{$SAs{$file}});
	    foreach my $sa (@sample_SAs) {
		if (!defined($sample_SA_counts{$sa})) {
		    $sample_SA_counts{$sa} = 1;
		}
		else {
		    $sample_SA_counts{$sa} += 1;
		}
	    }
	    my @sample_CSs = &pseudo_random_sample($size,@{$CSs{$file}});
	    foreach my $cs (@sample_CSs) {
		if (!defined($sample_CS_counts{$cs})) {
		    $sample_CS_counts{$cs} = 1;
		}
		else {
		    $sample_CS_counts{$cs} += 1;
		}
	    }
	}
	push(@uniq_DAs, scalar(keys(%sample_DA_counts)));
	push(@uniq_SAs, scalar(keys(%sample_SA_counts)));
	push(@uniq_CSs, scalar(keys(%sample_CS_counts)));
    }
    if ($excel) {
	print DA_SAMPLE_FILE ";", join(";", @uniq_DAs);
	print DA_SAMPLE_FILE ";", &compute_mean(@uniq_DAs);
	print DA_SAMPLE_FILE ";", &compute_std_dev(@uniq_DAs);
	print DA_SAMPLE_FILE "\n";

	print SA_SAMPLE_FILE ";", join(";", @uniq_SAs);
	print SA_SAMPLE_FILE ";", &compute_mean(@uniq_SAs);
	print SA_SAMPLE_FILE ";", &compute_std_dev(@uniq_SAs);
	print SA_SAMPLE_FILE "\n";

	print CS_SAMPLE_FILE ";", join(";", @uniq_CSs);
	print CS_SAMPLE_FILE ";", &compute_mean(@uniq_CSs);
	print CS_SAMPLE_FILE ";", &compute_std_dev(@uniq_CSs);
	print CS_SAMPLE_FILE "\n";
    }
    else {
	print STDOUT "DAs:\n";
	print STDOUT sprintf("Mean: %.2f\n", &compute_mean(@uniq_DAs));
	print STDOUT sprintf("StdDev: %.2f\n", &compute_std_dev(@uniq_DAs));
	print STDOUT join("\n", @uniq_DAs), "\n";
	print STDOUT "\n";

	print STDOUT "SAs:\n";
	print STDOUT sprintf("Mean: %.2f\n", &compute_mean(@uniq_SAs));
	print STDOUT sprintf("StdDev: %.2f\n", &compute_std_dev(@uniq_SAs));
	print STDOUT join("\n", @uniq_SAs), "\n";
	print STDOUT "\n";

	print STDOUT "CSs:\n";
	print STDOUT sprintf("Mean: %.2f\n", &compute_mean(@uniq_CSs));
	print STDOUT sprintf("StdDev: %.2f\n", &compute_std_dev(@uniq_CSs));
	print STDOUT join("\n", @uniq_CSs), "\n";
	print STDOUT "\n";
    }
}

if ($excel) {
    close(DA_SAMPLE_FILE);
    close(SA_SAMPLE_FILE);
    close(CS_SAMPLE_FILE);
}
