#!/usr/local/bin/perl5 -w

# This script combines data files whose lines have the form:
# description: <(list of) numbers>
# One number is assumed to be a count
# Two numbers are assumed to be a mean and a stdev
#             (the count is assumed to be the first count in the file)
# Three numbers are assumed to be a count, a mean, and a stdev
# Four numbers are assumed to be a count, a mean, a variance, and a stdev
# Five numbers are assumed to be a count, a mean, a stdev, a mean, and a stdev

use Statistics::Descriptive;

#&test_update_stdev; die "testing";

# Accumulated statistics, kept as package globals because the output
# section further down reads them.
#   %valhash   maps each description to a reference to its value list:
#              (count), (count, mean, stdev) or
#              (count, mean, stdev, mean, stdev)
#   @keys      descriptions in the order they were first seen
#   $num_files number of input files named on the command line
%valhash = ();
@keys = ();

$usage = "$0 <list of data files>";

die $usage if (@ARGV == 0);

$num_files = @ARGV;

foreach my $fn (@ARGV)
  {
    # Three-argument open on a lexical handle, so that no character in the
    # file name can be interpreted as part of the open mode.
    open(my $fh, '<', $fn) or die "Could not open file '$fn': $!";

    # Count of the first counted entry in this file; it is used as the count
    # of any two-number (mean, stdev) line in the same file.
    my $filecount;

    while (my $line = <$fh>)
      {
        chomp $line;
        $line =~ s/\#.*//;              # strip comments
        next if $line =~ /^\s*$/;       # skip lines that are now blank
        if ($line !~ /^(.*?)\s*\:\s*(.*)$/)
          {
            warn "Did not understand line '$line' of file '$fn'";
            next;
          }
        my $name = $1;
        my @vals = split /\s+/, $2;

        # just cut out the variance if we get four values
        @vals = @vals[0..1,3] if (@vals == 4);

        if (!defined($filecount) &&
            (@vals == 1 || @vals == 3 || @vals == 5))
          {
            $filecount = $vals[0];
          }

        # A bare (mean, stdev) pair borrows the file's count.
        if (@vals == 2)
          {
            die "Got a two element line '$line' before setting filecount in '$fn'"
              if (!defined($filecount));
            unshift @vals, $filecount;
          }

        # First sighting of this description: just remember the values.
        if (!exists $valhash{$name})
          {
            $valhash{$name} = [ @vals ];
            push @keys, $name;
            next;
          }

        my $listref = $valhash{$name};
        die "Size mismatch on '$line' in '$fn'" if (@vals != @$listref);

        my ($old_n, $new_n) = ($listref->[0], $vals[0]);
        my $total = $old_n + $new_n;

        # Merge each (mean, stdev) pair; the pairs live at indices (1, 2)
        # and (3, 4) of the value list.
        foreach my $pair ([1, 2], [3, 4])
          {
            my ($mi, $si) = @$pair;
            last if ($mi > $#vals);

            # update_stdev needs the mean of the data merged so far, so it
            # has to be saved before the combined mean overwrites it.
            my $old_mean = $listref->[$mi];

            $listref->[$mi] = ($old_n * $old_mean + $new_n * $vals[$mi]) / $total
              if ($total != 0);
            $listref->[$si] = update_stdev($old_n, $old_mean, $listref->[$si],
                                           $new_n, $vals[$mi], $vals[$si])
              if ($si <= $#vals);
          }

        # update the count
        $listref->[0] = $total;
      }
    close($fh);
  }


# Write the combined table to standard output: a three-line provenance
# header (script, command line, working directory) followed by one line per
# description in first-seen order.
chomp($pwd = `pwd`);
$filelist = join(' ', @ARGV);
print "# This file was generated by combine_datafiles.pl\n",
      "# It was generated by this command line: $0 $filelist\n",
      "# in this directory: $pwd\n";

# $totalcount holds the count of the first entry; only the disabled
# "% total" output below would read it.
$totalcount = undef;
foreach $key (@keys)
  {
    my $stats = $valhash{$key};
    my $count = $stats->[0];
    $totalcount = $count unless defined($totalcount);
    #print "# % total = " . ($valhash{$key}->[0] / $totalcount) . "\n";
    my $per_file = $count / $num_files;
    print "${key}: " . join(' ', @$stats) . " # avg per file = " . $per_file . "\n";
  }


# Combine the sample standard deviations of two disjoint data sets.
#
# Arguments: ($n1, $mean1, $stdev1, $n2, $mean2, $stdev2) -- the count, the
# mean and the sample (n-1 denominator) standard deviation of each set.
# $mean1 and $mean2 must be the means of the individual sets, not a mean that
# has already been combined.
#
# Returns the sample standard deviation of the union of the two sets.  If
# either count is zero the other set's standard deviation is returned
# unchanged.  Returns 0 when the combined count leaves no degrees of freedom.
sub update_stdev
  {
    my ($n1, $mean1, $stdev1, $n2, $mean2, $stdev2) = @_;
    return $stdev2 if ($n1 == 0);
    return $stdev1 if ($n2 == 0);

    # Reconstruct each set's sum and sum of squares from its summary
    # statistics: stdev**2 = (n * sumsq - sum**2) / (n * (n - 1)).
    my ($sum1, $sum2) = ($n1 * $mean1, $n2 * $mean2);
    my ($sumsq1, $sumsq2) =
      ( ($stdev1**2 * $n1 * ($n1 - 1) + $sum1**2) / $n1,
        ($stdev2**2 * $n2 * ($n2 - 1) + $sum2**2) / $n2 );
    my ($n, $sum, $sumsq) = ($n1 + $n2, $sum1 + $sum2, $sumsq1 + $sumsq2);

    # No degrees of freedom left: avoid dividing by zero.
    my $denom = $n * ($n - 1);
    return 0 if ($denom <= 0);

    # Rounding can push the numerator a hair below zero when the merged data
    # has no spread; sqrt of a negative number is fatal in Perl, so clamp.
    my $variance = ($n * $sumsq - $sum**2) / $denom;
    $variance = 0 if ($variance < 0);
    return sqrt($variance);
  }

# Manual self-check for update_stdev; it runs only when the commented-out
# call near the top of the file is enabled.
#
# Generates 100 random values (the k-th value is rand() scaled by k), splits
# them at every position from 10 to 89, and for each split prints the left,
# right and overall standard deviations reported by Statistics::Descriptive
# followed by the value update_stdev derives from the two halves.  The last
# two numbers on each line should agree.
sub test_update_stdev
  {
    my $len = 100;
    my $scale = 0;
    my @vals = map { rand() * ++$scale } ((0) x $len);

    my $stat_all = Statistics::Descriptive::Sparse->new();
    for my $split (10 .. $len - 11)
      {
        my $stat_left = Statistics::Descriptive::Sparse->new();
        $stat_left->add_data(@vals[0..$split]);
        my $stat_right = Statistics::Descriptive::Sparse->new();
        $stat_right->add_data(@vals[$split+1..$len-1]);
        $stat_all = Statistics::Descriptive::Sparse->new();
        $stat_all->add_data(@vals);

        my $stdev_left  = $stat_left->standard_deviation();
        my $stdev_right = $stat_right->standard_deviation();
        my $stdev_all   = $stat_all->standard_deviation();
        my $stdev_upd   =
          update_stdev($stat_left->count(), $stat_left->mean(), $stdev_left,
                       $stat_right->count(), $stat_right->mean(), $stdev_right);
        print "test: $stdev_left $stdev_right $stdev_all $stdev_upd\n";
      }
  }
