#!/usr/local/bin/perl5 -w
#
# Purpose:
#  To provide some basic statistics/access utils for arrays.
#
##################################################################
# $Id: ArrayFoo.pm,v 1.6 2002/09/19 10:47:13 lw2j Exp $
# $Log:	ArrayFoo.pm,v $
# Revision 1.6  2002/09/19  10:47:13  lw2j
# Now can compute covariance and correlation between two arrays.
# 
# Revision 1.5  2002/08/16  16:38:46  lw2j
# arrSelect now supports the
#    $foo[-$x]
# idiom, e.g.
#
# my $min = ArrayFoo::arrSelect($arr_ref, -1);
#
# to extract the minimum value.
#
# Revision 1.4  2002/08/12  17:56:28  lw2j
# Added missing $ in prototype for ArrDevWtd.
#
# Revision 1.3  2002/08/06  13:44:27  lw2j
# This is the version with arrMoment, et al.
#
# Revision 1.2  2002/06/26  15:07:17  lw2j
# Added weighted versions of mean and standard deviation
# computation, to support EM.
#
#
##################################################################
#
# Package Header
#
##################################################################

package ArrayFoo;
require Exporter;

##########################################################################

BEGIN {
  use Fcntl;
  use POSIX qw(tmpnam);
  use Symbol;

  use Exporter ();
  use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

  $VERSION     = 1.0;
  @ISA         = qw(Exporter);
  @EXPORT =  qw(
    arrPartition
    arrSplit
    arrSelect
    arrMedian
    arrMean
    arrMoment
    arrMomentWtd
    arrMeanWtd
    arrMode
    arrMAD
    arrDev
    arrDevWtd
    arrQuantize
    arrCorrelate
    arrEquiDepthMutual
    arrEquiWithMutual
    _arrMutualHelper
  );
  %EXPORT_TAGS = ();
  @EXPORT_OK   = ();

  return TRUE;
}



##################################################################
#
# Given a ref to an array and a partition element, split into
# less, equal and right; return three references.
#
sub arrPartition($$) {
  my ($arr_ref, $part_el) = (@_);
  my $less;
  my $equal;
  my $greater;
  my $len;
  my $idx;
  my $item;

  $less    = [];
  $equal   = [];
  $greater = [];
  $len     = scalar (@$arr_ref);

  for ($idx=0; $idx < $len; $idx++) {
    $item = $arr_ref->[$idx];
    if ($item < $part_el) {
      push(@$less, $item);
    } elsif ($item > $part_el) {
      push(@$greater, $item);
    } else {
      push(@$equal, $item);
    }
  }

  return ($less, $equal, $greater);
}



##################################################################
#
# Break an array (ref) into a set of arrays (ref) each no larger
# than $max in length.  Return a list of these refs.
#
sub arrSplit($$) {
  my ($orig_ref, $max) = @_;
  my @table;
  my $len;
  my $idx;
  my $vec_idx;
  my $this_idx;
  my @this_list;
  my $lists;

  @table     = ();
  @this_list = ();
  $len   = scalar (@$orig_ref);

  $lists = int(($len + $max - 1) / $max);

  $idx = 0;
  for ($vec_idx=0; $vec_idx < $lists; $vec_idx++) {
    @this_list = ();
    for ($this_idx=0; ($this_idx < $max) && ($idx < $len);
    $idx++, $this_idx++) {
      push(@this_list, $orig_ref->[$idx]);
    }
    push(@table, [@this_list]);
  }

  return @table;
}


##################################################################
#
# arrSelect -- pick a number which is the (k+1)th number in the
# sorted array
#

sub arrSelect($$) {
  my ($arr_ref, $k) = (@_);
  my $count;

  $count = scalar (@$arr_ref);

  if ($k < 0) {
    # Allow the Perl neg-index shorthand e.g.
    #
    # $min = ArrayFoo::arrSelect($rg_ref, -1);
    $k += $count;
  }

  if ($count < 6) {
    # don't split; just do a sort
    my @copy;

    @copy = sort { $a <=> $b } (@$arr_ref);

    return $copy[$k];
  }

  # We need to split into lists...
  my @lists;
  my @medians;
  my $list_idx;
  my $list_ct;

  # The 5 is arbitrary.
  @lists   = arrSplit($arr_ref, 5);
  @medians = ();
  $list_ct = scalar @lists;

  for ($list_idx=0; $list_idx < $list_ct; $list_idx++) {
    my @sorted;
    my $listref;
    my $ct;

    $listref = $lists[$list_idx];
    @sorted = sort { $a <=> $b } @$listref;
    $ct     = scalar @sorted;

    if ((int($ct/2)) == ($ct/2)) {
      # even.  oops.
      push(@medians, 0.5*($listref->[($ct/2)] +
      $listref->[(($ct-2)/2)]));
    } else {
      push(@medians, $listref->[($ct-1)/2]);
    }
  }

  # ok. now, we find the medians' median.
  my $MoM;

  my $lt_ref;
  my $eq_ref;
  my $gt_ref;

  $MoM = &arrSelect([@medians], int((scalar(@medians)-1)/2));
  ($lt_ref, $eq_ref, $gt_ref) = arrPartition($arr_ref, $MoM);

  my $lt_ct;
  my $eq_ct;
  my $gt_ct;

  $lt_ct = scalar(@$lt_ref);

  if ($k < $lt_ct) {
    return (&arrSelect($lt_ref, $k));
  }

  $k    -= $lt_ct;
  $eq_ct = scalar(@$eq_ref);

  if ($k < $eq_ct) {
    return $MoM;
  }

  return(&arrSelect($gt_ref, $k - $eq_ct));
}


##################################################################
#
# arrMedian -- quick wrapper
# do not use on zero-len arrays

sub arrMedian($) {
  my($arr_ref) = (@_);

  return(arrSelect($arr_ref, int((scalar(@$arr_ref) - 1)/2)));
}




##################################################################
#
# arrMAD -- compute mean absolute deviation
#           also returns the median if it's wantarrary() since
#           we need to compute it anyway
sub arrMAD($) {
  my $arr_ref = shift;
  my $ct      = scalar(@{$arr_ref});
  my $median  = arrMedian($arr_ref);
  my $devsum  = 0;
  my $MAD     = 0;

  map { $devsum += abs($median - $_) } @$arr_ref;
  $MAD = $devsum / $ct;

  if (wantarray()) {
    return ($MAD, $median);
  } else {
    return $MAD;
  }
}


##################################################################
#
# arrMean -- arithmetic mean

sub arrMean($) {
  my $arr_ref = shift;
  my $ct  = scalar @$arr_ref;
  my $sum = 0;

  map { $sum+=$_ } @$arr_ref;

  if ($ct == 0) {
    return undef;
  }

  return ($sum/$ct);
}



##################################################################
#
# arrMeanWtd -- arithmetic mean
# do not use on zero-len arrays
#
# This version accepts weights.
sub arrMeanWtd($$) {
  my $arr_ref = shift;
  my $wtd_ref = shift;
  my $ct  = scalar @$arr_ref;
  my $sum = 0;
  my $wts = 0;

  for (my $i=0; $i < $ct; $i++) {
    my $wt = $wtd_ref->[$i];
    my $x  = $arr_ref->[$i];

    if ($wt > 0) {
      $sum += $wt * $x;
      $wts += $wt;
    }
  }

  if ($wts == 0) {
    return undef;
  }

  return ($sum/$wts);
}




##################################################################
#
# arrMoment:  Compute arbitrary moments on array.

sub arrMoment($$) {
  my $arr_ref = shift;
  my $moment  = shift;
  my $ct  = scalar @$arr_ref;
  my $sum = 0;

  map { $sum+= ($_ ** $moment) } @$arr_ref;

  if ($ct == 0) {
    return undef;
  }

  return ($sum/$ct);
}



##################################################################
#
# arrMomentWtd -- Moments
#
# This version accepts weights.
sub arrMomentWtd($$$) {
  my $arr_ref = shift;
  my $moment  = shift;
  my $wtd_ref = shift;
  my $ct  = scalar @$arr_ref;
  my $sum = 0;
  my $wts = 0;

  for (my $i=0; $i < $ct; $i++) {
    my $wt = $wtd_ref->[$i];
    my $x  = $arr_ref->[$i];

    if ($wt > 0) {
      $sum += $wt * ($x ** $moment);
      $wts += $wt;
    }
  }

  if ($wts == 0) {
    return undef;
  }

  return ($sum/$wts);
}



##################################################################
#
# arrMode -- hash table approach
# also takes refs, also assumes non-zero array

sub arrMode($) {
  my($arr_ref) = (@_);
  my($ct);
  my($idx);
  my(%counts);
  my(@list);

  %counts = ();
  $ct     = scalar (@$arr_ref);

  for ($idx=0; $idx < $ct; $idx++) {
    my $orig = $counts{$arr_ref->[$idx]};

    if (!defined($orig)) {
      $counts{$arr_ref->[$idx]} = 1;
    } else {
      $counts{$arr_ref->[$idx]} = $orig+1;
    }
  }

  @list = keys %counts;
  $ct   = scalar @list;

  my $best_key;
  my $best_ct;

  $best_key = $list[0];
  $best_ct  = $counts{$best_key};

  for ($idx=1; $idx < $ct; $idx++) {
    if ($counts{$list[$idx]} > $best_ct) {
      $best_key = $list[$idx];
      $best_ct  = $counts{$best_key};
    }
  }

  return $best_key;
}


##################################################################
#
# arrDev -- find standard deviation
# also takes refs, also assumes non-zero array

# sigma^{2} = E[x^2] - E[x]^2
sub arrDev($) {
  my $arr_ref = shift;
  my $ct      = scalar @$arr_ref;
  my $sum1    = 0;
  my $sum2    = 0;


  for (my $idx=0; $idx < $ct; $idx++) {
    my $val = $arr_ref->[$idx];
    $sum1 += $val;
    $sum2 += $val*$val;
  }

  my $expected_x1 = $sum1/$ct;
  my $expected_x2 = $sum2/$ct;

  my $variance  = $expected_x2 - ($expected_x1 ** 2);

  if ($variance < 0) {
    # Probably a precision problem relating to tiny numbers.
    # BUGBUG.
    $variance = 0;
  }

  my $deviation = sqrt($variance);

  return $deviation;
}


##################################################################
#
# arrDevWtd -- Weighted version.
# also takes refs, also assumes non-zero array

# sigma^{2} = E[x^2] - E[x]^2
sub arrDevWtd($$) {
  my $arr_ref = shift;
  my $wtd_ref = shift;
  my $ct      = scalar @$arr_ref;
  my $sum1    = 0;
  my $sum2    = 0;
  my $wtsum   = 0;

  for (my $idx=0; $idx < $ct; $idx++) {
    my $val = $arr_ref->[$idx];
    my $wt  = $wtd_ref->[$idx];

    if ($wt > 0) {
      $sum1 += $val*$wt;
      $sum2 += $val*$val*$wt;

      $wtsum += $wt;
    }
  }

  if ($wtsum == 0) {
    return undef;
  }

  my $expected_x1 = $sum1/$wtsum;
  my $expected_x2 = $sum2/$wtsum;

  my $variance  = $expected_x2 - ($expected_x1 ** 2);

  if ($variance < 0) {
    # Tiny number hack.  BUGBUG...
    $variance = 0;
  }

  my $deviation = sqrt($variance);

  return $deviation;
}



##################################################################
#
# arrQuantize -- Take an array reference and a return a reference
# to a new array corresponding to a quantile transformation of
# the array.
#
# That is, for each item X in the original array, the new array
# has an item Y that corresponds to the percentile [0-1], where
# Y=foo% means that X is strictly greater than foo% of the
# other items in the original array.

sub arrQuantize($) {
  my $arrRef = shift;
  my $newRef = +[];

  defined($arrRef) || return undef;

  my @sorted_arr = sort { $a <=> $b } @{$arrRef};
  my %table      = ();
  my $count      = scalar @sorted_arr;

  fill_table:  {
    my $idx    = 0;
    my $val    = 0;

    for ($idx=0; $idx < $count; $idx++) {
      $val = $sorted_arr[$idx];

      # No overwrites, since we use a strictly-greater-than
      # rule.
      if (!exists($table{$val})) {
        $table{$val} = $idx / $count;
      }
    }
  }

  fill_array:  {
    my $idx = 0;

    for ($idx=0; $idx < $count; $idx++) {
      push @{$newRef}, $table{$arrRef->[$idx]};
    }
  }

  return $newRef;
}



# Given two references to array lists of even length, compute their
# correlation.
#
# Cov = E[XY] - E[X]E[Y]
# COR = Cov(X,Y) / (dev(X) * dev(Y))
#
# We return the correlation value, unless the arrays are of different
# size.  If wantarray(), we also return the covariance.
sub arrCorrelate($$) {
  my $xref = shift;
  my $yref = shift;

  my $ct = scalar (@$xref);

  if ((scalar @$yref) != $ct) {
    return undef;
  }

  my $x_sum   = 0;
  my $y_sum   = 0;
  my $x2_sum  = 0;
  my $y2_sum  = 0;
  my $xy_sum  = 0;

  for (my $i=0; $i < $ct; $i++) {
    my $x = $xref->[$i];
    my $y = $yref->[$i];

    defined($x) || return undef;
    defined($y) || return undef;

    $x_sum += $x;
    $y_sum += $y;

    $x2_sum += $x**2;
    $y2_sum += $y**2;
    $xy_sum += $x * $y;
  }

  my $e_x    = $x_sum / $ct;
  my $e_y    = $y_sum / $ct;
  my $e_x2   = $x2_sum / $ct;
  my $e_y2   = $y2_sum / $ct;
  my $e_xy   = $xy_sum / $ct;

  # dev_x = sqrt ( E[X^2] - E[X]**2 )
  my $dev_x  = sqrt($e_x2 - ($e_x**2));
  my $dev_y  = sqrt($e_y2 - ($e_y**2));

  my $cov = $e_xy - ($e_x * $e_y);
  my $cor = 1;

  # Is either attribute constant?
  if (($dev_x == 0) || ($dev_y == 0)) {
    if (($dev_x != 0) || ($dev_y != 0)) {
      # Just one.
      $cor = 0;
    }
  } else {
    $cor = $cov / ($dev_x * $dev_y);
  }

  if (wantarray()) {
    return ($cor, $cov);
  } else {
    return $cor;
  }
}

##################################################################
#
# arrEquiDepthMutual($$$) --
#
# Given references to two arrays and a number of equidepth [*]
# buckets to use on each axis, compute the mutual information
#
# H(x) + H(y) - H(x,y).
#
#
# [*] They'll be equidepth if there are no duplicates.  Hrm.
# But since there might well be duplicates, we have to compute
# the entropy anyway along those axes, since it might not be
# exactly 1.
#
# If called in LIST context, the second and third elements
# are H(x) and H(y), respectively.
#
sub arrEquiDepthMutual($$$) {
  my $arr_X_Ref = shift;
  my $arr_Y_Ref = shift;
  my $buckets   = shift;

  my $count     = undef;


  defined($buckets) || return undef;

  $count = scalar @{$arr_X_Ref};

  ((scalar @{$arr_Y_Ref}) == $count) || return undef;

  my $arr_X_Quantized_Ref = arrQuantize($arr_X_Ref);
  my $arr_Y_Quantized_Ref = arrQuantize($arr_Y_Ref);

  my ($mutinf, $H_X, $H_Y) = _arrMutualHelper($arr_X_Ref,
  $arr_Y_Ref,
  $buckets);

  if (wantarray()) {
    return ($mutinf, $H_X, $H_Y);
  } else {
    return $mutinf;
  }
}



# They're already quantized.  Compute H_XY.
sub _arrMutualHelper($$$) {
  my $arr_X_Quantized_Ref = shift;
  my $arr_Y_Quantized_Ref = shift;
  my $buckets             = shift;

  my $count               = scalar @$arr_X_Quantized_Ref;
  my $log2                = log(2);
  my $H_X       = undef;
  my $H_Y       = undef;
  my $H_XY      = undef;

  # Compute H_X
  compute_H_X:  {
    my @buckets_x = (0) x $buckets;
    my $idx = 0;


    for ($idx=0; $idx < $count; $idx++) {
      my $val        = $arr_X_Quantized_Ref->[$idx];
      my $bucket_idx = int($val * ($buckets-1));

      $buckets_x[$bucket_idx]++;
    }

    $H_X = 0;

    for ($idx=0; $idx < $buckets; $idx++) {
      my $p = $buckets_x[$idx] / $count;

      die unless ($p <= 1);
      if ($p > 0) {
        $H_X -= ($p * log($p));
      }
    }

    $H_X /= $log2;
  }

  # Compute H_Y
  compute_H_Y:  {
    my @buckets_y = (0) x $buckets;
    my $idx = 0;

    for ($idx=0; $idx < $count; $idx++) {
      my $val        = $arr_Y_Quantized_Ref->[$idx];
      my $bucket_idx = int($val * ($buckets-1));

      $buckets_y[$bucket_idx]++;
    }

    $H_Y = 0;

    for ($idx=0; $idx < $buckets; $idx++) {
      my $p = $buckets_y[$idx] / $count;

      die unless ($p <= 1);
      if ($p > 0) {
        $H_Y -= ($p * log($p));
      }
    }

    $H_Y /= $log2;
  }

  # Now, we need to compute H(X,Y).
  compute_H_XY: {
    my $idx = 0;
    my @buckets_refs = ();

    for ($idx=0; $idx < $buckets; $idx++) {
      $bucket_refs[$idx] = undef;
    }

    for ($idx=0; $idx < $count; $idx++) {
      my $val_X = $arr_X_Quantized_Ref->[$idx];
      my $val_Y = $arr_Y_Quantized_Ref->[$idx];

      my $bucket_x = int($val_X * ($buckets-1));
      my $bucket_y = int($val_Y * ($buckets-1));

      my $lref = $bucket_refs[$bucket_x];

      if (!defined($lref)) {
        my $subidx = 0;

        $bucket_refs[$bucket_x] = +[];

        $lref = $bucket_refs[$bucket_x];

        for ($subidx=0; $subidx < $buckets; $subidx++) {
          $lref->[$subidx] = 0;
        }
      }

      $lref->[$bucket_y] = $lref->[$bucket_y] + 1;
    }

    $H_XY = 0;

    my $tot = 0;
    for ($idx=0; $idx < $buckets; $idx++) {
      my $lref = $bucket_refs[$idx];

      if (defined($lref)) {
        foreach $val (@{$lref}) {
          defined($val) || die;
          if ($val > 0) {
            my $p = $val / $count;
            $tot += $val;
            die unless ($p <= 1);
            $H_XY -= ($p * log($p));
          }
        }
      }
    }

    ($tot == $count) || die;
    $H_XY /= $log2;
  }

  die unless (($H_X + $H_Y) > $H_XY);

  my $mutinf = $H_X + $H_Y - $H_XY;

  if (wantarray()) {
    return ($mutinf, $H_X, $H_Y);
  } else {
    return $mutinf;
  }
}



##################################################################
#
# arrEquiWidthMutual($$$) --
#
# Given references to two arrays and a number of equiwidth [*]
# buckets to use on each axis, compute the mutual information
#
# H(x) + H(y) - H(x,y).
#
#
# [*] Equiwidth along each individual axis, not between them.
# That is, intervals on axis X are equiwidth, and those on Y
# are equiwidth, but interval(X) != interval(Y) unless they
# have the same range.
#
# If called in LIST context, the second and third elements
# are H(x) and H(y), respectively.
#
sub arrEquiWidthMutual($$$) {
  my $arr_X_Ref = shift;
  my $arr_Y_Ref = shift;
  my $buckets   = shift;

  defined($buckets) || return undef;

  my $count = scalar @{$arr_X_Ref};
  ((scalar @{$arr_Y_Ref}) == $count) || return undef;

  my $min_x     = arrSelect($arr_X_Ref, 0);
  my $min_y     = arrSelect($arr_Y_Ref, 0);
  my $max_x     = arrSelect($arr_X_Ref, $count - 1);
  my $max_y     = arrSelect($arr_Y_Ref, $count - 1);

  my @qx        = @$arr_X_Ref;
  my @qy        = @$arr_Y_Ref;

  if ($min_x < $max_x) {
    map { $_ = (($_ - $min_x) / ($max_x - $min_x)) } @qx;
  } else {
    map { $_ = 0 } @qx;
  }

  if ($min_y < $max_y) {
    map { $_ = (($_ - $min_y) / ($max_y - $min_y)) } @qy;
  } else {
    map { $_ = 0 } @qy;
  }

  my ($mutinf, $H_X, $H_Y) = _arrMutualHelper(\@qx,
  \@qy,
  $buckets);
  if (wantarray()) {
    return ($mutinf, $H_X, $H_Y);
  } else {
    return $mutinf;
  }
}
