#!/usr/local/bin/perl5 -w
#
# Purpose:
#
# To compute Chi^{2} values, for judging whether or not a univariate
# distribution (specified by its CDF) is a good fit to data.
#
# It can use a separate chi-square table, but it now has a built-in
# one.  This eliminates the need for symlinking/copying
# table.chi_square everywhere you use this...
#
##########################################################################
#
# Package Header
#
##########################################################################

package ChiSquare;
require Exporter;

use strict;
use Symbol;

require Wrapper;

##########################################################################

BEGIN {
  use Exporter ();
  use Symbol;
  use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

  $VERSION     = 1.0;
  @ISA         = qw(Exporter);
  @EXPORT      = ();
  %EXPORT_TAGS = ();
  @EXPORT_OK   = qw(
    load_fh
    chi_square
  );

  return 1;
}


#####################################################################
# Embedding table.  Supply your own via filename if desired.
#####################################################################

# See 'perlfaq7.pod'
sub _fix {
  local $_ = shift;
  my ($white, $leader);  # common whitespace and common lea  ding string
  if (/^\s*(?:([^\w\s]+)(\s*).*\n)(?:\s*\1\2?.*\n)+$/) {
    ($white, $leader) = ($2, quotemeta($1));
  } else {
    ($white, $leader) = (/^(\s+)/, '');
  }
  s/^\s*?$leader(?:$white)?//gm;
  return $_;
}



# Below, the @@@@ prefix is only used for clarity.  If you put this
# in a separate file, you should NOT have them.  The parser will,
# however,

my $__BUILTIN_TABLE = _fix<<END_OF_TABLE;
@@@@   ;;; The format is simple.  The first vector must have a placeholder,
@@@@   ;;; followed by each significance level (out of 1; 5% => 0.05).  Each
@@@@   ;;; subsequent vector consists of a K value and thresholds for that
@@@@   ;;; given K and each of the significance levels, in order.
@@@@   ;;;
@@@@   ;;; There may be no missing values, but the K values need not be
@@@@   ;;; in order, nor need they be complete (1..K).
@@@@
@@@@   -1, 0.05, 0.01, 0.001
@@@@   1, 3.84, 6.64, 10.83
@@@@   2, 5.99, 9.21, 13.82
@@@@   3, 7.82, 11.35, 16.27
@@@@   4, 9.49, 13.28, 18.47
@@@@   5, 11.07, 15.09, 20.52
@@@@   6, 12.59, 16.81, 22.46
@@@@   7, 14.07, 18.48, 24.32
@@@@   8, 15.51, 20.09, 26.13
@@@@   9, 16.92, 21.67, 27.88
@@@@   10, 18.31, 23.21, 29.59
@@@@   11, 19.68, 24.73, 31.26
@@@@   12, 21.03, 26.22, 32.91
@@@@   13, 22.36, 27.69, 34.53
@@@@   14, 23.69, 29.14, 36.12
@@@@   15, 25.00, 30.58, 37.70
@@@@   16, 26.30, 32.00, 39.25
@@@@   17, 27.59, 33.41, 40.79
@@@@   18, 28.87, 34.81, 42.31
@@@@   19, 30.14, 36.19, 43.82
@@@@   20, 31.41, 37.57, 45.32
@@@@   21, 32.67, 38.93, 46.80
@@@@   22, 33.92, 40.29, 48.27
@@@@   23, 35.17, 41.64, 49.73
@@@@   24, 36.42, 42.98, 51.18
@@@@   25, 37.65, 44.31, 52.62
@@@@   26, 38.89, 45.64, 54.05
@@@@   27, 40.11, 46.96, 55.48
@@@@   28, 41.34, 48.28, 56.89
@@@@   29, 42.56, 49.59, 58.30
@@@@   30, 43.77, 50.89, 59.70
@@@@   31, 44.99, 52.19, 61.10
@@@@   32, 46.19, 53.49, 62.49
@@@@   33, 47.40, 54.78, 63.87
@@@@   34, 48.60, 56.06, 65.25
@@@@   35, 49.80, 57.34, 66.62
@@@@   36, 51.00, 58.62, 67.99
@@@@   37, 52.19, 59.89, 69.35
@@@@   38, 53.38, 61.16, 70.71
@@@@   39, 54.57, 62.43, 72.06
@@@@   40, 55.76, 63.69, 73.41
@@@@   41, 56.94, 64.95, 74.75
@@@@   42, 58.12, 66.21, 76.09
@@@@   43, 59.30, 67.46, 77.42
@@@@   44, 60.48, 68.71, 78.75
@@@@   45, 61.66, 69.96, 80.08
@@@@   46, 62.83, 71.20, 81.40
@@@@   47, 64.00, 72.44, 82.72
@@@@   48, 65.17, 73.68, 84.03
@@@@   49, 66.34, 74.92, 85.35
@@@@   50, 67.51, 76.15, 86.66
@@@@   51, 68.67, 77.39, 87.97
@@@@   52, 69.83, 78.62, 89.27
@@@@   53, 70.99, 79.84, 90.57
@@@@   54, 72.15, 81.07, 91.88
@@@@   55, 73.31, 82.29, 93.17
@@@@   56, 74.47, 83.52, 94.47
@@@@   57, 75.62, 84.73, 95.75
@@@@   58, 76.78, 85.95, 97.03
@@@@   59, 77.93, 87.17, 98.34
@@@@   60, 79.08, 88.38, 99.62
@@@@   61, 80.23, 89.59, 100.88
@@@@   62, 81.38, 90.80, 102.15
@@@@   63, 82.53, 92.01, 103.46
@@@@   64, 83.68, 93.22, 104.72
@@@@   65, 84.82, 94.42, 105.97
@@@@   66, 85.97, 95.63, 107.26
@@@@   67, 87.11, 96.83, 108.54
@@@@   68, 88.25, 98.03, 109.79
@@@@   69, 89.39, 99.23, 111.06
@@@@   70, 90.53, 100.42, 112.31
@@@@   71, 91.67, 101.62, 113.56
@@@@   72, 92.81, 102.82, 114.84
@@@@   73, 93.95, 104.01, 116.08
@@@@   74, 95.08, 105.20, 117.35
@@@@   75, 96.22, 106.39, 118.60
@@@@   76, 97.35, 107.58, 119.85
@@@@   77, 98.49, 108.77, 121.11
@@@@   78, 99.62, 109.96, 122.36
@@@@   79, 100.75, 111.15, 123.60
@@@@   80, 101.88, 112.33, 124.84
@@@@   81, 103.01, 113.51, 126.09
@@@@   82, 104.14, 114.70, 127.33
@@@@   83, 105.27, 115.88, 128.57
@@@@   84, 106.40, 117.06, 129.80
@@@@   85, 107.52, 118.24, 131.04
@@@@   86, 108.65, 119.41, 132.28
@@@@   87, 109.77, 120.59, 133.51
@@@@   88, 110.90, 121.77, 134.74
@@@@   89, 112.02, 122.94, 135.96
@@@@   90, 113.15, 124.12, 137.19
@@@@   91, 114.27, 125.29, 138.45
@@@@   92, 115.39, 126.46, 139.66
@@@@   93, 116.51, 127.63, 140.90
@@@@   94, 117.63, 128.80, 142.12
@@@@   95, 118.75, 129.97, 143.32
@@@@   96, 119.87, 131.14, 144.55
@@@@   97, 120.99, 132.31, 145.78
@@@@   98, 122.11, 133.47, 146.99
@@@@   99, 123.23, 134.64, 148.21
@@@@   100, 124.34, 135.81, 149.48
END_OF_TABLE

sub new($@) {
  my $class    = shift;
  my $table_fn = shift;
  my $self     = +{};

  bless $self, $class;

  # The structure is indexed by K.  Each element will be a list of
  # <significance, threshold> pairs.

  $self->{'intervals'}  = +[];
  $self->{'siglevels'}  = +[];
  $self->{'thresholds'} = +{};

  my $table_fh = gensym();

  if (defined($table_fn)) {
    open($table_fh, $table_fn) || die "Failed to open(r) '$table_fn'";
    $self->load_fh($table_fh);

    close($table_fh) || die "Failed to close(r) '$table_fn'";
  } else {
    my @lines = split /\n/, $__BUILTIN_TABLE;

    map {
      $_ =~ s/\;.*$//g;
      $_ =~ tr/,/ /;
      $_ =~ s/^\s+//;
      $_ =~ s/\s+$//;
      $_ =~ s/\s+/ /g;
    } @lines;

    @lines = grep { /\S/ } @lines;

    my $sig_line = shift @lines;
    my $sig_ref  = +[ split /\s/, $sig_line ];

    shift @$sig_ref;
    my $sig_ct   = scalar @$sig_ref;

    $self->{'siglevels'} = $sig_ref;

    foreach (@lines) {
      my @items = split /\s/, $_;
      my $interval = shift @items;

      push @{$self->{'intervals'}}, $interval;
      $self->{'thresholds'}->{$interval} = +[];

      die unless ($sig_ct == (scalar @items));

      my $i=0;

      for ($i=0; $i < $sig_ct; $i++) {
        my $sig    = $sig_ref->[$i];
        my $thresh = $items[$i];

        push @{$self->{'thresholds'}->{$interval}}, +[ $sig, $thresh ];
      }
    }
  }


  return $self;
};



# Load the threshold table.
sub load_fh($$) {
  my $self = shift;
  my $fh   = shift;

  $self->{'intervals'}  = +[];
  $self->{'siglevels'}  = +[];
  $self->{'thresholds'} = +{};

  # Read it.  We use Wrapper just for the parsing.  *shrug*
  my $wrapper = new Wrapper($fh);

  my $count = $wrapper->get_count();
  my $dims  = $wrapper->get_dims();

  # The first line contains:
  # -1, sig level [sig level [sig level...]]
  my @arr   = $wrapper->get_obj(0);
  shift(@arr);

  $self->{'siglevels'} = +[ @arr ];

  # Subsequent lines contain:
  # K, thresh, thresh, thresh...
  my $i=1;
  my $sigs = scalar @{$self->{'siglevels'}};

  for ($i=1; $i < $count; $i++) {
    my @obj = $wrapper->get_obj($i);
    my $K   = shift @obj;
    my $j   = 0;

    push @{$self->{'intervals'}}, $K;
    $self->{'thresholds'}->{$K} = +[];

    for ($j=0; $j < $sigs; $j++) {
      my $sig    = $self->{'siglevels'}->[$j];
      my $thresh = $obj[$j];

      push @{$self->{'thresholds'}->{$K}}, +[ $sig, $thresh];
    }
  }

  return $self;
}




# The main function.  This needs to accept the array reference, for
# the data; the number of buckets; a reference to the distro CDF with
# which we compare.  We'll equidepth bucketize based on the CDF.
# The last parameter, $estimated, is the number of parameters that
# were estimated from the data.
#
# We'll return:
#   The strictest significance level for which it passes, if any
#   (undefined, if it passes none)
#
# The $cdf_sub subroutine should accept raw data items and return a
# number in [0,1] reflecting the percentile.  It does not actually
# matter which end is which.

sub chi_square($$$$$) {
  my $self      = shift;
  my $arr_ref   = shift;
  my $numbuck   = shift;
  my $cdf_sub   = shift;
  my $estimated = shift;

  my @buckets = (0) x $numbuck;

  my $ct = scalar @$arr_ref;
  my $i  = 0;

  # Bucketize.  We divide based on where the CDF would predict
  # equidepth divisions, so that we only need one pass.  *shrug*
  for ($i=0; $i < $ct; $i++) {
    my $x   = $arr_ref->[$i];
    my $pct = $cdf_sub->($x);

    defined($pct) || return undef;
    my $box = int($pct * $numbuck);

    $buckets[$box]++;
  }

  # Compute deviations.
  my $expected = $ct / $numbuck;
  my $D2       = 0;

  for ($i=0; $i < $numbuck; $i++) {
    my $dev = $buckets[$i] - $expected;

    $D2 += ($dev * $dev) / $expected;
  }

  # Check thresholds.
  my $best_sig = undef;
  my $ref      = undef;
  my $K = $numbuck - 1 - $estimated;

  if (!exists($self->{'thresholds'}->{$K})) {
    return undef;
  }

  foreach $ref (@{$self->{'thresholds'}->{$K}}) {
    my ($sig, $thresh) = @{$ref};

    # Higher sig => stricter => better, if we pass
    if ((!defined($best_sig)) || ($best_sig < $sig)) {
      if ($D2 < $thresh) {
        $best_sig = $sig;
      }
    }
  }

  return $best_sig;
}


return 1;
