#!/usr/local/bin/perl5 -w
#
# Purpose:
#  To provide an Kolmogorov-Smirnov goodness-of-fit test.  Note that
#  while one can usually generate the statistic (*), in order to
#  find out how significant it is, you need to consult a table
#  specific to that distribution.
#
#  (*) Exceptions:
#      1.  It's not a continuous distribution.
#      2.  You estimated the distribution from the data, in which
#          case while you can compute the statistic, it's not
#          valid.
#
##########################################################################
# $Id: KolmogorovSmirnov.pm,v 1.1 2004/08/19 18:48:29 lw2j Exp $
# $Log:	KolmogorovSmirnov.pm,v $
# Revision 1.1  2004/08/19  18:48:29  lw2j
# Initial revision
# 
#
##########################################################################
#
# Package Header
#
##########################################################################

package KolmogorovSmirnov;
require Exporter;
require Symbol;

use strict;

require Numerical;

##########################################################################

BEGIN {
  use Exporter ();
  use Symbol;
  use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

  $VERSION     = 1.0;
  @ISA         = qw(Exporter);
  @EXPORT      = ();
  %EXPORT_TAGS = ();
  @EXPORT_OK   = qw(
    Kolmogorov_Smirnov

    compute_probability
    compute_critical

    lookup_critical
    lookup_probability
  );

  return 1;
}



# If you like, you can use a precomputed table that gives z values
# that go along with probabilities from 0.0001 to 1.0000 at 0.0001
# increments.  It'll load the table if you use lookup_critical or
# lookup_probability.
my $__KS_TABLE_FN = "table.Kolmogorov_Smirnov";
my $__KS_TABLE = undef;


sub _load_table();




# Given:  two arguments.
#   #1.  Reference to the data, in the form of a univariate array.
#   #2.  Cumulative distribution function.
#
# Compute:
#   The Kolmogorov-Smirnov test statistic.
#   Specifically, the one-sided version Dn+.
#
# Returns:
#   The raw K-S statistic.  You still need to use a table to
#   judge significance.  Failure, as CDF returning undef to
#   indicate an impossible value, results in undef.
#
# The data will be copied and sorted by this function, unless the
# third parameter ("sorted_already") is defined and non-zero.
sub Kolmogorov_Smirnov($$;$) {
  my $data_ref = shift;
  my $cdf      = shift;
  my $sorted_already = shift;

  defined($data_ref) || die;
  defined($cdf)      || die;

  (ref($data_ref) eq "ARRAY") || die;
  (ref($cdf)      eq "CODE")  || die;

  my $n        = scalar @$data_ref;
  my @w        = ();


  if (!defined($sorted_already)) {
    $sorted_already = 1;

    loop: for (my $i=1; $i < $n; $i++) {
      if ($data_ref->[$i] < $data_ref->[$i-1]) {
        $sorted_already = 0;
        last loop;
      }
    }
  }

  for (my $i=0; $i < $n; $i++) {
    my $w_i = &$cdf($data_ref->[$i]);

    if (!defined($w_i)) {
      return undef;
    }

    $w[$i] = $w_i;
  }

  # Sort the $w array, unless we're told that the data is
  # already in order (in which case the w_i's should also
  # be in order).  That's increasing order, BTW.
  if (!$sorted_already) {
    @w = sort { $a <=> $b } @w;
  }

  my $KS = 0;

  for (my $i=0; $i < $n; $i++) {
    my $frac_expected = ($i+1)/$n;
    my $cdf_val       = &$cdf($w[$i]);

    if (!defined($cdf_val)) {
      # Hm, impossible.
      return undef;
    } else {
      my $D = abs($cdf_val - $frac_expected);

      if ($D > $KS) {
        $KS = $D;
      }
    }
  }

  return $KS;
}



# Given a Dn value (the K-S raw statistic) and the data cardinality
# ($n), and if you like an error tolerance (absolute numbers; it's
# a convergence thang) it'll attempt to compute the probability
# that, if the data DID follow the CDF you thought it did, it would
# diverge by at least that much.
#
# If you don't give it $n, it'll assume 1.  This is not really
# useful except for computing critical values.
#
# Return it, or undef on assorted failures.
sub compute_probability($$;$) {
  my $Dn = shift;
  my $n  = (scalar(@_)) ?     shift : 1;
  my $errtol = (scalar(@_)) ? shift : 1e-9;

  if ((!defined($Dn)) || (!defined($n)) || ($n <= 0)) {
    return undef;
  }

  if ($Dn <= 0) {
    return 1;
  }

  my $prob = 0;
  my $r    = 1;
  my $z    = $Dn / sqrt($n);

  loop: {
    my $delta   = ($r % 2) ? 2 : -2;
    my $e_power = -2 * ($r * $r) * ($z * $z);

    $delta *= exp($e_power);
    $prob += $delta;

    if (abs($delta) >= $errtol) {
      $r++;
      redo loop;
    }
  }

  return $prob;
}



# Compute critical values for a given probability.
# These values are $z values unless $n is provided, in which
# case we've already multiplied by sqrt($n).
#
# Providing $n will allow a much tighter upper bound.  If
# you don't provide it and you're working with a huge $n,
# this function might fail entirely.
sub compute_critical($;$) {
  my $prob    = shift;
  my $n       = scalar(@_) ? shift : undef;
  my $n_bound = defined($n) ? $n : 1e12;

  defined($prob) || die;

  if ($prob <= 0) {
    # Infinite, but we can't represent it.
    return undef;
  }

  if ($prob >= 1) {
    return 0;
  }

  # Otherwise...
  my $lo_thresh = 0;
  my $hi_thresh = sqrt($n_bound);

  my $thresh = Numerical::inverse_search(\&compute_probability,
  $prob, $lo_thresh, $hi_thresh);

  if (defined($n)) {
    $thresh *= sqrt($n);
  }

  return $thresh;
}



# Given a probability, find the matching critical value from the table.
# It'll load the table if it hasn't done that yet.
#
# The table will eat memory and obviously doesn't give infinite
# precision, but will likely save computational time.
#
# If you give it a value of $n, it'll return the actual critical
# value (z * sqrt($n)).  In most cases this won't affect correctness
# except that if you don't give it $n and ask for the critical value
# for a probability of 0.
#
# If the prob is not in the table (which would be fairly odd unless
# you have very specific tastes about probabilities), then compute
# it.
sub lookup_critical($;$n) {
  my $prob = shift;
  my $n    = scalar(@_) ? shift : undef;

  if (!defined($prob)) {
    return undef;
  }

  if ($prob <= 0) {
    # Infinite.  Well, actually sqrt($n); the maximum $Dn value
    # is no greater than $n (that's the maximum difference
    # between a CDF value and the expected fraction), so the
    # maximum $z <= $Dn / $n^{-1/2} is sqrt($n).
    return (defined($n) ? sqrt($n) : undef);
  }

  if (!defined($__KS_TABLE)) {
    _load_table();
  }


  defined($__KS_TABLE) || die;
  my $table_ct = scalar (@$__KS_TABLE);
  my $lo       = 0;
  my $hi       = $table_ct - 1;

  # Binary search within table.  Look for exact match.
  # Well, sort of exact; this is FP math after all.
  loop:  {
    my $mid      = int(($lo+$hi)/2);
    my $mid_val  = $__KS_TABLE->[$mid]->[0];

    if (abs($mid_val - $prob) <= 1e-9) {
      # Should be reasonably close to equality.

      my $z = $__KS_TABLE->[$mid]->[1];

      return (defined($n) ? ($z * sqrt($n)) : $z);
    } elsif ($mid_val < $prob) {
      $lo = $mid+1;
    } elsif ($mid_val > $prob) {
      $hi = $mid-1;
    }

    if ($lo > $hi) {
      last loop;
    } else {
      redo loop;
    }
  }

  # No exact match.
  my $crit = compute_critical($prob, $n);

  defined($crit) || die;
  return $crit;
}




# Given a deviation, it'll attempt to figure out the probability
# using the K-S table, returning the closest match.
#
# If you give it an $n value, it'll assume that the $deviation
# is a $Dn value (raw K-S statistic) and compute $z.  Otherwise,
# it'll assume that the deviation is already a $z value.
#
# Unless you're incredibly lucky the deviation is probably
# not in the table...

sub lookup_probability($;$n) {
  my $deviation = shift;
  my $n    = scalar(@_) ? shift : undef;
  my $z    = (defined($n) && ($n > 0)) ? ($deviation / sqrt($n)) :
  $deviation;

  if (defined($n) && ($n <= 0)) {
    return undef;
  }

  if (!defined($z)) {
    return undef;
  }

  if ($z <= 0) {
    # Automatic.
    return 1;
  } elsif (defined($n) && ($z > sqrt($n))) {
    # Impossible.
    return 0;
  }


  if (!defined($__KS_TABLE)) {
    _load_table();
  }


  defined($__KS_TABLE) || die;
  my $table_ct = scalar (@$__KS_TABLE);
  my $lo       = 0;
  my $hi       = $table_ct - 1;

  # Binary search within table.  Look for exact match.
  # Well, sort of exact; this is FP math after all.
  loop:  {
    my $mid      = int(($lo+$hi)/2);
    my $mid_val  = $__KS_TABLE->[$mid]->[0];

    if (abs($mid_val - $z) <= 1e-9) {
      # Should be reasonably close to equality.

      my $prob = $__KS_TABLE->[$mid]->[0];

      return $prob;
    } elsif ($mid_val < $z) {
      $lo = $mid+1;
    } elsif ($mid_val > $z) {
      $hi = $mid;
    }

    if ($lo >= $hi) {
      last loop;
    } else {
      redo loop;
    }
  }

  # Logic error?
  die if ($lo > $hi);

  my $prob = $__KS_TABLE->[$lo]->[0];
  return $prob;
}






# Load the $__KS_TABLE_FN file if it's not already loaded.
# Table is assumed to have two numbers per line, a probability
# and a z value, separated by whitespace.  Lines must be in
# increasing order of probabilities.
sub _load_table() {
  if (defined($__KS_TABLE)) {
    return;
  }

  if (!(-f ($__KS_TABLE_FN))) {
    die "Cannot locate $__KS_TABLE_FN";
  }

  my $fh = gensym();

  open($fh, $__KS_TABLE_FN) || die "Failed to open(r) '$__KS_TABLE_FN':  $!";
  $__KS_TABLE = +[];

  loop:  {
    my $line = <$fh>;

    if (!defined($line)) {
      last loop;
    }

    $line =~ s/^\s+//g;
    $line =~ s/\s+$//g;
    ($line =~ /\S/) || last loop;

    my @tuple = split /\s+/, $line;

    push @$__KS_TABLE, \@tuple;
    redo loop;
  }

  close($fh) || die "Failed to close(r) '$__KS_TABLE_FN':  $!";
}

return 1;
