#!/usr/local/bin/perl5 -w
#
# Purpose:
#  To provide an Anderson-Darling goodness-of-fit test.  Note that
#  while one can usually generate the statistic (*), in order to
#  find out how significant it is, you need to consult a table
#  specific to that distribution.
#
#  (*) Exceptions:
#      1.  You don't have a valid estimated CDF, e.g. it's badly off
#          due to the limited precision of computers.
#      2.  The CDF returns an undefined or otherwise invalid value
#          for a data point.
#      3.  The CDF returns a value of 0 or 1, exactly.  While this is
#          certainly legal for many distributions -- e.g. in a
#          doubly truncated normal distribution, any data actually at
#          the truncation points gets those values -- the Anderson-
#          Darling statistic will be undefined because it'll compute
#          log(cdf) and log(1-cdf).
#
##########################################################################
# $Id: AndersonDarling.pm,v 1.4 2002/10/07 12:25:41 lw2j Exp $
# $Log:	AndersonDarling.pm,v $
# Revision 1.4  2002/10/07  12:25:41  lw2j
# Applied saner indenter script.
# 
# Revision 1.3  2002/09/10  17:29:30  lw2j
# Changed Anderson_Darling to AndersonDarling.
#
# Revision 1.2  2002/09/06  15:16:41  lw2j
# Now the AD_ functions also take and pass along the
# "already_sorted" flag.
#
# Revision 1.1  2002/09/05  18:00:18  lw2j
# Initial revision
#
##########################################################################
#
# Package Header
#
##########################################################################

package AndersonDarling;
require Exporter;

use strict;
use Symbol;

require Wrapper;
require UniRand;


##########################################################################

BEGIN {
  use Exporter ();
  use Symbol;
  use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

  $VERSION     = 1.0;
  @ISA         = qw(Exporter);
  @EXPORT      = ();
  %EXPORT_TAGS = ();
  @EXPORT_OK   = qw(
    AndersonDarling

    AD_normal
    AD_lognormal
    AD_exponential
  );

  return 1;
}


# These tables are written as list references containing
# <significance, critical value> pairs.
#
# This table is appropriate for normals and lognormals.
my $__NORMAL_TABLE =
+[
+[ 0.1,   0.631 ],
+[ 0.05,  0.752 ],
+[ 0.025, 0.873 ],
+[ 0.001, 1.035 ]
];


# Good for Weibull (including exponential) and gumbel.
my $__WEIBULL_TABLE =
+[
+[ 0.1,   0.637 ],
+[ 0.05,  0.757 ],
+[ 0.025, 0.877 ],
+[ 0.001, 1.038 ]
];




# Given:  two arguments.
#   #1.  Reference to the data, in the form of a univariate array.
#   #2.  Cumulative distribution function.
#
# Compute:
#   The Anderson-Darling A^{2} statistic, if computable.
#   The statistic is undefined if the CDF ever returns a value
#   other than something in (0,1).
#
# Returns:
#   The A^{2} statistic.  This, by itself, does not give you a
#   significance value -- for that, you need to use a distribution-
#   specific table.
#
# The data will be copied and sorted by this function, unless the
# third parameter ("sorted_already") is defined and non-zero.
sub AndersonDarling($$;$) {
  my $data_ref = shift;
  my $cdf      = shift;
  my $sorted_already = shift;

  defined($data_ref) || die;
  defined($cdf)      || die;

  (ref($data_ref) eq "ARRAY") || die;
  (ref($cdf)      eq "CODE")  || die;

  my $n        = scalar @$data_ref;
  my @w        = ();

  for (my $i=0; $i < $n; $i++) {
    my $w_i = &$cdf($data_ref->[$i]);

    # 0 and 1 are not allowed, because we take log($w) and
    # log(1-$w).

    if ((!defined($w_i)) || ($w_i <= 0) || ($w_i >= 1)) {
      return undef;
    }

    $w[$i] = $w_i;
  }

  # Sort the $w array, unless we're told that the data is
  # already in order (in which case the w_i's should also
  # be in order).  That's increasing order, BTW.
  if ((!defined($sorted_already)) || ($sorted_already == 0)) {
    @w = sort { $a <=> $b } @w;
  }

  my $sum = 0;

  for (my $i=0; $i < $n; $i++) {
    # Statistician's formulas usually give (2i-1), not (2i+1),
    # but their iteration is also from i=1..N, not 0..N-1.

    my $weight        = (2*$i) + 1;
    my $log_i         = log($w[$i]);

    # not n-i+1, for same reason:  array base of 0
    my $log_opposite  = log(1-$w[($n-1) - $i]);

    $sum += $weight * ($log_i + $log_opposite);
  }

  my $A2 = (-$n) - ($sum / $n);

  return $A2;
}




# Is the data normal?  You need to supply a reference to the data,
# but you can leave the CDF undefined (it'll be estimated by the
# means found in my UniRand package).
#
# This will return the significance value, if any.
# If wantarray(), it will also return the actual Anderson-Darling
# value.

sub AD_normal($;$$) {
  my $data_ref = shift;
  my $cdf      = shift;
  my $sorted_already = shift;

  if (!defined($cdf)) {
    my $cdf_index = $UniRand::EST_INDICES{"cdf"};
    my @results = UniRand::normal_est($data_ref);

    $cdf=$results[$cdf_index];

    defined($cdf) || return undef;
  }

  my $A2 = AndersonDarling($data_ref, $cdf, $sorted_already);

  if (!defined($A2)) {
    return (wantarray() ? (undef, undef) : undef);
  }

  my $best_sig = undef;

  # Table global, not embedded here, because it's shared by the
  # lognormal.

  foreach (@$__NORMAL_TABLE) {
    my ($sig, $thresh) = @{$_};

    if (((!defined($best_sig)) || ($best_sig < $sig)) &&
    ($A2 <= $thresh)) {
      $best_sig = $sig;
    }
  }

  if (wantarray()) {
    return ($best_sig, $A2);
  } else {
    return $best_sig;
  }
}




# Is the data lognormal?  You need to supply a reference to the data,
# but you can leave the CDF undefined (it'll be estimated by the
# means found in my UniRand package).
#
# This will return the significance value, if any.
# If wantarray(), it will also return the actual Anderson-Darling
# value.

sub AD_lognormal($;$$) {
  my $data_ref = shift;
  my $cdf      = shift;
  my $sorted_already = shift;

  if (!defined($cdf)) {
    my $cdf_index = $UniRand::EST_INDICES{"cdf"};
    my @results = UniRand::lognormal_est($data_ref);

    $cdf=$results[$cdf_index];

    defined($cdf) || return undef;
  }

  my $A2 = AndersonDarling($data_ref, $cdf, $sorted_already);

  if (!defined($A2)) {
    return (wantarray() ? (undef, undef) : undef);
  }

  my $best_sig = undef;

  # Table global, not embedded here, because it's shared by the
  # normal.

  foreach (@$__NORMAL_TABLE) {
    my ($sig, $thresh) = @{$_};

    if (((!defined($best_sig)) || ($best_sig < $sig)) &&
    ($A2 <= $thresh)) {
      $best_sig = $sig;
    }
  }

  if (wantarray()) {
    return ($best_sig, $A2);
  } else {
    return $best_sig;
  }
}



# Is the data exponential?  You need to supply a reference to the data,
# but you can leave the CDF undefined (it'll be estimated by the
# means found in my UniRand package).
#
# This will return the significance value, if any.
# If wantarray(), it will also return the actual Anderson-Darling
# value.

sub AD_exponential($;$$) {
  my $data_ref = shift;
  my $cdf      = shift;
  my $sorted_already = shift;

  if (!defined($cdf)) {
    my $cdf_index = $UniRand::EST_INDICES{"cdf"};
    my @results = UniRand::exponential_est($data_ref);

    $cdf=$results[$cdf_index];

    defined($cdf) || return undef;
  }

  my $A2 = AndersonDarling($data_ref, $cdf, $sorted_already);

  if (!defined($A2)) {
    return (wantarray() ? (undef, undef) : undef);
  }

  my $best_sig = undef;

  # Table global, not embedded here, in case I add other members of
  # the family.

  foreach (@$__WEIBULL_TABLE) {
    my ($sig, $thresh) = @{$_};

    if (((!defined($best_sig)) || ($best_sig < $sig)) &&
    ($A2 <= $thresh)) {
      $best_sig = $sig;
    }
  }

  if (wantarray()) {
    return ($best_sig, $A2);
  } else {
    return $best_sig;
  }
}



return 1;
