#!/usr/local/bin/perl5 -w
#
# Purpose:
#
# To provide for the use of univariate transforms not tied to specific
# random distributions.
#
# The functions should accept a reference to the array, and return:
#   - a reference to a transformation that maps to 0..1
#   - a reference to a list of estimated parameters
#   - a reference to an inverse transformation
#
#
##########################################################################
# $Id: Transform.pm,v 1.6 2002/08/16 16:40:51 lw2j Exp $
# $Log:	Transform.pm,v $
# Revision 1.6  2002/08/16  16:40:51  lw2j
# Log now included.
# $__ABOVE_ZERO instead of $above_zero, for consistency.
#
# Box_Cox, logarithmic now get PDF-type functions which should
# allow for compatible ll/BIC/AIC computations.  Box_Cox also
# supports lambda=0.
#
##########################################################################
#
# Package Header
#
##########################################################################

package Transform;
use strict;

require Exporter;
require ChiSquare;
require Numerical;
require ArrayFoo;

##########################################################################

BEGIN {
  use Exporter ();
  use Symbol;
  use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

  $VERSION     = 1.0;
  @ISA         = qw(Exporter);
  @EXPORT      = ();
  %EXPORT_TAGS = ();
  @EXPORT_OK   = qw(
    @XF_LIST

    logarithmic_est
    logarithmic_gen_xf
    logarithmic_gen_inverse
    logarithmic_gen_pdf

    Box_Cox_est
    Box_Cox_gen_xf
    Box_Cox_gen_inverse
    Box_Cox_gen_pdf

    polynomial_est
    polynomial_gen_xf
    polynomial_gen_inverse

    interpolator_est
    interpolator_gen_xf
    interpolator_gen_inverse

    best_match
  );


  return 1;
}


# Arbitrary.  When data must be shifted so it's all positive for
# picky functions, like log, this is treated as a minimum.
my $__ABOVE_ZERO = 1e-6;


our @XF_LIST    =
(
  +[ 'logarithmic', \&logarithmic_est,
  +[ 'preshift', 'postshift', 'factor' ],
  \&logarithmic_gen_xf,
  \&logarithmic_gen_inverse,
  \&logarithmic_gen_pdf],
  +[ 'Box_Cox', \&Box_Cox_est,
  +[ 'lambda', 'preshift', 'postshift', 'scale' ],
  \&box_cox_gen_xf,
  \&box_cox_gen_inverse,
  \&box_cox_gen_pdf]
);


sub new($@) {
  my $class = shift;
  my $self  = +{};

  bless $self, $class;
  return $self;
}



# Apply a logarithmic transformation.  We only shift before if necessary,
# but we scale afterwards to 0..1.
#
sub logarithmic_est($) {
  my $data_ref = shift @_;
  my $ct       = scalar @$data_ref;
  my $min      = ArrayFoo::arrSelect($data_ref, 0);
  my $max      = ArrayFoo::arrSelect($data_ref, $ct-1);

  # $preshift is added before.
  my $preshift   = ($min >= $__ABOVE_ZERO) ? 0 : ($__ABOVE_ZERO - $min);

  # $postshift is added afterwards.
  my $postshift  = -log($min+$preshift);

  # The range of raw logs.
  my $range    = log($max + $preshift) + $postshift;

  # Afterwards, multiply by $factor.
  my $factor   = 1;

  if ($range > 0) {
    $factor = 1 / $range;
  } else {
    $factor = -1 / $range;
  }

  my $param_ref      = +[ $preshift, $postshift, $factor ];
  my $xf_sub         = logarithmic_gen_xf($param_ref);
  my $inverse_sub    = logarithmic_gen_inverse($param_ref);
  my $pdf_sub        = logarithmic_gen_pdf($param_ref);
  return ($xf_sub, $param_ref, $inverse_sub, $pdf_sub);
}



sub logarithmic_gen_xf($) {
  my $param_ref = shift @_;
  my $preshift  = $param_ref->[0];
  my $postshift = $param_ref->[1];
  my $factor    = $param_ref->[2];

  return sub {
    my $x = shift @_;
    my $y = undef;

    $x += $preshift;
    if ($x <= 0) {
      return 0;
    }

    $y = $factor * ($postshift + log($x));

    if ($y > 1) {
      return 1;
    }

    return $y;
  };
}


sub logarithmic_gen_inverse($) {
  my $param_ref = shift @_;
  my $preshift  = $param_ref->[0];
  my $postshift = $param_ref->[1];
  my $factor    = $param_ref->[2];

  return sub {
    my $y = shift @_;
    my $x = undef;

    die unless (($y >= 0) && ($y <= 1));

    return (exp(($y / $factor) - $postshift) - $preshift);
  };
}



# Any nondecreasing continuous function that maps from a subset
# of R to [0,1] can be treated as a CDF, and then differentiated
# to make a fake PDF.
#
#    y = F * (ln(X + S1) + S2)
#
# S1 = preshift, S2 = postshift, F=factor
#
# Range:
#    X > -S1
#    X <= e^{1-S2}-S1
sub logarithmic_gen_pdf($) {
  my $param_ref = shift @_;
  my $preshift  = $param_ref->[0];
  my $postshift = $param_ref->[1];
  my $factor    = $param_ref->[2];

  my $min_x     = -$preshift;
  my $max_x     = exp(1-$postshift) - $preshift;

  return sub {
    my $x = shift @_;

    if (($x <= $min_x) || ($x > $max_x)) {
      return 0;
    }

    return ($factor / ($x + $preshift));
  };
}


# A Box-Cox transformation with parameter 'lambda' is defined as
#
# y(x) = ln(x)          if \lambda=0
# y(x) = x^{\lambda}    otherwise
#
# where \lambda can be anything.
#
# Aye, there's the rub:  estimating lambda.  Box-Cox is normally
# used for regression, AFAICT.  There, one can estimate lambda
# through the shotgun approach:  try multiple values and select
# the one that results in the highest-magnitude correlation
# between the dependent variable (being xf'd) and the independent
# variable.
#
# Here, we don't have that -- all we have is a set.  It's probably
# not even ordered; e.g. instances in a data set may be in random
# order instead of chronological.
#
# The approach I'll take:
#    1.  Sort the data in increasing order.
#    2.  Shift the data so it's all above_zero.
#    3.  Use the Simplex method to minimize the SSE error.
#
#
# We constrain lambda so that it's positive; there's a bit of a
# discontinuity at 0 that makes dealing with it more trouble than
# it's worth given that we already have a specialized function
# for the logarithm.
#
# Change:  toss 0 back in.

sub Box_Cox_est($) {
  my $data_ref = shift;
  my $ct       = scalar @$data_ref;
  my $min      = ArrayFoo::arrSelect($data_ref, 0);
  my $max      = ArrayFoo::arrSelect($data_ref, $ct-1);
  my $preshift = $__ABOVE_ZERO-$min;

  $min += $preshift;
  $max += $preshift;

  my @data     = sort { $a <=> $b } @$data_ref;

  map { $_ += $preshift } @data;

  my $SSE_sub = sub {
    my $lambda = shift;
    my $i      = 0;
    my $sum    = 0;

    if ($lambda < 0) {
      # Absurdly high, to discourage this value.
      return (1e50);
    }

    my $min_gen = ($lambda > 0) ? ($min ** $lambda) : log($min);
    my $max_gen = ($lambda > 0) ? ($max ** $lambda) : log($max);
    my $range   = $max_gen - $min_gen;

    foreach (@data) {
      my $datum = $_;
      my $frac  = $i / $ct;
      my $x     = ($lambda > 0) ?
      ((($datum ** $lambda) - $min_gen) / $range) :
      (log($datum) - $min_gen) / $range;

      my $err = $x - $frac;

      $i++;
      $sum += ($err * $err);
    }

    # Minimize this.
    return $sum;

  };

  my @first_simplex  = Numerical::simplex_wrap(+[2], 1);
  my ($P_ref, $yval) = Numerical::Nelder_Mead_simplex($SSE_sub,
  \@first_simplex);
  my $lambda    = $P_ref->[0];
  my $postshift = ($lambda > 0) ? -($min ** $lambda) : (log($min));
  my $max_final = ($lambda > 0) ? ($max ** $lambda) : (log($max));
  my $scale     = 1 / ($max_final + $postshift);
  my $param_ref = +[ $lambda, $preshift, $postshift, $scale ];

  my $xf_sub         = Box_Cox_gen_xf($param_ref);
  my $inverse_sub    = Box_Cox_gen_inverse($param_ref);
  my $pdf_sub        = Box_Cox_gen_pdf($param_ref);

  return ($xf_sub, $param_ref, $inverse_sub, $pdf_sub);
}


# Box-Cox transformation.
sub Box_Cox_gen_xf($) {
  my $param_ref = shift;
  my ($lambda, $preshift, $postshift, $scale) = @$param_ref;

  # A dodge.  Using '$__ABOVE_ZERO' in the subroutine results in
  # an undefined; it's probably not evaluated until the subroutine
  # is, and it won't be able to refer to the 'my $__ABOVE_ZERO = 1e-6'
  # definition earlier.

  my $az        = $__ABOVE_ZERO;

  if ($lambda < 0) {
    return undef;
  }

  return sub {
    my $x = shift @_;
    $x += $preshift;
    if ($x < $az) {
      $x = $az;
    }
    my $y = ($lambda > 0) ? ($x ** $lambda) : log($x);
    $y = ($y + $postshift) * $scale;
    $y = ($y < 1) ? $y : 1;

    return $y;
  };
}


# Box-Cox transformation.
sub Box_Cox_gen_inverse($) {
  my $param_ref = shift;
  my ($lambda, $preshift, $postshift, $scale) = @$param_ref;

  if ($lambda < 0) {
    return undef;
  }

  return sub {
    my $y = shift @_;

    if (($y < 0) || ($y > 1)) {
      return undef;
    }

    my $x = ($lambda > 0) ?
    (((($y / $scale) - $postshift) ** (1/$lambda)) - $preshift) :
    (exp(($y / $scale) - $postshift) - $preshift);

    return $x;
  };
}


# Box-Cox pseudo-PDF.
sub Box_Cox_gen_pdf($) {
  my $param_ref = shift;
  my ($lambda, $preshift, $postshift, $scale) = @$param_ref;


  if ($lambda < 0) {
    return undef;
  }

  my $max = ($lambda > 0) ?
  ((((1-$postshift)/$scale)**(1/$lambda)) - $preshift) :
  (exp((1-$postshift)/$scale) - $preshift);

  return sub {
    my $x = shift @_;

    if (($x <= -$preshift) || ($x > $max)) {
      return 0;
    }

    return (($lambda > 0) ?
    ($scale * $lambda * (($x + $preshift) ** ($lambda - 1))) :
    ($scale / ($x + $preshift)));
  };
}


# Now, this is an odd duck.  How do we attempt to fit a polynomial
# to a SINGLE column, when we do not have X-Y pairs?
#
# Well, if the data are independently identically distributed, then
# we have slightly more of a chance than has Hades hosting a snowball
# fight.  The idea there is that we sort the data, treat the scaled
# index as the x-value, and the data as the y-values.
#
# If we _do_ manage to fit a polynomial reasonably well -- say, an
# RMS error of (arbitrarily:  0.1% of median) -- then we should be
# able to build an approximate CDF, which maps values in poly space
# to 0..1 based upon the observed range and the poly.  The PPF must
# then map from 0..1 to the poly.
#
# Argh.  But this all means that, for a k-degree polynomial, we have
# k+3 parameters:  k+1 coefficients, and the minimum and maximum data
# values.

sub polynomial_est($) {
  my $y_ref_unsorted = shift;
  my $ct             = scalar @$y_ref_unsorted;
  my $median         = ArrayFoo::arrSelect($y_ref_unsorted,
  int(0.5*$ct));
  my $max_RMS        = 0.01*$median;  # Completely ad-hoc.

  my $min_degree     = 0;
  my $max_degree     = 5;  # Arbitrary.

  my @y_ref          = sort {$a <=> $b} @$y_ref_unsorted;
  my @x_ref          = ();

  {
    my $i=0;

    for ($i=0; $i < $ct; $i++) {
      $x_ref[$i] = ($i / $ct);
    }
  }

  my ($params, $RMS) = Numerical::polynomial_guess(\@x_ref, \@y_ref,
  $max_degree, $max_RMS);

  if (!defined($params)) {
    # No good fit found.
    return undef;
  }

  my $xf       = polynomial_gen_xf($params);
  my $inverse  = polynomial_gen_inverse($params);

  return ($xf, $params, $inverse);
}



# We have a good polynomial fit that maps from 0..1 to the original
# data.  We need to work backwards.  For that, we'll use Numerical's
# inverse_search method (odd hybrid between Newton's hope-for-locally-
# linear method, and binary search).
#
# The parameters are the polynomial coefficients in descending order
# of degree.
# search boundaries).
sub polynomial_gen_xf($) {
  my $params = shift;
  my @params_copy = @$params;
  my $degree    = scalar(@params_copy) - 1;
  my $max_coeff = $params_copy[0];
  my $step      = 0.05;

  my $fx = Numerical::polynomial_evaluator_gen(\@params_copy);
  my $min       = &$fx(0);

  if ($degree == 0) {
    # Constant.
    return sub {
      my $y = shift @_;
      return 0;
    };
  }

  # At least linear.
  return sub {
    my $y = shift @_;
    if ($y < $min) {
      return 0;
    }

    my $guess = 0;

    my $fx0 = &$fx(0);
    my $fx1 = &$fx(1);
    $guess = ($y - $fx0) / ($fx1 - $fx0);

    my $lo    = $guess - $step;
    my $hi    = $guess + $step;
    my $loval = &$fx($lo);
    my $hival = &$fx($hi);

    if ($loval < $hival) {
      while ($loval > $y) {
        $lo -= $step;
        $loval = &$fx($lo);
      }
      while ($hival < $y) {
        $hi += $step;
        $hival = &$fx($hi);
      }
    } else {
      my $temp = $lo;
      $lo = $hi;
      $hi = $temp;

      $temp = $loval;
      $loval = $hival;
      $hival = $temp;

      while ($loval > $y) {
        $lo += $step;
        $loval = &$fx($lo);
      }
      while ($hival < $y) {
        $hi -= $step;
        $hival = &$fx($hi);
      }
    }

    die unless ($y >= $loval);
    die unless ($y <= $hival);

    my $x = Numerical::inverse_search($fx, $y, $lo, $hi);
    if ($x <= 0) {
      return 0;
    }
    if ($x >= 1) {
      return 1;
    }

    return $x;
  };
}





# We have a good polynomial fit that maps from 0..1 to the original
# data.  We'll use it, quite directly.
#
# The parameters are the polynomial coefficients in descending order
# of degree.
sub polynomial_gen_inverse($) {
  my $params = shift;
  my @params_copy = @$params;

  my $fx = Numerical::polynomial_evaluator_gen(\@params_copy);

  return $fx;
}



# Apply each known transformation.  Estimate parameters.  Evaluate using
# chi-square.
#
# We return:
#
# A text tag indicating the choice of transformation.  This may be

# A number indicating the significance level.  The higher the
# significance level, the lower the rejection threshold and thus
# greater the difficulty.
#
# A list reference containing everything returned by the transformation
# estimator -- namely, the CDF, the parameters, and the PPF.
#
# If there is no good match, we return undef.
#
# This does NOT use the interpolator.
sub best_match($) {
  my $data_ref  = shift @_;
  my $best_sig  = undef;
  my $best_dist = undef;
  my $best_trio = undef;
  my $distref   = undef;
  my $Chi       = new ChiSquare;

  my @dlist = (
    +[ 'logarithmic', \&logarithmic_est ]
  );


  my $bucket_count = 64;

  if ((scalar(@$data_ref) / $bucket_count) < 5) {
    $bucket_count = int(scalar(@$data_ref) / 5);
  }

  foreach $distref (@dlist) {
    my ($name, $estimator) = @$distref;
    my ($cdf_ref, $param_ref, $ppf_ref) = &$estimator($data_ref);

    if (!defined($cdf_ref)) {
      next;
    }

    my $sig = $Chi->chi_square($data_ref, $bucket_count,
    $cdf_ref, scalar @$param_ref);
    if (defined($sig)) {
      if ((!defined($best_dist)) || ($best_sig < $sig)) {
        $best_sig  = $sig;
        $best_dist = $name;
        $best_trio = +[ $cdf_ref, $param_ref, $ppf_ref ];
      }
    }
  }

  if (defined($best_dist)) {
    return ($best_dist, $best_sig, $best_trio);
  } else {
    return undef;
  }
}



# Ugh.  If all else fails, interpolate.  This tries to fit a 100-segment
# piecewise-linear-fit.  Ewwww.
#
# The parameters are merely the data/quantile pairs.
sub interpolator_est($) {
  my $data_ref  = shift;
  my $i         = 0;
  my $ct        = scalar @$data_ref;
  my $segs      = 100;
  my @seg_data  = ();


  if ($ct < 100) {
    $segs = $ct;
  }

  for ($i=0; $i < $segs; $i++) {
    my $idx      = int($i * $ct / $segs);
    my $val      = ArrayFoo::arrSelect($data_ref, $idx);

    my $pair_ref = +[ $val, $i/$segs ];
    push @seg_data, $pair_ref;
  }


  push @seg_data, +[ $data_ref->[-1], 1];
  # Should now have '$segs+1' endpoints.

  my $xf_sub      = interpolator_gen_xf(\@seg_data);
  my $inverse_sub = interpolator_gen_inverse(\@seg_data);

  return ($xf_sub, \@seg_data, $inverse_sub);
};



sub interpolator_gen_xf($) {
  my $param_ref = shift @_;
  my $seg_data  = +[ @$param_ref ];
  my $segs      = (scalar @$seg_data)-1;

  return sub {
    my $x     = shift @_;
    my $i     = 0;
    my $left  = 0;
    my $right = $segs;

    my $lval = $seg_data->[$left];
    my $rval = $seg_data->[$right];

    # Handle extremes.
    if ($lval->[0] >= $x) {
      return 0;
    }

    if ($rval->[0] <= $x) {
      return 1;
    }

    find_interval: {
      if ($lval->[0] == $x) {
        return $lval->[1];
      }

      if ($rval->[0] == $x) {
        return $rval->[1];
      }

      # It's in-between, then.
      if ($left == ($right - 1)) {
        # A single interval; interpolate.
        my $ratio = ($x - $lval->[0]) / ($rval->[0] - $lval->[0]);
        my $diff  = $rval->[1] - $lval->[1];
        return ($lval->[1] + ($ratio * $diff));
      }

      # Guaranteed not to be either left or right, since $right-$left >= 2.
      my $mid    = int(($left+$right)/2);
      my $midval = $seg_data->[$mid];
      my $miditm = $midval->[0];

      if ($miditm < $x) {
        $left = $mid;
        $lval = $midval;
      } elsif ($miditm > $x) {
        $right = $mid;
        $rval = $midval;
      } else {
        return $midval->[1];
      }

      redo find_interval;
    }
  };
}



sub interpolator_gen_inverse($) {
  my $param_ref = shift @_;
  my $seg_data  = +[ @$param_ref ];
  my $segs      = (scalar @$seg_data)-1;

  return sub {
    my $y     = shift @_;
    my $i     = 0;
    my $left  = 0;
    my $right = $segs;

    my $lval = $seg_data->[$left];
    my $rval = $seg_data->[$right];

    # Handle extremes.
    if ($lval->[1] >= $y) {
      return 0;
    }

    if ($rval->[1] <= $y) {
      return 1;
    }

    find_interval: {
      if ($lval->[1] == $y) {
        return $lval->[0];
      }

      if ($rval->[1] == $y) {
        return $rval->[0];
      }

      # It's in-between, then.
      if ($left == ($right - 1)) {
        # A single interval; interpolate.
        my $ratio = ($y - $lval->[1]) / ($rval->[1] - $lval->[1]);
        my $diff  = $rval->[0] - $lval->[0];
        return ($lval->[0] + ($ratio * $diff));
      }

      # Guaranteed not to be either left or right, since $right-$left >= 2.
      my $mid    = int(($left+$right)/2);
      my $midval = $seg_data->[$mid];
      my $miditm = $midval->[1];

      if ($miditm < $y) {
        $left = $mid;
        $lval = $midval;
      } elsif ($miditm > $y) {
        $right = $mid;
        $rval = $midval;
      } else {
        return $midval->[0];
      }

      redo find_interval;
    }
  };
}



return 1;
