#!/usr/local/bin/perl5 -w
#
# Purpose:
#    To provide a robust least-squares line-fitter.
#
#    This was at one point part of DiskFracDim.pm, and before that,
#    was derived from code by Christos.
#
#    Use with robust_lsfit.pl provides a sample invocation.
#
##########################################################################
# $Id: Robust_LSFit.pm,v 1.3 2002/01/03 10:54:12 lw2j Exp $
# $Log:	Robust_LSFit.pm,v $
# Revision 1.3  2002/01/03  10:54:12  lw2j
# Added correlation computation function.  It's not used by the
# package, but might be useful for applications -- they can assess
# correlation over the entire data, and not just the half that's
# used by the line fitter.
# 
# Revision 1.2  2001/08/22  17:21:29  lw2j
# Untabified (tab=8).
#
# Revision 1.1  2001/05/29  18:04:21  lw2j
# Initial revision
#
##########################################################################
#
# Package Header
#
##########################################################################

package Robust_LSFit;
require Exporter;

use strict;

##########################################################################

BEGIN {
    use Exporter ();
    use Symbol;
    use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

    $VERSION     = 1.0;
    @ISA         = qw(Exporter);
    @EXPORT =  qw(
                  compute_correlation
                  robust_lsfit
                  lsfit
                  ssq
                 );
    %EXPORT_TAGS = ();
    @EXPORT_OK   = ();

    return 1;
}




# Compute the correlation of two attributes.  Both arguments
# are array references.  This is not used by this module,
# because we want to compute more than just correlations and
# it is more efficient to do so simultaneously, but it may
# still be useful.
sub compute_correlation($$) {
  my $xref  = shift;
  my $yref  = shift;
  my $x1    = undef;    # $\sum {x}$
  my $x2    = undef;    # $\sum {x}^{2}$
  my $y1    = undef;    # $\sum {y}$
  my $y2    = undef;    # $\sum {y}^{2}$
  my $xy    = undef;    # $\sum ({x} \times{y})$
  my $n     = undef;    # count
  my $i     = undef;    # index
  my $xlen  = scalar @{$xref};
  my $ylen  = scalar @{$yref};
  my $xval  = undef;
  my $yval  = undef;

  my $corr  = undef;

  if ($xlen != $ylen) {
    die "lsfit:  log_r is of length '$xlen', log_s '$ylen'";
  }

  $n = $xlen;

  $x1 = $x2 = $y1 = $y2 = $xy = 0;

  for($i=0; $i<$n; $i++) {
    $xval = $xref->[$i];
    $yval = $yref->[$i];
    $x1 += $xval;
    $y1 += $yval;
    $x2 += $xval * $xval;
    $y2 += $yval * $yval;
    $xy += $xval * $yval;
  }

  if ($n > 1) {
    # correlation coefficient r
    my $xvar = $x2 - ($x1 * $x1 / $n);
    my $yvar = $y2 - ($y1 * $y1 / $n);

    if($xvar == 0) {
      $corr = 1;
    } elsif( $yvar == 0 ){
      $corr = 1;
    } else {
      $corr = ($xy - $x1 * $y1 / $n) / sqrt($xvar) / sqrt($yvar);
    }
  } else {
    # At most one point?!
    $corr = 1;
  }

  return $corr;
}



# Taken from Christos's original fractal dimension package,
# with minor modifications for usage here.
#
# The two arguments are references to the x and y arrays,
# respectively.
#
# This is the NON-robust entry point; IOW, it uses all
# points given in the usual SSQ manner.  If you want the
# iterative robust fitter, you should use robust_lsfit.
#
# Returns (slope, correlation coefficient)
sub lsfit($$) {
  my $xref  = shift;
  my $yref  = shift;
  my $x1    = undef;    # $\sum {x}$
  my $x2    = undef;    # $\sum {x}^{2}$
  my $y1    = undef;    # $\sum {y}$
  my $y2    = undef;    # $\sum {y}^{2}$
  my $xy    = undef;    # $\sum ({x} \times{y})$
  my $n     = undef;    # count
  my $i     = undef;    # index
  my $xlen  = scalar @{$xref};
  my $ylen  = scalar @{$yref};
  my $xval  = undef;
  my $yval  = undef;

  my $a     = undef;
  my $b     = undef;
  my $r     = undef;   # y = ax+b


  if ($xlen != $ylen) {
    die "lsfit:  log_r is of length '$xlen', log_s '$ylen'";
  }

  $n = $xlen;

  $x1 = $x2 = $y1 = $y2 = $xy = 0;

  for($i=0; $i<$n; $i++) {
    $xval = $xref->[$i];
    $yval = $yref->[$i];
    $x1 += $xval;
    $y1 += $yval;
    $x2 += $xval * $xval;
    $y2 += $yval * $yval;
    $xy += $xval * $yval;
  }

  if ($n > 1) {
    $a = ($xy - $x1 * $y1 / $n) / ($x2 - $x1 * $x1 / $n);
    $b = ($y1 - $a * $x1) / $n;

    # correlation coefficient r
    my $xvar = sprintf '%.5f', $x2 - $x1 * $x1 / $n;
    my $yvar = sprintf '%.5f', $y2 - $y1 * $y1 / $n;
    if($xvar == 0) {
      $r = 1;
    } elsif( $yvar == 0 ){
      $r = 1;
    } else {
      $r = ($xy - $x1 * $y1 / $n) / sqrt($xvar) / sqrt($yvar);
    }

    # old version:
    # $r = ($xy - $x1 * $y1 / $n) / sqrt($x2 - $x1 * $x1 / $n)
    # / sqrt($y2 - $y1 * $y1 / $n);
    # print 'slope= ', $a, '    y_intcpt= ', $b, '  corr= ', $r;
  } else {
    # print 'slope= ', 9999, '  y_intcpt= ', 0, '   corr= ', 0;
    $a = 9999; $b=0; $r=0;
  }

  return( $a, $b, $r);
}                               # end lsfit



# Used to be named lts.
#
# This is the ROBUST line-fitter, which uses the regular lsfit()
# function over contiguous subsets of the data.
#
# The two arguments are references to the x and y arrays,
# respectively.
#
sub robust_lsfit($$) {
    my $xaref = shift;

    my $yaref = shift;
    my ($n, $i);
    my $xlen;
    my $ylen;
    my (@smallx, @smally);
    my ($a, $b, $r);   # slope, intercept and corr. coeff.
    my ($start, $end); # offsets of matching portion
    my $q;        # stretch to match = end-start +1

    my $diff;
    my $currentDiff ;
    my $currentStart ;
    my ( $currenta , $currentb , $currentr);


    $xlen = scalar( @$xaref);
    $ylen = scalar( @$yaref);

    $n = $xlen;

    $q = int ( $n/2) +1; #according to Venable + Ripley
    #$q = int($n/2);

    for( $start =0; $start < $n - $q + 1 ; $start ++ ){
        $end = $start + $q -1;

        # also works
        # @smallx = @{ $xaref} [ $start .. $end ];
        # @smally = @{ $yaref} [ $start .. $end ];

        my $i;
        for($i=0; $i<$q; $i++){
            $smallx[$i] = $$xaref[$i+$start];
            $smally[$i] = $$yaref[$i+$start];
        }

        ($a, $b, $r) = lsfit(\@smallx , \@smally );
        # surprisingly, \@smallx etc DOES NOT WORK!
        # $a=$b=$r= 9999999; #TST


        $diff = ssq( \@smallx, \@smally, $a, $b);
        if( !defined($currentDiff) || ($diff < $currentDiff ) ) {
            # new champion, with smaller diffs
            $currentDiff = $diff;
            $currentStart = $start;
            $currenta = $a;
            $currentb = $b;
            $currentr = $r;
        }

    }# end for

   # We return these values, but expect that somebody will
   # print them.
   #
   # y = ($currenta * x) + $currentb
   #
   # $currentr = correlation coefficient between x,y

   return( $currenta,
           $currentb,
           $currentr);

}# end robust_lsfit




# Computes the sum of squared error between a line and a set
# of points.
#
# The first two arguments are references to the x and y
# arrays.  The second two are a and b respectively, from
# the equation 'y=ax+b'.
#

sub ssq($$$$){
    my $xaref = shift;
    my $yaref = shift;
    my $a     = shift;
    my $b     = shift;
    my $xlen;
    my $ylen;
    my $res;

    # parse the parms
    $xlen = scalar( @$xaref);
    $ylen = scalar( @$yaref);

    die unless ($xlen == $ylen);

    my $n = $xlen;
    my $i;
    $res =0;
    for($i=0; $i<$n; $i++){
        my $xval = $$xaref[$i];
        my $yval = $$yaref[$i];
        $res += ($yval - $a* $xval - $b) ** 2;
    }

    return($res);
}

# Needed for require, et al.
return 1;
