: # Find perl...
eval 'exec perl5 -w -S $0 "$@"'
if 0;

#!/usr/bin/perl5 -w
#
############################################################
# Major architectural change, 2001/05/23:
#
# Renamed to Pair_Count.pm.
#
# Flat-trimming code moved to TrimFlat.p[ml]
# Line-fitting code moved to Robust_LSFit.p[ml]
#
# DiskFracDim.pm is now a shell interface that mostly
# mirrors the Pair_Count interface for compatibility
# reasons, plus references the flat-trimming and
# line-fitting.
#
# Verbose option moved to the (new) DiskFracDim.pm
# to retain the old semantics (listing the *trimmed*
# log/log points, which are no longer accessible
# at this level).
#
# The logarithms are still base 2.
#
############################################################
# $Id: Pair_Count.pm,v 1.9 2002/01/03 10:53:35 lw2j Exp $
#
# $Log:	Pair_Count.pm,v $
# Revision 1.9  2002/01/03  10:53:35  lw2j
# Untabified.
# 
# Revision 1.8  2001/08/22  17:28:07  lw2j
# Dropped obsoleted variables.
#
# Revision 1.7  2001/08/22  17:21:29  lw2j
# Untabified (tab=8).
#
# Revision 1.6  2001/08/06  10:21:28  lw2j
# Fixed just-introduced double-incrementing idx bugs (came in with
# loop termination changes...), and also reset radius to (middle-1)
# in all_pairs_int before reducing it.
#
# Revision 1.5  2001/08/03  11:20:26  lw2j
# Changed occupancy_max and singleton_max to o_max and s_max, for
# consistency reasons.
#
# Revision 1.4  2001/08/03  11:10:09  lw2j
# Added support for maximum occupancy (in single cell) checking.
# Differentiated between, and cleaned up code for, determining
# whether log-log data should be recorded (i.e. is it within
# singleton/densest limits?) and whether loops should continue
# (within the appropriate occupancy limit, and radius limits?).
#
# Revision 1.3  2001/08/02  16:25:06  lw2j
# Added support for cutting off the radius reduction once there
# are too many singletons (objects in their own cells).  In
# addition, the log-log data will not be recorded too early
# when the "midpoint" is in fact well below the minimum
# meaningful radius (either by the singleton rule, or the
# absolute cutoff (EVERYBODY in single cells)).
#
# Revision 1.2  2001/05/29  18:19:25  lw2j
# Removed some "defined(@array)" checks as they're deprecated.
#
# Revision 1.1  2001/05/29  18:04:21  lw2j
# Initial revision
#
# Revision 1.7  2001/02/28  13:25:19  lw2j
# Changed
#   unshift @log_r,  log($radius);
# to
#   unshift @log_r,  log($radius)/$log2;
#
# in line 640 of DiskFracDim.pm; it must have been missed when
# converting logs to log_{2}.
#
# Revision 1.6  2001/01/26  12:09:14  lw2j
# Fixed typo in comment for lts().
#
# Revision 1.5  2000/11/12  14:41:06  lw2j
# Changed cell ID computation scheme; it now uses packed ulongs
# instead of ASCII.  Max dimensionality should then be a bit higher.
#
# Revision 1.4  2000/10/24  18:00:46  lw2j
# Now supports $q=1.  Logarithms now base 2.
#
# Revision 1.3  2000/10/06  19:19:44  lw2j
# Fixed bugs involving q != 2.  Barred q=1, for now.
#
# Revision 1.2  2000/10/03  14:08:01  lw2j
# Added magic perl invoker.
#
# Revision 1.1  2000/07/18  13:27:11  lw2j
# Initial revision
#
############################################################
#
# Purpose:
#  To provide simple code for computing the 'correlation' fractal
#  dimension $D_{q}$.
#
#  It's {\em very} simple code, so the data is expected to be accessed
#  via a wrapper, for reasons of flexibility.  This code is
#
#                   ***** NOT *****
#
#  optimized for speed in an industrial sense.
#
#  That is, the module accepts as input an object reference.  This
#  object reference must support:
#
#     get_count($)       # return how many items there are
#     get_dims($)        # return embedding dimensionality
#     get_obj($$)        # return the object by 0-based index, as an
#                        # array of scalars
#
#  The object should not change while we're processing it...
#
#  This object ref will be 'data_o' below.
#
#  Other parameters that should be worried about:
#
#     method             # 'fast' (inexact box-count) or 'slow'
#                        # (iterate over pairwise distances, but
#                        #  more exact; DO NOT USE if you care
#                        # about speed)
#     r_min              # minimum radius for our approximation
#     r_max              # maximum radius for our approximation
#     r_count            # maximum number of radii to use (>= 1)
#     intervals          # if defined (default 20) and the method
#                        # is 'slow', the log-distance axis is
#                        # divided into this many intervals, and
#                        # the point with the LOWEST pair-count is
#                        # chosen for the line-fitting if any
#                        # exists.
#     q                  # normally 2, the degree
#     s_max              # ratio (0..1) for the *max* fraction
#                        # of occupied cells being singletons.
#                        # That is, the radius will STOP being
#                        # decreased if this fraction of all
#                        # objects in the database have their
#                        # own cells.  To emulate older behavior,
#                        # set this to 1.
#                        #
#                        # The purpose of this is to avoid certain
#                        # nasty side-effects of having very close
#                        # (worst:  coincident) objects, especially
#                        # when most of the objects are NOT close.
#                        # The flat-trimming code is not designed
#                        # to handle multiple flat regions, and yet
#                        # such a data set will often result in
#                        # such a "mesa" between the lowest
#                        # meaningful radius and the largest "tiny"
#                        # range.  This in turn could result in an
#                        # obviously bogus FD of exactly 0.
#
#     o_max              # ratio (0..1) for the maximum occupancy
#                        # of any given grid cell, occupancy defined
#                        # as the percentage of objects included
#                        # out of all the data.  This value is meant
#                        # to prevent certain errors (similar to
#                        # singleton_max, but probably far less
#                        # likely) involving *very* distant outliers.
#
#
#  *** NOTE ***
#
#     This module has been modified significantly; it now stops
#     the computation upon calculating the log-log points.  In
#     other words, it no longer removes flat regions, calculates
#     the least-squares fit, or actually give the fractal dimension
#     itself.  Therefore, another program/module needs to compute
#     the actual FD.  When $q=2, it's equal to the pair-count
#     slope.  When $q=1$, ditto.  When $q is something else,
#     the slope needs to be divided by $q-1.
#
#
#  The default values tend to be fairly sane, but if the data
#  you have has ranges significantly different than mine, you
#  may want to adjust the radii (for instance).  See
#  'set_default_params'.
#
#  Logarithms, incidentally, should now be base 2 throughout.
#  In cases where the scaling applies to both axes, it does
#  not matter from the point of determining fractal dimension --
#  since it does not affect the slope -- but it does matter
#  in terms of absolute numbers, and for $q=1 (based on
#  entropy, which normally uses bits).
#
#  The methods of greatest concern are new, set_params, clear
#  and fracdim.
#
#  Sample usage:
#    See 'FracTest.pl', which attempts to load a data file 'tri'.
#
#
#
#  WARNING re: high-dimensional sets:
#     Old "cell ID" computations -- needed in box_put and
#     box_put_int -- have changed format.
#
#     It used to be that a cell, which corresponds to an
#     N-tuple of integers, with each dimension ranging from
#        (0, ..., (max-min)/radius)
#     was identified simply by the concatenation of this
#     tuple separated by single spaces.
#
#     This has now changed to native-format unsigned longs
#     (i.e. binary format), in order to resolve an apparent
#     issue involving many dimensions; there may be a limit
#     on the size of a DBM key.  In any event, this change
#     seems to have fixed anomalous behavior regarding not
#     being able to retrieve a key/value pair that had just
#     been fixed.
#
#     If so, this will still break for many dimensions
#     depending on the limit, in which case one may need to
#     generate something like an MD5 hash.  These message-
#     digest functions, however, are not 1:1 and therefore
#     doing so discards any guarantee of correctness.  In
#     addition, this breaks the optimizations (namely the
#     box_put_int + all_boxes_int cycle) because it depends
#     on being able to invert the function, so that it can
#     iterate over keys/value pairs instead of data items.
#
#
##########################################################################
#
# Package Header
#
##########################################################################

package Pair_Count;
require Exporter;


##### <change_me>
#
# Change this to where you placed the modules.
#
use lib '/afs/cs.cmu.edu/user/lw2j/shared/FracDim/FracDim';
##### </change_me>

use strict;
use DB_File;
use POSIX;
use mktemp;      # temporary file support (for boxcount database)

##########################################################################

BEGIN {
    use Exporter ();
    use Symbol;
    use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

    $VERSION     = 1.0;
    @ISA         = qw(Exporter);
    @EXPORT =  qw(
                  new
                  clear
                  get_param
                  set_params
                  pair_count

                  set_default_params

                  compute_ranges
                  box_put
                  box_count
                  all_boxes
                  all_boxes_int
                  all_pairs

                  filter_points
                 );
    %EXPORT_TAGS = ();
    @EXPORT_OK   = ();

    return 1;
}



# new:  creates blessed Pair_Count objects with default params
#
# Standard usage:
#
# my $fracObj = Pair_Count::new();
#
# Extended usage:
#
# my $fracObj = Pair_Count::new( param => value, param => value, ... ).
sub new($@) {
    my $class = shift;
    my $self  = +{};

    bless $self, $class;

    $self->clear();
    $self->set_default_params();
    $self->set_params(@_);

    # An on-disk btree is used to store box-counts in the 'fast'
    # method -- it has to be on-disk, because in higher
    # dimensionalities with large data sets the size of the hash
    # table could get *very* large.

    $DB_BTREE->{cachesize}=100000;

    return $self;
}



# Will die() unless asked for a VALID parameter (Value just
# needs to exist; it can be undef).

sub get_param($$) {
    my $self = shift;
    my $pnam = shift;

    # THIS much at least should exist...
    die unless exists($self->{'params'});

    # As should the parameter.
    if (!exists($self->{'params'}->{$pnam})) {
        die "get_param() queried for bogus parameter:  '$pnam'";
    } else {
        return ($self->{'params'}->{$pnam});
    }
}


sub set_default_params($) {
    my $self = shift;

    # default:  2^-20 to 2^18, should result in a multiplier of 2
    #
    # I *highly* advise choosing values that give integral
    # multipliers, as various optimizations can't happen
    # otherwise...
    #
    # ...and, actually, certain theoretical guarantees (such
    # as monotonicity!) will break if you don't.
    $self->{'params'} =
        +{
            'q'        => 2,
            normalize  => 0,
            r_min      => 0.00000095367431640625,
            r_max      => 262144,
            r_count    => 39,
            intervals  => 20,
            method     => 'fast',
            data_o     => undef,
            s_max      => 0.95,
            o_max      => 0.95
         };
}


# Forget about old data.  If you only call 'new' once, and
# use set_params to switch data objects, you'll want to
# clear() the old data out of there.
#
# Standard usage:
#
# $fdObj->clear();
sub clear($) {
    my $self = shift;

    # Forget about the old data.
    if (exists($self->{'data'})) {
        if (exists($self->{'data'}->{'old_dbref'})) {
            delete $self->{'data'}->{'old_dbref'};
            untie %{$self->{'data'}->{'old_boxdata'}};
            untemp $self->{'data'}->{'old_tnam'};
        }
    }

    $self->{'data'}       = +{};

    return $self;
}




# Set zero or more parameters, doing some minimal sanity checking.
#
# Standard usage:
#
# $fdObj->set_params('q' => 2, r_count => 200);
sub set_params($@) {
    my $self  = shift;
    my $param = undef;
    my $arg   = undef;

    while (defined($param = shift)) {
        $arg = shift;

        if (!exists($self->{'params'}->{$param})) {
            die "Unknown param/arg pair '$param':'$arg'";
        }

        $self->{'params'}->{$param} = $arg;
    }

    if (exists($self->{'params'}->{'method'})) {
        my $method = $self->{'params'}->{'method'};

        if (($method ne 'fast') && ($method ne 'slow')) {
            die "Parameter 'method' must be either 'fast' or 'slow'.";
        }

        if ($method eq 'slow') {
            if (defined($self->{'params'}->{'intervals'}) &&
                ($self->{'params'}->{'intervals'} < 10)) {

                # Lower this threshold if you REALLY want to.  Do NOT use
                # just 1, however.
                die "Using below 10 intervals is REALLY not recommended if you're doing the full pairwise method anyway.";
            }
        }
    } else {
        die "The 'method' parameter is mandatory.";
    }

    return $self;
}


# Returns two references corresponding to (copied) arrays
# containing the log_radius and log_count data.
#
# The data is not automatically clear()-ed afterwards.
sub pair_count($) {
  my $self  = shift;

  $self->compute_ranges();

  if ($self->{'params'}->{'method'} eq 'fast') {
      $self->all_boxes();
  } elsif ($self->{'params'}->{'method'} eq 'slow') {
      $self->all_pairs();
  } else {
      die "You really shouldn't reach this point.";
  }

  if ($self->{'params'}->{'method'} eq 'slow') {
      if (defined($self->{'params'}->{'intervals'})) {
          $self->filter_points();
      }
  }

  my @log_radius_copy = @{$self->{'data'}->{'log_r'}};
  my @log_count_copy = @{$self->{'data'}->{'log_s'}};

  return (\@log_radius_copy, \@log_count_copy);
}


# Compute the minimum and maximum for each dimension.
# It's required for normalization purposes.
#
# Standard usage:
#
# $fdObj->compute_ranges();
sub compute_ranges($) {
    my $self   = shift;
    my $dims   = undef;
    my $data_o = undef;
    my $count  = undef;
    my $obj_idx = undef;

    my @lows    = ();
    my @highs   = ();

    if (!exists($self->{'params'}->{'data_o'})) {
        die "Corrupted object, doesn't even have an undef data_o";
    }

    if (!exists($self->{'params'}->{'data_o'})) {
        die "Incomplete object, needs a defined data_o reference.";
    }

    $data_o = $self->{'params'}->{'data_o'};
    $dims   = $data_o->get_dims();
    $count  = $data_o->get_count();

    $self->{'data'}->{'dims'}  = $dims;
    $self->{'data'}->{'count'} = $count;

    for ($obj_idx = 0; $obj_idx < $count; $obj_idx++) {
        my @obj_data = undef;
        my $dim_idx  = undef;

        @obj_data = $data_o->get_obj($obj_idx);

        (scalar(@obj_data) == $dims) || die "Unexpected size for obj $obj_idx";

        if ($obj_idx == 0) {
            @lows = @obj_data;
            @highs = @obj_data;
        } else {
            for ($dim_idx=0; $dim_idx < $dims; $dim_idx++) {
                if ($lows[$dim_idx] > $obj_data[$dim_idx]) {
                    $lows[$dim_idx] = $obj_data[$dim_idx];
                }

                if ($highs[$dim_idx] < $obj_data[$dim_idx]) {
                    $highs[$dim_idx] = $obj_data[$dim_idx];
                }
            }
        }
    }

    $self->{'data'}->{'range_low'}  = \@lows;
    $self->{'data'}->{'range_high'} = \@highs;

    return $self;
}




# Compute the box-count $q$-powered occupancy for a given radius.
# This is the non-optimized version.
#
# Standard usage:
#
# $fdObj->box_count(0.2);
#
# It stores the $S_{q}(r)$ value inside the object.
sub box_put($$) {
    my $self   = shift;
    my $radius = shift;
    my $data_o = undef;
    my $dims   = undef;
    my $count  = undef;
    my $q      = undef;
    my @highs  = undef;
    my @lows   = undef;

    my %table   = ();
    my $normalize = $self->{'params'}->{'normalize'};
    my $obj_idx = undef;

    if (!exists($self->{'data'}->{'box'})) {
        $self->{'data'}->{'box'} = +{};
    }


    my ($tfh, $tnam) = mktemp();
    my $dbh = tie %table, 'DB_File', $tnam, O_RDWR|O_TRUNC, 0700, $DB_BTREE;


    if (!exists($self->{'params'}->{'data_o'})) {
        die "Corrupted object, doesn't even have an undef data_o";
    }

    if (!exists($self->{'params'}->{'data_o'})) {
        die "Incomplete object, needs a defined data_o reference.";
    }

    $data_o = $self->{'params'}->{'data_o'};
    $q      = $self->{'params'}->{'q'};
    $dims   = $data_o->get_dims();
    $count  = $data_o->get_count();

    @lows   = @{$self->{'data'}->{'range_low'}};
    @highs  = @{$self->{'data'}->{'range_high'}};

    for ($obj_idx=0; $obj_idx < $count; $obj_idx++) {
        my @obj_data = $data_o->get_obj($obj_idx);
        my $cell_tag = undef;
        my $dim_idx  = undef;
        my @tag_data = ();

        (scalar(@obj_data) == $dims) || die "Unexpected size for obj $obj_idx";

        if ($normalize != 0) {
            # normalize to a hypercube
            for ($dim_idx=0; $dim_idx < $dims; $dim_idx++) {
                if ($lows[$dim_idx] == $highs[$dim_idx]) {
                    $obj_data[$dim_idx] = 0;
                } else {
                    $obj_data[$dim_idx] =
                        ($obj_data[$dim_idx] - $lows[$dim_idx]) /
                        ($highs[$dim_idx] - $lows[$dim_idx]);
                }
            }
        }

        # compute a cell tag
        for ($dim_idx=0; $dim_idx < $dims; $dim_idx++) {
            $tag_data[$dim_idx] = pack("L", floor(($obj_data[$dim_idx]
                                         - $lows[$dim_idx])
                                        / $radius));
        }

        $cell_tag = join('', @tag_data);

        my $temp;

        if ($dbh->get($cell_tag, $temp)) {
            $table{$cell_tag} = 1;
        } else {
            $table{$cell_tag}++;
        }
    }

    $self->{'data'}->{'boxdata'} = \%table;
    $self->{'data'}->{'dbref'}   = $dbh;
    $self->{'data'}->{'tnam'}    = $tnam;

    $dbh->sync();
}


# Call AFTER a box_put, to find out how many were in each cell.
#
# The box occupancy data is PRESERVED, in the 'old' slot.  This
# permits a certain optimization when the radius multiplier is an
# integer.
#
# If $q==1, this should return -H, where H is the entropy /
# Shannon information value, in bits.
#
# If $q!=1, it returns the sum of the $q-powers of the
# occupancies.
#
# ADDITION:
#   When in list context, it returns the numbers of singletons
#   and occupied cells, respectively, as well as the total.
#
# ADDITION:
#   It'll also now report the *highest* number of objects in
#   any given cell.  So it's
#    (total, singletons, total, highest)
#   if in a LIST context.

sub box_count($$) {
    my $self   = shift;
    my $radius = shift;
    my $cell   = undef;
    my $occ    = undef;
    my $tref   = $self->{'data'}->{'boxdata'};
    my $dbh    = $self->{'data'}->{'dbref'};
    my $q      = $self->{'params'}->{'q'};
    my $total  = 0;

    my $singletons  = 0;
    my $occupied    = 0;
    my $densest = 0;

    defined($tref) || die 'Failed to actually count boxes...';

    if ($q != 1) {
        while (($cell, $occ) = each %{$tref}) {
            my $buf;
            if (!($dbh->get($cell, $buf))) {
                $total += pow($occ, $q);

                if ($occ == 1) {
                    $singletons++;
                }

                $densest = ($densest > $occ) ? $densest : $occ;

                $occupied++;
            }
        }
    } else {
        # q=1 is based on entropy.  Negative entropy, actually.
        my $count = $self->{'data'}->{'count'};

        while (($cell, $occ) = each %{$tref}) {
            my $buf;

            if (!($dbh->get($cell, $buf))) {
                my $prob = $occ / $count;
                $total += $prob * log($prob);

                if ($occ == 1) {
                    $singletons++;
                }

                $densest = ($densest > $occ) ? $densest : $occ;
                $occupied++;
            }
        }

        $total *= 1 / log(2);
    }

    defined($self) || die;
    defined($radius) || die;
    defined($total) || die;

    $self->{'box'}->{$radius} = $total;

    if (exists($self->{'data'}->{'old_dbref'})) {
        delete $self->{'data'}->{'old_dbref'};
        untie %{$self->{'data'}->{'old_boxdata'}};
        untemp $self->{'data'}->{'old_tnam'};
    }

    $self->{'data'}->{'old_dbref'}   = $self->{'data'}->{'dbref'};
    $self->{'data'}->{'old_boxdata'} = $self->{'data'}->{'boxdata'};
    $self->{'data'}->{'old_tnam'}    = $self->{'data'}->{'tnam'};

    delete $self->{'data'}->{'dbref'};
    delete $self->{'data'}->{'boxdata'};
    delete $self->{'data'}->{'tnam'};

    # BUGFIX:  It used to return $self, not $total (as
    # suggested by the comments).

    return wantarray() ?
        ($total, $singletons, $occupied, $densest) :
        $total;
}




# Compute the box-counts for every applicable radius.
#
# Standard usage:
#
# $fdObj->all_boxes();
#
# It stores the log/log data...
sub all_boxes($) {
    my $self        = shift;
    my $radius_min  = $self->{'params'}->{'r_min'};
    my $radius_max  = $self->{'params'}->{'r_max'};
    my $radius_ct   = $self->{'params'}->{'r_count'};
    my $q           = $self->{'params'}->{'q'};
    my $radius_mult = 1;
    my $radius      = $radius_min;
    my $idx         = 0;
    my @log_r       = ();
    my @log_s       = ();
    my $log2        = log(2);
    my $singleton_max = $self->{'params'}->{'s_max'};
    my $densest_max   = $self->{'params'}->{'o_max'};

    if ($radius_ct > 1) {
        $radius_mult = exp(log($radius_max / $radius_min) / ($radius_ct - 1));
    } else {
        die "Trying to compute a fractal dimension with but a single radius is rather pointless.";
    }

    # Hey, it's an integer multiplier.  Take advantage of grid
    # alignment optimizations.
    if ($radius_mult == int($radius_mult)) {
        return $self->all_boxes_int();
    }

    # WARNING:
    #
    # *NOT* using an integral multiplier breaks certain theoretical
    # guarantees.  This occurs because the grids do NOT overlap in
    # such a way that they nest perfectly.  If they did, then any
    # two objects, once in the same cell, are in the same cell for
    # every larger radius in the series.  Without this guarantee,
    # the log-log series is not necessarily even monotonic, let
    # alone accurate.

    my @radii = ();

    # Compute all radii.
    for ($idx=0; $idx < $radius_ct; $idx++, $radius *= $radius_mult) {
        push @radii, $radius;
    }

    my $mid     = int($radius_ct/2);

    # If $q != 1:
    #
    #   We will stop reducing the radius when every object is in the
    #   same cell.  Regardless of $q, the summation is $count since
    #   it's $\sum 1^{q}$.
    #
    #   We stop increasing when they're all in the same box -- hence,
    #   $count^{q}$.
    #
    # If $q == 1:
    #    box_count returns the *negative* entropy.
    #
    #   When the radius is small enough, again every object is in
    #   same cell.  Then, the summation is
    #
    #       log_2 (1/$count);
    #
    #   When every object is in the same cell, then it's 0
    #  (log_2 1)


    my $count   = $self->{'data'}->{'count'};  # all in separate boxes
    my $min_bc  = $count;                      # all in separate boxes
    my $max_bc  = pow($count, $q);             # all in one box

    if ($q==1) {
        $min_bc = log(1/$count)/$log2;
        $max_bc = 0;
    }

    $idx = $mid;

    # Starting in the middle, decrease radius until all points are
    # in individual cells and stay that way (grid alignment issues),
    # or we hit the minimum.
    find_low:  {
        $radius = $radii[$idx];

        $self->box_put($radius);
        my ($singletons,
            $occupied,
            $densest) = ($self->box_count($radius))[1,2,3];

        my $s = $self->{'box'}->{$radius};

        # Record the log-log numbers... if it's within the
        # occupancy criteria.  Hopefully we'll reduce nasty
        # outlier effects.

        if (($singletons <= ($occupied * $singleton_max)) &&
            ($densest    <= ($count    * $densest_max))) {
            if ($q != 1) {
                unshift @log_s,  log($s)/$log2;
            } else {
                unshift @log_s,  $s;
            }
            unshift @log_r,  log($radius)/$log2;
        }

        # We'll terminate when the singleton criteria is met,
        # or we run out of radii (at the lower extreme).
        # The '$s == $min_bc' criteria should be obsolete,
        # but we'll leave it in in case the $singleton_max
        # value is somehow confused.

        if (($s == $min_bc) ||
            ($idx == 0)     ||
            ($singletons > ($occupied * $singleton_max))) {
            last find_low;
        } else {
            $idx--;
            redo find_low;
        }
    }

    $idx = $mid+1;

    # Increase until they're all in a single cell, or we hit the max.
    if ($idx != $radius_ct) {
      find_high:  {
          $radius = $radii[$idx];

          $self->box_put($radius);
          my ($singletons, $occupied, $densest) =
              ($self->box_count($radius))[1,2,3];

          my $s = $self->{'box'}->{$radius};


          # Record the log-log numbers... if it's within the
          # occupancy criteria.  Hopefully we'll reduce nasty
          # outlier effects.

          if (($singletons <= ($occupied * $singleton_max)) &&
              ($densest    <= ($count    * $densest_max))) {
              if ($q != 1) {
                  push @log_s,  log($s)/$log2;
              } else {
                  push @log_s,  $s;
              }
              push @log_r,  log($radius)/$log2;
          }

          # Should I stay or should I go?

          if (($s == $max_bc) || (($idx+1) == $radius_ct) ||
              ($densest > ($count * $densest_max))) {
              last find_high;
          } else {
              $idx++;
              redo find_high;
          }
      }
    }

    $self->{'data'}->{'log_r'} = \@log_r;
    $self->{'data'}->{'log_s'} = \@log_s;

    if (exists($self->{'data'}->{'old_dbref'})) {
        delete $self->{'data'}->{'old_dbref'};
        untie %{$self->{'data'}->{'old_boxdata'}};
        untemp $self->{'data'}->{'old_tnam'};
    }

    return $self;
}




# If we have an integer radius multiplier, and normalize the same
# data in the same way... and we have the previous box count data,
# we can generate the next one faster -- iterate over occupied
# cells instead of objects.  This'll work because with an integer
# multiplier, we can overlay larger grids on the existing grid
# pattern.
#
# It requires, and assumes, that the radius is the previous *
# the radius multiplier -- otherwise this optimization is
# impossible.
sub box_put_int($) {
    my $self       = shift;
    my $cell       = undef;
    my $occ        = undef;
    my $old_tref   = $self->{'data'}->{'old_boxdata'};
    my $old_dbh    = $self->{'data'}->{'old_dbref'};
    my $q          = $self->{'params'}->{'q'};
    my %table      = ();

    my $radius_mult = $self->{'data'}->{'radius_mult'};

    defined($old_tref) || die 'Failed to actually count boxes...';

    my ($tfh, $tnam) = mktemp();
    my $dbh = tie %table, 'DB_File', $tnam, O_RDWR|O_TRUNC, 0700, $DB_BTREE;

    while (($cell, $occ) = each %{$old_tref}) {
        my $buf;

        if (!($old_dbh->get($cell, $buf))) {
            # occupied cell

            my @cell_bytes = split '', $cell;
            my @cell_idx   = ();

            while (scalar(@cell_bytes) > 0) {
                my $val = join('', splice(@cell_bytes, 0, 4));

                push @cell_idx, pack("L", floor(join('',
                       unpack("L", $val)) / $radius_mult));
            }

            my $newbuf;
            my $newcell = join '', @cell_idx;

            if ($dbh->get($newcell, $newbuf)) {
                $table{$newcell}  = $buf;
            } else {
                $table{$newcell} += $buf;
            }
        }
    }

    $self->{'data'}->{'boxdata'} = \%table;
    $self->{'data'}->{'dbref'}   = $dbh;
    $self->{'data'}->{'tnam'}    = $tnam;

    $dbh->sync();

    return $self;
}




# This version is ONLY meant for those with integral radius_mult.
#
# It's *almost* identical to all_boxes, except
#
# a) it needs to store radius_mult in 'data';
# b) it 'primes the pump' with one box_put/box_count with the
#    middle radius
# c) when INCREASING the radius, it can take advantage of grid
#    alignment coincidence and use box_put_int (which iterates
#    over occupied cells, rather than raw database objects).
sub all_boxes_int($) {
    my $self        = shift;
    my $radius_min  = $self->{'params'}->{'r_min'};
    my $radius_max  = $self->{'params'}->{'r_max'};
    my $radius_ct   = $self->{'params'}->{'r_count'};
    my $q           = $self->{'params'}->{'q'};
    my $radius_mult = 1;
    my $radius      = $radius_min;
    my $idx         = 0;
    my @log_r       = ();
    my @log_s       = ();
    my $log2        = log(2);
    my $singleton_max = $self->{'params'}->{'s_max'};
    my $densest_max   = $self->{'params'}->{'o_max'};


    if ($radius_ct > 1) {
        $radius_mult = exp(log($radius_max / $radius_min) / ($radius_ct - 1));
    } else {
        die "Trying to compute a fractal dimension with but a single radius is rather pointless.";
    }


    # Needed by box_put_int.
    $self->{'data'}->{'radius_mult'} = $radius_mult;

    my @radii = ();

    # Compute all radii.
    for ($idx=0; $idx < $radius_ct; $idx++, $radius *= $radius_mult) {
        push @radii, $radius;
    }



    # If $q != 1:
    #
    #   We will stop reducing the radius when every object is in the
    #   same cell.  Regardless of $q, the summation is $count since
    #   it's $\sum 1^{q}$.
    #
    #   We stop increasing when they're all in the same box -- hence,
    #   $count^{q}$.
    #
    # If $q == 1:
    #    box_count returns the *negative* entropy.
    #
    #   When the radius is small enough, again every object is in
    #   same cell.  Then, the summation is
    #
    #       log_2 (1/$count);
    #
    #   When every object is in the same cell, then it's 0
    #  (log_2 1)

    my $count   = $self->{'data'}->{'count'};  # all in separate boxes
    my $min_bc  = $count;                      # all in separate boxes
    my $max_bc  = pow($count, $q);             # all in one box

    if ($q==1) {
        $min_bc = log(1/$count)/$log2;
        $max_bc = 0;
    }

    # DIFFERENT:  box_put_int needs to do one 'normal' invocation
    # to prime the pump.
    #

    my $mid     = int($radius_ct/2);
    $idx=$mid;

    {
        $radius=$radii[$idx];

        $self->box_put($radius);
        my ($singletons, $occupied, $densest) =
            ($self->box_count($radius))[1,2,3];

        my $s = $self->{'box'}->{$radius};

        # Record it if it's within the occupancy limits.
        if (($singletons <= ($occupied * $singleton_max)) &&
            ($densest    <= ($count    * $densest_max))) {
            if ($q != 1) {
                unshift @log_s,  log($s)/$log2;
            } else {
                unshift @log_s,  $s;
            }
            unshift @log_r,  log($radius)/$log2;
        }
    }

    $idx++;

    # Increase until they're all in a single cell, or we hit the max.
    #
    # We go forwards, first, using all_boxes_int, to take advantage
    # of perfect grid alignment.
    #
    if ($idx != $radius_ct) {
      find_high:  {
          $radius = $radii[$idx];

          # box_put_int iterates over occupied cells instead
          # of objects.  This, in theory, should be significantly
          # faster, since the occupied cells should be far
          # fewer.
          $self->box_put_int($radius);
          my ($singletons, $occupied, $densest) =
              ($self->box_count($radius))[1,2,3];

          my $s = $self->{'box'}->{$radius};


          # Record the log-log numbers... if it's within the
          # occupancy criteria.  Hopefully we'll reduce nasty
          # outlier effects.

          if (($singletons <= ($occupied * $singleton_max)) &&
              ($densest    <= ($count    * $densest_max))) {
              if ($q != 1) {
                  push @log_s,  log($s)/$log2;
              } else {
                  push @log_s,  $s;
              }
              push @log_r,  log($radius)/$log2;
          }

          # Should I stay or should I go?

          if (($s == $max_bc) || (($idx+1) == $radius_ct) ||
              ($densest > ($count * $densest_max))) {
              last find_high;
          } else {
              $idx++;
              redo find_high;
          }
      }
    }

    $idx = $mid-1;

    # Starting in the middle, decrease radius until all points are
    # in individual cells and stay that way (grid alignment issues),
    # or we hit the minimum.

    if ($idx > 0) {
      find_low:  {
          $radius = $radii[$idx];

          $self->box_put($radius);
          my ($singletons,
            $occupied,
              $densest) = ($self->box_count($radius))[1,2,3];

          my $s = $self->{'box'}->{$radius};

          # Record the log-log numbers... if it's within the
          # occupancy criteria.  Hopefully we'll reduce nasty
          # outlier effects.

          if (($singletons <= ($occupied * $singleton_max)) &&
              ($densest    <= ($count    * $densest_max))) {
              if ($q != 1) {
                  unshift @log_s,  log($s)/$log2;
              } else {
                  unshift @log_s,  $s;
              }
              unshift @log_r,  log($radius)/$log2;
          }

          # We'll terminate when the singleton criteria is met,
          # or we run out of radii (at the lower extreme).
          # The '$s == $min_bc' criteria should be obsolete,
          # but we'll leave it in in case the $singleton_max
          # value is somehow confused.

          if (($s == $min_bc) ||
              ($idx == 0)     ||
              ($singletons > ($occupied * $singleton_max))) {
              last find_low;
          } else {
              $idx--;
              redo find_low;
          }
      }
    }

    $self->{'data'}->{'log_r'} = \@log_r;
    $self->{'data'}->{'log_s'} = \@log_s;

    if (exists($self->{'data'}->{'old_dbref'})) {
        delete $self->{'data'}->{'old_dbref'};
        untie %{$self->{'data'}->{'old_boxdata'}};
        untemp $self->{'data'}->{'old_tnam'};
    }

    return $self;
}



# If the packages is invoked with
#
#   method => 'slow'
#
# we compute all pairwise distances and sort them to generate
# log/log data.  This is both slow and memory-intensive, as we
# no longer rely on temporary files, and will end up doing an
# in-memory sort of all pairwise distances.
#
# The parameter $q is ignored; this is aimed soley at equivalence
# to the fractal dimension.
#
# Straight Euclidean distance is used.
#
# Just to emphasize:
#
#     **** *    **** *   *
#     *    *    *  * *   *
#     **** *    *  * * * *
#        * *    *  * * * *
#     **** **** ****  * *
#
# DO NOT USE on anything except small data sets, unless you have
# a lot of time and memory.

sub all_pairs($) {
    my $self      = shift;
    my $ct        = $self->{'data'}->{'count'};
    my $dims      = $self->{'data'}->{'dims'};
    my $data_obj  = $self->{'params'}->{'data_o'};
    my @distances = ();
    my $i         = 0;
    my $zero    = 0;
    my $log2    = log(2);

    for ($i=0; $i < ($ct-1); $i++) {
        my @i_obj = $data_obj->get_obj($i);
        my $j     = $i+1;

        for ($j=$i+1; $j < $ct; $j++) {
            my $dist  = 0;
            my @j_obj = $data_obj->get_obj($j);
            my $k     = 0;

            for ($k=0; $k < $dims; $k++) {
                $dist += ($i_obj[$k] - $j_obj[$k]) ** 2;
            }

            $dist = sqrt($dist);

            if ($dist == 0) {
                $zero++;
            } else {
                push @distances, $dist;
            }
        }
    }

    @distances = sort { $a <=> $b } @distances;

    my $cumulative = $zero;
    my $dist       = 0;

    my $log_r_ref = ($self->{'data'}->{'log_r'} = +[]);
    my $log_s_ref = ($self->{'data'}->{'log_s'} = +[]);

    distloop:  {
        if (scalar(@distances) < 1) {
            last distloop;
        }

        my $this_dist = shift @distances;
        $cumulative++;

      matchloop:  {
          if (scalar(@distances) < 1) {
              last matchloop;
          }

          if ($distances[0] == $this_dist) {
              $cumulative++;
              shift @distances;
              redo matchloop;
          }

          push @$log_r_ref, log($this_dist)/$log2;
          push @$log_s_ref, log($cumulative)/$log2;
      }

        redo distloop;
    }

    return $self;
}




# If the method is 'slow' (namely, pairwise evaluation), we expect
# a certain density bias towards the upper end.  This function
# attempts to alleviate that by dividing the data into intervals
# according to log(pairwise distance), and picking at most one
# point from each interval.

sub filter_points($) {
    my $self  = shift;
    my @log_r = @{$self->{'data'}->{'log_r'}};
    my @log_s = @{$self->{'data'}->{'log_s'}};
    my $intervals = $self->{'params'}->{'intervals'};

    defined($intervals) || die "Shouldn't be here without defining param.";

    (scalar(@log_r) > 0) || die "Sanity check failed!";
    (scalar(@log_s) == (scalar(@log_r))) ||
     die "Sanity check failed!";

    my $min_dist  = $log_r[0];
    my $max_dist  = $log_r[-1];
    my @new_log_r = ($log_r[0]);
    my @new_log_s = ($log_s[0]);
    my $interval  = 1;
    my $idx       = 1;
    my $max       = scalar(@log_r);


    if ($intervals < 2) {
        # Why are you doing this???  WHY??

        $self->{'data'}->{'log_r'} = \@new_log_r;
        $self->{'data'}->{'log_s'} = \@new_log_s;
        return $self;
    }

    my $int_step  = ($max_dist - $min_dist) / $intervals;
    my $int_min   = $int_step + $min_dist;

    for ($interval=1, $idx=1; ($interval < $intervals) &&
         ($idx < $max); $idx++) {

        my $log_r = $log_r[$idx];
        my $log_s = $log_s[$idx];

        if ($log_r >= $int_min) {
            push @new_log_r, $log_r;
            push @new_log_s, $log_s;

          find_next_interval:  {
              $int_min += $int_step;
              $interval++;
              if ($log_r >= $int_min) {
                  redo find_next_interval;
              }
          }
        }
    }

    $self->{'data'}->{'log_r'} = \@new_log_r;
    $self->{'data'}->{'log_s'} = \@new_log_s;
    return $self;
}




