/*
* Author:     Leejay Wu
* Started:    2001-Feb-01
* File:       tug.cc
* Purpose:    A Tug-of-War-based correlation fractal
*             dimension estimation class...
*
*             There are quite a few static functions which
*             tend to be a) specialized for this class,
*             b) aren't particularly needed as class members
*             themselves, and c) may make assumptions about
*             calling them, such as not doing much error
*             checking because they're not being called by
*             people who haven't written them.
*
*
* Updated 2003-Mar-10 by Angeline Wong
* hashing functions
*
***************************************************************
* Id:  $Id: tug.cc,v 1.2 2003/09/01 15:21:28 lw2j Exp $
*
* $Log:	tug.cc,v $
// Revision 1.2  2003/09/01  15:21:28  lw2j
// Indentified.
// 
// Revision 1.1  2003/09/01  14:57:15  lw2j
// Initial revision
//
*
*/




#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>


#include "list.h"
#include "array.h"
#include "tug.h"


/* Hard-coded table of hashing constants: 600 ints, laid out below as
* 200 rows of three values <a, b, M>, intended to be primes clustered
* around sqrt(2^31) = 46340.95.  They are hard-coded for efficiency.
*
* In every row the first two values exceed 46,340 and the third is
* below it, with a < b.
*
* NOTE(review): the original comment promised 45,000 < M, but the third
* column starts at 44123 (and the last row ends with 44119), so that
* lower bound does not hold for the data as written.
*
* NOTE: the table is consumed by choose_four_primes(), which reads four
* consecutive entries per hash function for NUMFUNCS (150) functions,
* i.e. it is currently used as 150 sets (tuples) of 4 values rather
* than as 200 triples.
*/
int abMprime[600] = {
  46349,48533,44123,
  46351,48539,44129,
  46381,48541,44131,
  46399,48563,44159,
  46411,48571,44171,
  46439,48589,44179,
  46441,48593,44189,
  46447,48611,44201,
  46451,48619,44203,
  46457,48623,44207,
  46471,48647,44221,
  46477,48649,44249,
  46489,48661,44257,
  46499,48673,44263,
  46507,48677,44267,
  46511,48679,44269,
  46523,48731,44273,
  46549,48733,44279,
  46559,48751,44281,
  46567,48757,44293,
  46573,48761,44351,
  46589,48767,44357,
  46591,48779,44371,
  46601,48781,44381,
  46619,48787,44383,
  46633,48799,44389,
  46639,48809,44417,
  46643,48817,44449,
  46649,48821,44453,
  46663,48823,44483,
  46679,48847,44491,
  46681,48857,44497,
  46687,48859,44501,
  46691,48869,44507,
  46703,48871,44519,
  46723,48883,44531,
  46727,48889,44533,
  46747,48907,44537,
  46751,48947,44543,
  46757,48953,44549,
  46769,48973,44563,
  46771,48989,44579,
  46807,48991,44587,
  46811,49003,44617,
  46817,49009,44621,
  46819,49019,44623,
  46829,49031,44633,
  46831,49033,44641,
  46853,49037,44647,
  46861,49043,44651,
  46867,49057,44657,
  46877,49069,44683,
  46889,49081,44687,
  46901,49103,44699,
  46919,49109,44701,
  46933,49117,44711,
  46957,49121,44729,
  46993,49123,44741,
  46997,49139,44753,
  47017,49157,44771,
  47041,49169,44773,
  47051,49171,44777,
  47057,49177,44789,
  47059,49193,44797,
  47087,49199,44809,
  47093,49201,44819,
  47111,49207,44839,
  47119,49211,44843,
  47123,49223,44851,
  47129,49253,44867,
  47137,49261,44879,
  47143,49277,44887,
  47147,49279,44893,
  47149,49297,44909,
  47161,49307,44917,
  47189,49331,44927,
  47207,49333,44939,
  47221,49339,44953,
  47237,49363,44959,
  47251,49367,44963,
  47269,49369,44971,
  47279,49391,44983,
  47287,49393,44987,
  47293,49409,45007,
  47297,49411,45013,
  47303,49417,45053,
  47309,49429,45061,
  47317,49433,45077,
  47339,49451,45083,
  47351,49459,45119,
  47353,49463,45121,
  47363,49477,45127,
  47381,49481,45131,
  47387,49499,45137,
  47389,49523,45139,
  47407,49529,45161,
  47417,49531,45179,
  47419,49537,45181,
  47431,49547,45191,
  47441,49549,45197,
  47459,49559,45233,
  47491,49597,45247,
  47497,49603,45259,
  47501,49613,45263,
  47507,49627,45281,
  47513,49633,45289,
  47521,49639,45293,
  47527,49663,45307,
  47533,49667,45317,
  47543,49669,45319,
  47563,49681,45329,
  47569,49697,45337,
  47581,49711,45341,
  47591,49727,45343,
  47599,49739,45361,
  47609,49741,45377,
  47623,49747,45389,
  47629,49757,45403,
  47639,49783,45413,
  47653,49787,45427,
  47657,49789,45433,
  47659,49801,45439,
  47681,49807,45481,
  47699,49811,45491,
  47701,49823,45497,
  47711,49831,45503,
  47713,49843,45523,
  47717,49853,45533,
  47737,49871,45541,
  47741,49877,45553,
  47743,49891,45557,
  47777,49919,45569,
  47779,49921,45587,
  47791,49927,45589,
  47797,49937,45599,
  47807,49939,45613,
  47809,49943,45631,
  47819,49957,45641,
  47837,49991,45659,
  47843,49993,45667,
  47857,49999,45673,
  47869,50021,45677,
  47881,50023,45691,
  47903,50033,45697,
  47911,50047,45707,
  47917,50051,45737,
  47933,50053,45751,
  47939,50069,45757,
  47947,50077,45763,
  47951,50087,45767,
  47963,50093,45779,
  47969,50101,45817,
  47977,50111,45821,
  47981,50119,45823,
  48017,50123,45827,
  48023,50129,45833,
  48029,50131,45841,
  48049,50147,45853,
  48073,50153,45863,
  48079,50159,45869,
  48091,50177,45887,
  48109,50207,45893,
  48119,50221,45943,
  48121,50227,45949,
  48131,50231,45953,
  48157,50261,45959,
  48163,50263,45971,
  48179,50273,45979,
  48187,50287,45989,
  48193,50291,46021,
  48197,50311,46027,
  48221,50321,46049,
  48239,50329,46051,
  48247,50333,46061,
  48259,50341,46073,
  48271,50359,46091,
  48281,50363,46093,
  48299,50377,46099,
  48311,50383,46103,
  48313,50387,46133,
  48337,50411,46141,
  48341,50417,46147,
  48353,50423,46153,
  48371,50441,46171,
  48383,50459,46181,
  48397,50461,46183,
  48407,50497,46187,
  48409,50503,46199,
  48413,50513,46219,
  48437,50527,46229,
  48449,50539,46237,
  48463,50543,46261,
  48473,50549,46271,
  48479,50551,46273,
  48481,50581,46279,
  48487,50587,46301,
  48491,50591,46307,
  48497,50593,46309,
  48523,50599,46327,
  48527,50627,44119
};


/* Comparing two doubles is dangerous on certain platforms: two values
* that should be equal may differ by rounding noise, so "x <= y" can
* fail even when x and y are nominally the same.  To tolerate that,
* this small absolute slack (the "FFF") is added to the right-hand
* side of the "<=" score comparisons in trim_pseudo_flat().
*/
static const double FLOAT_FUDGE_FACTOR = 0.000001;


static int   gcd(int a, int b);
static void  choose_four_primes(int &a, int &b, int &c, int &d, int r);
static int   compar_dbl(const void *a, const void *b);
static double is_it_horizontal(Array<double> &y,
unsigned int offset,
unsigned int length);

/* compute SSQ error */
static double ssq(Array<double>& xs,
Array<double>& ys,
double &a,
double &b,
unsigned int offset,
unsigned int length);


static void lsfit(Array<double> &xs,
Array<double> &ys,
unsigned int offset,
unsigned int length,
double &a,
double &b,
double &corr);



/* Apply Euclid's algorithm to find the GCD.
*
*   a=b*q+r  --> b=r*q'+r' etc., terminating when the remainder is 0.
*
* Signs are ignored: the result is the GCD of |a| and |b|.  Following
* the usual convention, gcd(x, 0) == gcd(0, x) == |x| and
* gcd(0, 0) == 0.
*
* The magnitudes are carried as unsigned values so that INT_MIN is
* never negated as a signed int (which would overflow).  The only
* result that cannot be represented in the return type is 2^31, which
* arises solely when both arguments are INT_MIN or one is INT_MIN and
* the other is 0.
*/

static int gcd(int a, int b) {
  unsigned int hi = (a < 0) ? (0u - (unsigned int) a) : (unsigned int) a;
  unsigned int lo = (b < 0) ? (0u - (unsigned int) b) : (unsigned int) b;

  /* No explicit ordering step is needed: if hi < lo, the first pass
  * through the loop simply swaps the two values.
  */
  while (lo != 0) {
    unsigned int rem = hi % lo;

    hi = lo;
    lo = rem;
  }

  return (int) hi;
}



/* Draw three pairwise relatively prime random numbers via rand().
*
* Each candidate is forced odd by setting its lowest bit; this
* avoids the many rejections that even numbers would cause.  Triples
* are redrawn until every pair has a GCD of 1, which in practice
* takes very few attempts.
*/
static void choose_three(int &a, int &b, int &c) {
  bool pairwise_coprime = false;

  do {
    /* rand() is non-negative, so "| 1" equals 2*(r/2)+1. */
    a = rand() | 1;
    b = rand() | 1;
    c = rand() | 1;

    pairwise_coprime = (gcd(a,b) == 1) &&
                       (gcd(a,c) == 1) &&
                       (gcd(b,c) == 1);
  } while (!pairwise_coprime);
}


/* Look up the four hashing constants for hash function number r.
*
* Nothing here is random: the values come straight from the
* hard-coded abMprime table, which holds NUMFUNCS (150) disjoint
* groups of four consecutive entries (150 * 4 == 600).  Group r
* therefore starts at index 4*r.  (An earlier version stepped by 3,
* which made neighbouring hash functions share a constant.)
*
* Parameters:
*   a, b, c, d - receive the four constants of group r.
*   r          - hash function ID; must satisfy 0 <= r < NUMFUNCS.
*
* Returns nothing; the caller already owns the ID it passed as r.
*
* added by Angeline Wong
*/
#define NUMFUNCS 150

static void choose_four_primes(int &a, int &b, int &c, int &d, int r) {
  assert((r >= 0) && (r < NUMFUNCS));

  const int *group = &abMprime[4*r];

  a = group[0];
  b = group[1];
  c = group[2];
  d = group[3];
}

/* qsort() comparator for doubles, ascending order.
*
* Returns -1 if *a < *b, 1 if *a > *b, and 0 otherwise (which
* includes the case where either value is a NaN).
*/
static int   compar_dbl(const void *a, const void *b) {
  const double lhs = *((const double*) a);
  const double rhs = *((const double*) b);

  /* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
  return (lhs > rhs) - (lhs < rhs);
}



/* Default constructor: installs the default parameters and builds
* the byte-parity lookup table.
*/
TugApprox::TugApprox(void) {
  /* Radii form a geometric series; both ends are inclusive. */
  radius_min   = pow(2, -9);
  radius_max   = pow(2, 10);
  radius_count = 20;
  s1           = 24;
  s2           = 24;
  random_seed  = 0;
  silent       = false;

  /* parity_table[v] is +1 when the byte value v has an even number
  * of 1 bits and -1 when it has an odd number.
  *
  * Parities are needed in very large quantities, so they are looked
  * up per byte rather than computed bit-by-bit for every tag.  A
  * larger table would trade memory for still more speed.
  */
  for (int byte_val = 0; byte_val < 256; byte_val++) {
    int sign = 1;

    for (int bits = byte_val; bits > 0; bits >>= 1) {
      if (bits & 0x1) {
        sign = -sign;
      }
    }

    parity_table[byte_val] = sign;
  }
}


/* Return the estimated fractal dimension, the points found,
* and the parameters of the line.
*
* Steps: compute_logs() fills log_radii / log_counts from the data,
* trim_pseudo_flat() removes flat stretches at both ends, and
* robust_fit() fits y = a*x + b to what is left.
*
* Parameters:
*   data        - data source handed to compute_logs().
*   log_radii   - receives the x values of the fitted points.
*   log_counts  - receives the y values of the fitted points.
*   a, b        - receive the slope and intercept of the fitted line.
*   corr        - receives the correlation coefficient of the fit.
*
* Returns the slope a, which is the dimension estimate.
*
* If compute_logs() reports failure the program is aborted.  The call
* is deliberately made outside assert(): wrapping it in assert() would
* remove the whole computation from builds compiled with NDEBUG.
*/
double TugApprox::frac_dim(Wrapper &data,
Array<double>& log_radii,
Array<double>& log_counts,
double &a,
double &b,
double &corr) {
  const int status = compute_logs(data, log_radii, log_counts);

  if (status != 0) {
    fprintf(stderr, "TugApprox::frac_dim: compute_logs failed (%d)\n",
    status);
    abort();
  }

  trim_pseudo_flat(log_radii, log_counts);

  robust_fit(log_radii, log_counts, a, b, corr);

  return a;
}



#define MODM % 46337

/*****************************************************************
* METHOD AND DEBUGGING CONTROL
****************************************************************/
#define HASH_PARITY
#define MED_AVG

/*****************************************************************
* TugApprox::compute_logs
*
* The most expensive part: a single pass over the data that fills
* the parallel arrays log_radii and log_counts (both base-2 logs)
* with one entry per radius of the geometric series running from
* radius_min to radius_max (radius_count values, ends inclusive).
*
* For each radius an axis-aligned grid with that cell size is laid
* over the data.  Every object's cell tag is hashed by s1*s2 hash
* functions to +1 or -1 and the signs are accumulated in one counter
* per (radius, function).  Afterwards, for each radius, the squared
* counters are averaged in s2 groups of s1 and the median of the s2
* group means is taken.
*
* Returns 0 on success, or -1 if a working buffer cannot be
* allocated (buffers obtained so far are released first).
*
* Notes:
*   - Allocations are checked explicitly rather than inside assert(),
*     so they still happen in builds compiled with NDEBUG.
*   - Only NUMFUNCS distinct key tuples exist; they are reused
*     cyclically across the s1*s2*radius_count hash functions.
*   - The MED_AVG table is printed regardless of the silent flag.
*   - A group mean of exactly 0 yields log(0) in log_counts.
****************************************************************/
int TugApprox::compute_logs(Wrapper &data,
Array<double> &log_radii,
Array<double> &log_counts) {
  static const double ln2 = log(2.0);

  /* Modulus of the polynomial hash; must match the MODM macro. */
  const unsigned int modulus = 46337;

  double rad_ratio  = 0;
  long *rg_counters = NULL;
  int  *rg_keys     = NULL;
  #ifdef PRODUCE_MATRIX
  long *hash_matrix = NULL;
  #endif

  srand(random_seed);

  /* a few basic sanity checks */
  assert(data.get_file());

  assert(s1 > 0);
  assert(s2 > 0);

  assert(0 < radius_min);
  assert(radius_min < radius_max);
  assert(radius_count > 1);

  rad_ratio = exp((1.0/(radius_count-1)) * log(radius_max/radius_min));
  log_radii.clear();
  log_counts.clear();

  log_radii.unfixSize();
  log_counts.unfixSize();

  if (!silent) {
    printf("Initializing random functions.\n");
  }

  /* One hash function per (radius, s2, s1) triple. */
  const size_t n_funcs = ((size_t) s1) * s2 * radius_count;

  /* Allocate the counters (zeroed by calloc) and the key table.
  * Since s1, s2, and radius_count are generally quite small,
  * this space is easily affordable.
  */
  {
    bool alloc_ok = false;

    rg_counters = (long*) calloc(n_funcs, sizeof(long));
    rg_keys     = (int*)  malloc(4 * n_funcs * sizeof(int));
    alloc_ok    = (rg_counters != NULL) && (rg_keys != NULL);

    #ifdef PRODUCE_MATRIX
    hash_matrix = (long*) calloc(2 * n_funcs, sizeof(long));
    alloc_ok    = alloc_ok && (hash_matrix != NULL);
    #endif

    if (!alloc_ok) {
      #ifdef PRODUCE_MATRIX
      free(hash_matrix);
      #endif
      free(rg_keys);
      free(rg_counters);
      return -1;
    }
  }

  /* Fill the key table: four keys per hash function, stored in
  * (radius, s2, s1) order, i.e. exactly the order in which the
  * processing loop below consumes them.
  */
  {
    unsigned int s1_idx  = 0;
    unsigned int s2_idx  = 0;
    unsigned int rd_idx  = 0;
    size_t       key_pos = 0;
    int          r       = 0;

    for (rd_idx=0; rd_idx < radius_count; rd_idx++) {
      if (!silent) {
        printf("*");
        fflush(stdout);
      }
      for (s2_idx=0; s2_idx < s2; s2_idx++) {
        for (s1_idx=0; s1_idx < s1; s1_idx++) {
          int a=0;
          int b=0;
          int c=0;
          int d=0;

          choose_four_primes(a,b,c,d,r);
          rg_keys[key_pos++] = a MODM;
          rg_keys[key_pos++] = b MODM;
          rg_keys[key_pos++] = c MODM;
          rg_keys[key_pos++] = d MODM;
          r = (r+1) % NUMFUNCS;
        }
      }
    }
  }

  if (!silent) {
    printf("\n");
    printf("Processing file...\n");
  }

  /* Iterate through the file. */
  {
    Array<double> vec;
    int          *tag_arr = NULL;
    long          idx=0;
    int           err=0;
    int           dims = data.get_dimensionality();

    #ifdef DEBUG
    printf("dims: %d\n", dims);
    #endif

    data.reset();   /* just to be sure */

    /* Request at least one element so that a zero-dimensional input
    * is not mistaken for an allocation failure.
    */
    tag_arr = (int*) malloc(sizeof(int) * ((dims > 0) ? dims : 1));

    if (!tag_arr) {
      #ifdef PRODUCE_MATRIX
      free(hash_matrix);
      #endif
      free(rg_keys);
      free(rg_counters);
      return -1;
    }

    do {
      vec = data.get_next_object(idx, err);

      if (!err) {
        unsigned int rd_idx  = 0;
        unsigned int key_idx = 0;
        unsigned int ctr_idx = 0;

        double       rd_cur  = radius_min;


        if (!silent) {
          if (!(idx % 100)) {
            /* Convince the user, or the programmer for that matter,
            * that we are making progress.  We can't express this as
            * a percentage since counting the number of vectors takes
            * either a different file format -- such as one using a
            * header -- or another pass, which would be wasteful since
            * we don't need it otherwise.
            */
            printf("*");
            fflush(stdout);
          }
        }

        /* For each radius... */
        for (rd_idx=0; rd_idx < radius_count; rd_idx++, rd_cur *= rad_ratio) {
          unsigned int s1_idx    = 0;
          unsigned int s2_idx    = 0;
          unsigned int dim_idx   = 0;


          /* Identify the cell by generating a vector of ints,
          * one per dimension.
          *
          * Place an orthogonally aligned grid in space for
          * box-counting, with each cell size along each
          * dimension being the current radius.
          *
          * Cell tags are composed of ints, so we should
          * beware underflow/overflow.
          */

          for (dim_idx=0; dim_idx < (unsigned int) dims; dim_idx++) {
            tag_arr[dim_idx] = (int) (floor(vec[dim_idx]/rd_cur));
          }

          for (s2_idx=0; s2_idx < s2; s2_idx++) {
            for (s1_idx=0; s1_idx < s1; s1_idx++) {
              int rg_key_a = rg_keys[key_idx++];
              int rg_key_b = rg_keys[key_idx++];
              int rg_key_c = rg_keys[key_idx++];
              int rg_key_d = rg_keys[key_idx++];

              /* Running powers key^(i+1) for dimension i. */
              unsigned int coeff_a = rg_key_a;
              unsigned int coeff_b = rg_key_b;
              unsigned int coeff_c = rg_key_c;
              unsigned int coeff_d = rg_key_d;

              unsigned int result = 0;

              for (dim_idx=0; dim_idx < (unsigned int) dims; dim_idx++) {
                int x = (tag_arr[dim_idx]) MODM;

                /* '%' keeps the sign of a negative tag.  Bring the
                * residue into [0, modulus) so the unsigned arithmetic
                * below really is arithmetic modulo the modulus
                * instead of operating on a wrapped 32-bit value.
                */
                if (x < 0) {
                  x += (int) modulus;
                }

                /* d^i */
                result = (result + coeff_d) MODM;
                assert(result < modulus);
                coeff_d = (coeff_d * rg_key_d) MODM;

                /* c^i*x */
                result = (result + ((coeff_c * x) MODM)) MODM;
                assert(result < modulus);
                coeff_c = (coeff_c * rg_key_c) MODM;

                /* b^i*x^2 */
                int xsq = (x*x) MODM;
                result = (result + ((coeff_b * xsq) MODM)) MODM;
                assert(result < modulus);
                coeff_b = (coeff_b * rg_key_b) MODM;

                /* a^i*x^3 */
                int xcube = (xsq*x) MODM;
                result = (result + ((coeff_a * xcube) MODM)) MODM;
                assert(result < modulus);
                coeff_a = (coeff_a * rg_key_a) MODM;

              }

              #ifdef HASH_M_2
              /** hash: M/2 -- split the residue range in half **/
              {
                if (result > (modulus/2)) {
                  rg_counters[ctr_idx] += 1;
                } else {
                  rg_counters[ctr_idx] += -1;
                }
              }
              #endif
              #ifdef HASH_PARITY
              /** hash: parity of the bits of the residue **/
              {
                int parity = 1;

                /* Combine byte parities from the lookup table. */
                while (result) {
                  parity *= parity_table[(result & 0xFF)];
                  result  = result >> 8;
                }

                rg_counters[ctr_idx] += parity;

                #ifdef PRODUCE_MATRIX
                if (parity == 1){
                  hash_matrix[2*((rd_idx*s1*s2)+(s2_idx*s1)+s1_idx)]++;
                }
                else{
                  hash_matrix[2*((rd_idx*s1*s2)+(s2_idx*s1)+s1_idx)+1]++;
                }
                #endif
              }
              #endif

              ctr_idx++;

            } /* inner loop: for (s1_idx=0; s1_idx < s1; s1_idx++) */
          } /* outer loop: for (s2_idx=0; s2_idx < s2; s2_idx++) */
        } /* for each radius... */
      }
    } while (!err);

    free(tag_arr);
  }

  #ifdef PRODUCE_MATRIX
  {
    if (!silent){
      printf("\n");
      printf("Producing counter matrix.\n");
    }
    unsigned int s1_idx = 0;
    unsigned int s2_idx = 0;
    unsigned int rd_idx = 0;
    for (rd_idx = 0; rd_idx < radius_count; rd_idx++){
      for (s2_idx = 0; s2_idx < s2; s2_idx++){
        for (s1_idx = 0; s1_idx < s1; s1_idx++){

          printf("%ld:%ld ", hash_matrix[2*((rd_idx*s1*s2)+(s2_idx*s1)+s1_idx)],
          hash_matrix[2*((rd_idx*s1*s2)+(s2_idx*s1)+s1_idx)+1]);

          #if 0
          printf("%ld ", rg_counters[((rd_idx*s1*s2)+(s2_idx*s1)+s1_idx)]/2);
          #endif
          fflush(stdout);
        } /* s1 */
        printf("\n");
      } /* s2 */
      printf("\n");
    } /* radius */
    free(hash_matrix);
  }
  #endif

  free(rg_keys);

  if (!silent) {
    printf("\n");
    printf("Computing statistics.\n");
  }

  /* We have our counters.  Whew!  We now need to compute,
  * for each radius, s2 means of s1 squares.   And then
  * find their medians.  And then add logs to the parallel
  * arrays.
  */
  {
    unsigned int rd_idx  = 0;
    unsigned int ctr_idx = 0;
    double      *means   = NULL;
    double       rd_cur  = radius_min;

    /* One mean per s2 group, hence s2 entries (not s1). */
    means = (double*) malloc(sizeof(double)*s2);

    if (!means) {
      free(rg_counters);
      return -1;
    }

    #ifdef MED_AVG
    printf("radius\t\t ");
    printf("Z^2\n");
    #endif

    for (rd_idx=0; rd_idx < radius_count; rd_idx++, rd_cur *= rad_ratio) {
      unsigned int s2_idx    = 0;

      log_radii[rd_idx] = log(rd_cur) / ln2;

      for (s2_idx=0; s2_idx < s2; s2_idx++) {
        unsigned int s1_idx    = 0;
        double       sum_sq    = 0;

        for (s1_idx=0; s1_idx < s1; s1_idx++) {
          /* Keep the full long value and square it as a double:
          * squaring in integer arithmetic was seen to overflow
          * 32 bits and produce negative means.
          */
          long val = rg_counters[ctr_idx++];

          sum_sq += (((double) val)*val);
        }

        means[s2_idx] = sum_sq / s1;
      }

      /* Now we need to find the median.  We do this by sorting
      * and checking the middle item (or averaging the middle
      * two if an even number of 'em); this cost should be
      * rather minor compared to everything above so we should
      * be OK without using a linear select algorithm.
      */

      qsort((void*) means, s2, sizeof(double), compar_dbl);

      const bool odd_count = ((s2 % 2) != 0);

      #ifdef MED_AVG
      printf("%f\t ", rd_cur);
      if (odd_count) {
        printf("%f\n", means[s2/2]);
      } else {
        printf("%f\n", 0.5*(means[s2/2]+means[(s2-1)/2]));
      }
      #endif

      /* For an even count the logs of the two middle means are
      * averaged, whereas the table above averages the means
      * themselves.
      */
      if (odd_count) {
        log_counts[rd_idx] = log(means[s2/2])/ln2;
      } else {
        log_counts[rd_idx] = 0.5*(log(means[s2/2])+log(means[(s2-1)/2]))/ln2;
      }
    }

    free(means);
  }

  free(rg_counters);

  if (!silent) {
    printf("Logs computed.\n");
  }

  return 0;
}





/* Score how well a horizontal line fits a run of y values.
*
* The candidate line sits at the mean of the `length` values that
* start at index `offset`; the return value is the sum of squared
* deviations from that mean.
*
* No range checking is done on offset/length beyond requiring at
* least two points.
*/
static double is_it_horizontal(Array<double> &y,
unsigned int offset,
unsigned int length) {
  const double *window  = ((double*) (y.data())) + offset;
  double        total   = 0;
  double        sq_dev  = 0;
  unsigned int  k       = 0;

  /* don't be THAT silly */
  assert(length > 1);

  /* First pass: the mean of the window. */
  for (k=0; k < length; k++) {
    total += window[k];
  }

  const double mean = total / length;

  /* Second pass: accumulate the squared deviations. */
  for (k=0; k < length; k++) {
    const double diff = window[k] - mean;

    sq_dev += (diff*diff);
  }

  return sq_dev;
}



/* Given a subrange of two parallel arrays x and y,
* the subrange specified by the 0-based offset and
* the number of points (length) to use, perform
* least-squares linear regression to find a and b
* such that
*
*    y is darn close to ax+b
*
* This is not public because it's not meant for
* general use.  More general linefitters should
* work directly with (double*) arrays.
*
* Parameters:
*   xs, ys  - parallel arrays of coordinates.
*   offset  - index of the first point to use.
*   length  - number of points to use; must be at least 2.
*   a, b    - receive slope and intercept.
*   corr    - receives the correlation coefficient between x and y,
*             computed only on the given subrange.
*
* Special cases (detected by an exactly-zero variance):
*   - no variance in x (vertical line): a and b are set to NaN and
*     corr to 1.  All-identical points are a caller bug and trip an
*     assertion.
*   - no variance in y (horizontal line): a is 0, b is the first y
*     of the subrange and corr is 1.
*/

static void lsfit(Array<double> &xs,
Array<double> &ys,
unsigned int offset,
unsigned int length,
double &a,
double &b,
double &corr) {
  const double* px     = xs.data();
  const double* py     = ys.data();
  double        sum_x  = 0;
  double        sum_y  = 0;
  double        sum_xx = 0;
  double        sum_yy = 0;
  double        sum_xy = 0;
  double        length_var_x    = 0;
  double        length_var_y    = 0;
  double        length_covar_xy = 0;
  unsigned int  i      = 0;

  px += offset;
  py += offset;

  /* Don't be silly. */
  assert(length > 1);


  for (i=0; i < length; i++) {
    double x = *px++;
    double y = *py++;

    sum_x  += x;
    sum_y  += y;
    sum_xx += x*x;
    sum_yy += y*y;
    sum_xy += x*y;
  }

  /* Two *would* be the single-variable variances, but
  * they need to be divided by another (length) first.
  * That's why they're named like that.
  *
  * Likewise, length_covar_xy is the length times
  * the covariance of x and y.
  */
  length_var_x    = (sum_xx - ((sum_x * sum_x)/length));
  length_var_y    = (sum_yy - ((sum_y * sum_y)/length));
  length_covar_xy = (sum_xy - ((sum_x * sum_y)/length));

  if (!length_var_x) {
    /* No variance in X, ergo vertical line... unless...
    * ...the user somehow gave us all-identical points.
    * Don't do that.  And if it happens in THIS class
    * (remember:  private function) it *is* a bug.
    */

    assert(length_var_y);

    /* We want a NAN.  length_var_x is exactly zero in this branch,
    * so dividing it by itself is a run-time 0/0, which yields NaN
    * without a constant division by zero for the compiler to warn
    * about.  (The previous 1/(a-a) produced infinity for any finite
    * a, and read the caller's possibly uninitialized outputs.)
    */
    const double not_a_number = length_var_x / length_var_x;

    a    = not_a_number;
    b    = not_a_number;
    corr = 1;
  } else if (!length_var_y) {
    /* No variance in Y, ergo horizontal line.  This is
    * also very likely a bug, albeit it is theoretically
    * possible -- but ridiculously improbable -- that
    * the random noise would turn a non-horizontal line
    * into one that is EXACTLY horizontal (well, within
    * the precision of a double).  So maybe this SHOULD
    * produce a termination-by-assertion.
    */

    a    = 0;
    b    = (ys.data())[offset];
    corr = 1;
  } else {
    /* A non-orthogonally-aligned line.  Good. */
    a    = length_covar_xy / length_var_x;
    b    = (sum_y  - (a * sum_x)) / length;
    corr = length_covar_xy / (sqrt(length_var_x * length_var_y));
  }
}




/* Remove the flattest-looking run of points from each end of the
* parallel arrays xs / ys.
*
* For each end, candidate runs of 3 .. (count/2) points touching
* that end are scored with is_it_horizontal() divided by
* length^1.5; the lowest score wins, with ties (within
* FLOAT_FUDGE_FACTOR) going to the longer run.  The winning run is
* then removed.
*
* Inputs with fewer than 10 points are left untouched.
*
* The right-hand search scores exactly the points it would remove
* (the last `length` entries) and is bounded by half of the points
* still present after the left-hand trim, so the offsets cannot
* underflow and some points always remain.
*
* NOTE(review): the trimming condition
* "(best_length >= 3) || (best_score < 0.05*range)" is always true,
* because best_length starts at 3 and never decreases.  At least
* three points are therefore always removed from each end; "&&" may
* have been intended -- confirm before changing.
*/
void TugApprox::trim_pseudo_flat(Array<double> &xs,
Array<double> &ys) {
  unsigned int count = xs.getCount();
  double       range = 0;
  if (count < 10) {
    return;
  }

  {
    /* find range of points */
    double low_y  = ys[0];
    double high_y = ys[0];
    unsigned i=0;

    for (i=1; i < count; i++) {
      double val = ys[i];

      if (val < low_y) {
        low_y = val;
      }

      if (val > high_y) {
        high_y = val;
      }
    }

    range = high_y - low_y;
  }

  {
    /* find region at beginning */
    const unsigned int max_length = count/2;
    double   best_score = is_it_horizontal(ys, 0, 3) / pow(3, 1.5);
    unsigned int best_length = 3;
    unsigned int length      = 3;

    for (length=4; length <= max_length; length++) {
      double this_score = is_it_horizontal(ys, 0, length) / (pow(length,1.5));

      if (this_score <= (best_score+FLOAT_FUDGE_FACTOR)) {
        best_score  = this_score;
        best_length = length;
      }
    }

    if ((best_length >= 3) || (best_score < (0.05*range))) {
      if (!silent) {
        printf("Trimming:  %u points off left\n", best_length);
      }
      xs    <<= best_length;
      ys    <<= best_length;
      count  -= best_length;
    }
  }

  {
    /* find region at end */
    const unsigned int max_length = count/2;   /* of what is left */
    double   best_score = is_it_horizontal(ys, count-3, 3) / pow(3, 1.5);
    unsigned int best_length = 3;
    unsigned int length      = 3;

    for (length=4; length <= max_length; length++) {
      /* The run of `length` points ending at the last point. */
      double this_score = is_it_horizontal(ys, count-length,
      length) / pow(length, 1.5);

      if (this_score <= (best_score+FLOAT_FUDGE_FACTOR)) {
        best_score  = this_score;
        best_length = length;
      }
    }

    if ((best_length >= 3) || (best_score < (0.05*range))) {
      if (!silent) {
        printf("Trimming:  %u points off right\n", best_length);
      }
      xs.resize(count - best_length);
      ys.resize(count - best_length);
    }
  }
}



/* Robust fitting.
*
* With fewer than 5 points an ordinary least-squares fit over all of
* them is returned.  Otherwise a window of (count/2)+1 consecutive
* points is slid across the data; each window gets its own
* least-squares line, and the line whose window has the smallest
* sum of squared residuals wins.
*
* Every window position is tried, including the one that ends on
* the last point.
*
* Parameters:
*   xs, ys  - parallel arrays of points; lsfit() needs at least 2.
*   a, b    - receive slope and intercept of the chosen line.
*   corr    - receives that line's correlation coefficient.
*/
void TugApprox::robust_fit(Array<double> &xs,
Array<double> &ys,
double &a,
double &b,
double &corr) {
  unsigned int count = xs.getCount();
  unsigned int half  = (count/2)+1;
  double   new_a = 0;
  double   new_b = 0;
  double   new_corr = 0;
  double   best_ssq = 0;
  double   this_ssq = 0;

  if (count < 5) {
    /* don't bother with a full robust fit */
    lsfit(xs, ys, 0, count, a, b, corr);
  } else {
    unsigned int i=0;
    bool         have_best = false;

    /* i + half <= count keeps the window inside the arrays while
    * still admitting the final position (i == count - half).
    */
    for (i=0; (i + half) <= count; i++) {

      lsfit(xs, ys, i, half, new_a, new_b, new_corr);

      this_ssq = ssq(xs, ys, new_a, new_b, i, half);

      if (!have_best || (this_ssq < best_ssq)) {
        have_best = true;
        best_ssq  = this_ssq;
        a    = new_a;
        b    = new_b;
        corr = new_corr;
      }
    }
  }
}



/* Sum of squared residuals of the line y = a*x + b over `length`
* points of the parallel arrays xs / ys, starting at index `offset`.
*
* No range checking is performed.
*/
static double ssq(Array<double>& xs,
Array<double>& ys,
double &a,
double &b,
unsigned int offset,
unsigned int length) {
  const double *x_win = (double*) ((xs.data()) + offset);
  const double *y_win = (double*) ((ys.data()) + offset);
  double        total = 0;

  for (unsigned int k = 0; k < length; k++) {
    const double residual = y_win[k] - ((a * x_win[k]) + b);

    total += residual*residual;
  }

  return total;
}
