/* File:     BoxCount.cc
 * Purpose:  To implement an API for computing pair counts,
 *           and for setting / getting related parameters.
 *
 * RCS:
 ************************************************************
 * $Id: BoxCount.cc,v 1.6 2001/09/03 19:36:11 lw2j Exp $
 * $Log:	BoxCount.cc,v $
// Revision 1.6  2001/09/03  19:36:11  lw2j
// Now allows the choice of DBLayer implementation.
// 
// Revision 1.5  2001/08/23  16:19:14  lw2j
// Added the zero_translate option.
// 
// Revision 1.4  2001/08/22  23:38:45  lw2j
// Fixed odd off-by-one bug using two_tables -- t'was caused by
// the counter trying to fetch_plusplus newdb instead of prev_db,
// (when generating a new prev_db because it had been passed in a
// NULL).
// 
// Revision 1.3  2001/08/22  17:20:18  lw2j
// Untabified.
// 
// Revision 1.2  2001/08/22  16:50:51  lw2j
// Added the two_tables method, which may give more speed
// when increasing the radius at the cost of storage.
// 
 * Revision 1.1  2001/08/22  04:27:01  lw2j
 * Initial revision
 *
 ************************************************************
 */

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <errno.h>
#include <iostream.h>
#include "DBLayer.h"
#include "BoxCount.h"

#if (USE_BERKELEY == 1) 
#include "BerkeleyLayer.h"
#endif

#if (USE_EXTHASH == 1) 
#include "ExtHash.h"
#endif

#if ((USE_BERKELEY + USE_EXTHASH) == 0) 
#error At least one database type for storing counters must be enabled.  
#error Check your Makefile.
#endif

/* We'll need to be storing a linked list of log-log pairs,
 * at least until we figure out how many radii we actually
 * have.  This structure is declared here since there's no
 * reason to export it.
 */
struct BC_dbl_ll_t {
  /* A new node holds the pair (0, 0) and is linked to nothing.
   * Both link pointers are cleared so that a node never carries an
   * indeterminate pointer, whichever helper ends up linking it.
   */
  BC_dbl_ll_t():log_radius(0), log_count(0), prev(NULL), next(NULL) {}

  /* Destroying a node does NOT destroy its neighbours; the code
   * that consumes a list deletes the nodes one at a time.
   */
  ~BC_dbl_ll_t() {}

  double              log_radius;  /* log of the grid radius        */
  double              log_count;   /* count value for that radius   */
  struct BC_dbl_ll_t *prev;        /* previous node, NULL at head   */
  struct BC_dbl_ll_t *next;        /* following node, NULL at tail  */
};



/* Prepend a (log_radius, log_count) pair to the doubly-linked list.
 *
 * head / tail are references to the list's end pointers and are
 * updated in place.  An empty list is represented by head == NULL;
 * in that case the new node becomes both head and tail.
 */
static inline void add_ll_head(BC_dbl_ll_t *&head, BC_dbl_ll_t *&tail,
                               double log_radius, double log_count) {
  BC_dbl_ll_t *node = new BC_dbl_ll_t;

  node->log_radius = log_radius;
  node->log_count  = log_count;
  node->prev       = NULL;
  node->next       = head;

  if (head) {
    /* Non-empty list: hook the old first node back to the new one. */
    head->prev = node;
  } else {
    /* Empty list: the new node is also the last node. */
    tail = node;
  }

  head = node;
}



/* Append a (log_radius, log_count) pair to the doubly-linked list.
 *
 * head / tail are references to the list's end pointers and are
 * updated in place.  An empty list is represented by tail == NULL;
 * in that case the new node becomes both head and tail.
 */
static inline void add_ll_tail(BC_dbl_ll_t *&head, BC_dbl_ll_t *&tail,
                               double log_radius, double log_count) {
  BC_dbl_ll_t *node = new BC_dbl_ll_t;

  node->log_radius = log_radius;
  node->log_count  = log_count;
  node->prev       = tail;
  node->next       = NULL;

  if (tail) {
    /* Non-empty list: hook the old last node forward to the new one. */
    tail->next = node;
  } else {
    /* Empty list: the new node is also the first node. */
    head = node;
  }

  tail = node;
}




/* Map a coordinate vector onto grid-cell indices, in place:  each of
 * the first `count` entries of `arr` is replaced by
 * floor(entry / radius).
 *
 * No argument checking is done; this helper is file-local and its
 * callers are trusted to pass a valid array and a usable radius.
 */
static inline void divide_by_radius_and_floor(double* arr, 
                                                  unsigned int count,
                                                  double  radius) {
  for (unsigned int k = 0; k != count; ++k) {
    arr[k] = floor(arr[k] / radius);
  }
}




/*******************************************************************/


/* Default constructor:  no data source attached (dw == NULL), the
 * in_memory and two_table options off, and the default parameters
 * listed below.
 *
 * With the defaults, compute() derives a radius multiplier of
 * exp(log(radius_max / radius_min) / (radius_count - 1))
 *   = exp(log(2^38) / 38) = 2,
 * i.e. 39 radii from 2^-20 to 2^18, each double the previous one.
 */
BoxCount::BoxCount():dw(NULL),
                     in_memory(false),
                     two_table(false),                                    
                     base(2),
                     exponent(2),
                     radius_min(0.00000095367431640625), /* 2^-20 */
                     radius_max(262144),                 /* 2^18  */
                     occupancy_max(0.95),
                     singleton_max(0.95),
                     radius_count(39) {
  /* Natural logs are divided by log_div elsewhere in this file to
   * express them in the chosen base.
   */
  log_div = log(base);

  /* Pick the default counter-table implementation from whatever was
   * compiled in.
   */
#if (USE_BERKELEY == 1)
  db_type = DB_Berkeley;
#elif (USE_EXTHASH == 1)
  /* Berkeley DB has undergone more testing, and supports on-disk
   * databases, so it takes priority as a default.
   */
  db_type = DB_ExtHash; 
#endif /* USE_BERKELEY */
}


/* Destructor.  The body is empty:  nothing is released here, and in
 * particular the data source `dw` is left untouched.
 * NOTE(review): presumably `dw` is owned by whoever supplied it --
 * confirm against the setter / header.
 */
BoxCount::~BoxCount() {
  /* No need for anything here, really. */
}




/* Validate the current parameters, derive the radius multiplier and
 * run one of the two box-counting strategies:
 *
 *   - the standard one, which scans the data once per radius, or
 *   - the two-table one (only when the two_table option is set AND
 *     the multiplier is an integer), which builds each larger-radius
 *     table from the occupied cells of the previous table.
 *
 * Parameters (all are outputs):
 *   point_count       number of (log radius, log count) pairs found
 *   log_radius_array  set to a new[]-allocated array of that length
 *   log_count_array   set to a new[]-allocated array of that length
 * NOTE(review): nothing in this file frees the two arrays, so the
 * caller presumably has to delete [] them -- confirm.
 *
 * Returns the result of the chosen strategy (0 on success), or -1
 * with errno = EINVAL when a parameter is unusable or the selected
 * database type was not compiled in.
 */
int BoxCount::compute(unsigned int& point_count, 
                      double *&log_radius_array,
                      double *&log_count_array) {

  double radius_mult  = 0;
  bool   db_available = false;

  /* Some basic sanity checking.  radius_min must be strictly
   * positive:  it is used as a divisor and inside a logarithm
   * below, so zero or a negative value would yield infinite or NaN
   * radii instead of a clean error.
   */
  if ((!dw) || (base <= 0) || (radius_min <= 0)          ||
      (radius_min >= radius_max)                         ||
      (radius_count < 1)   || (occupancy_max < 0)        ||
      (occupancy_max > 1)  || (singleton_max < 0)        ||
      (singleton_max > 1)) {
    errno = EINVAL;
    return -1;
  }

  /* The selected database type must be one that was compiled in. */
#if (USE_BERKELEY == 1)
  if (db_type == DB_Berkeley) {
    db_available = true;
  }
#endif
#if (USE_EXTHASH == 1)
  if (db_type == DB_ExtHash) {
    db_available = true;
  }
#endif

  if (!db_available) {
    errno = EINVAL;
    return -1;
  }

  /* Geometric spacing:  radius_count radii from radius_min up to
   * radius_max, each radius_mult times the previous one.
   * NOTE(review): radius_count == 1 passes the checks above and
   * makes the divisor here zero -- confirm that is intended.
   */
  radius_mult = exp(log(radius_max / radius_min) /
                    (radius_count - 1));

#ifdef PARANOID
  if (radius_mult != floor(radius_mult)) {
    cerr << "Non-integer radius multiplier of " << radius_mult << " is " << endl;
    cerr << "forbidden when the box-counting library is built with" << endl;
    cerr << "-DPARANOID." << endl;
    errno = EINVAL;
    return -1;
  }
#endif /* PARANOID */

  /* The two-table method folds cells of one grid into cells of the
   * next, which only lines up when the multiplier is an integer.
   */
  if ((two_table) && (radius_mult == floor(radius_mult))) {
    return compute_all_boxes_two_table(point_count, log_radius_array,
                           log_count_array, radius_mult);
  }

  return compute_all_boxes(point_count, log_radius_array,
                           log_count_array, radius_mult);
}
  


/* The standard approach:  scan the whole data set once per radius.
 *
 * Starting from the middle radius, the radius is first shrunk step
 * by step (results prepended to a list), then grown step by step
 * from just above the middle (results appended), so the list ends
 * up ordered by increasing radius.  A radius contributes a point
 * only when both its singleton fraction and its occupancy fraction
 * are within singleton_max / occupancy_max.
 *
 * Parameters:
 *   point_count       out: number of points produced
 *   log_radius_array  out: new[]-allocated, log(radius) / log_div
 *   log_count_array   out: new[]-allocated; the raw count when
 *                     exponent == 1, log(count) / log_div otherwise
 *   radius_mult       ratio between consecutive radii
 *
 * Returns 0 on success.  If counting fails for any radius, the
 * partial list is released, point_count is reset to 0, the output
 * arrays are left untouched and -1 is returned.
 * NOTE(review): errno is expected to have been set by the database
 * layer in that case -- confirm.
 */
int BoxCount::compute_all_boxes(unsigned int& point_count, 
                      double *&log_radius_array,
                      double *&log_count_array,
                      double   radius_mult) {
  BC_dbl_ll_t *head = NULL;
  BC_dbl_ll_t *tail = NULL;
  BC_dbl_ll_t *ptr  = NULL;
  double         radius = 0;
  double         s_frac = 0;
  double         o_frac = 0;
  unsigned int   index  = 0;
  double         count  = 0;
#ifdef PARANOID
  /* If it's paranoid mode, this is ALWAYS true because we
   * checked earlier and complained (returned error) if it 
   * wasn't.
   */
  static const bool mult_is_int = true;
#else
  bool           mult_is_int = (radius_mult == floor(radius_mult));
#endif /* PARANOID */
  point_count = 0;


  /* First, try the middle radius.  The counting call is made
   * outside of assert() so that it still happens when assertions
   * are compiled out with NDEBUG.
   */
  index  = (unsigned int) floor(radius_count / 2);
  radius = radius_min * pow(radius_mult, index);
  if (count_single_radius(radius, count, s_frac, o_frac)) {
    goto fail;
  }

  if ((s_frac <= singleton_max) && (o_frac <= occupancy_max)) {
    add_ll_head(head, tail, log(radius) / log_div, 
                (exponent == 1) ? count : 
                (log(count) / log_div));
    point_count++;
  }


  /* Reduce the radius until ending condition reached.. */
  while ((s_frac <= singleton_max) &&
         (radius > radius_min)) {
    radius /= radius_mult;
    
    if (count_single_radius(radius, count, s_frac, o_frac)) {
      goto fail;
    }
    
    if ((s_frac <= singleton_max) && (o_frac <= occupancy_max)) {
      add_ll_head(head, tail, log(radius) / log_div, 
                  (exponent == 1) ? count : 
                  (log(count) / log_div));
      point_count++;
    }


    if ((mult_is_int) && (s_frac == 1)) {
      /* If every object is in its own cell, then reducing
       * the radius further is pointless -- at least, if
       * the radius multiplier is an integer.
       */
      break;
    }
  }

  /* Reset to middle * radius_mult... */
  index  = (unsigned int) floor(radius_count / 2);
  radius = radius_min * pow(radius_mult, index+1);

  if (radius <= radius_max) {
    if (count_single_radius(radius, count, s_frac, o_frac)) {
      goto fail;
    }

    if ((s_frac <= singleton_max) && (o_frac <= occupancy_max)) {
      add_ll_tail(head, tail, log(radius) / log_div, 
                  (exponent == 1) ? count : 
                  (log(count) / log_div));
      point_count++;
    }

    /* Increasing the radius until ending condition reached.. */
    while ((o_frac <= occupancy_max) &&
           (radius < radius_max)) {
      radius *= radius_mult;
      
      if (count_single_radius(radius, count, s_frac, o_frac)) {
        goto fail;
      }
      
      if ((s_frac <= singleton_max) && (o_frac <= occupancy_max)) {
        add_ll_tail(head, tail, log(radius) / log_div, 
                    (exponent == 1) ? count : 
                    (log(count) / log_div));
        point_count++;
      }


      if ((mult_is_int) && (o_frac == 1)) {
        /* If every object is in the same cell, then increasing
         * the radius further is pointless, if monotonicity
         * guarantees exist.
         */
        break;
      }
    }
  }

  /* Allocate and fill arrays. */
  log_radius_array = new double[point_count];
  log_count_array  = new double[point_count];
  assert(log_radius_array);
  assert(log_count_array);

  ptr = head;

  /* Copy the list into the arrays, freeing each node once copied. */
  for (index=0; index < point_count; index++) {
    log_radius_array[index] = ptr->log_radius;
    log_count_array[index]  = ptr->log_count;

    ptr = ptr->next;
    delete head;
    head = ptr;
  }
  
  return 0;

 fail:
  /* Counting failed part-way:  drop whatever was collected so far. */
  while (head) {
    ptr = head->next;
    delete head;
    head = ptr;
  }

  point_count = 0;
  return -1;
}          




/* The two_table approach.
 *
 * The shrinking half is identical to the standard approach (one
 * data scan per radius).  The growing half differs:  the table of
 * occupied cells for one radius is kept in prev_table and folded
 * into the table for the next radius by
 * count_single_radius_two_table(), instead of rescanning the data.
 * A radius contributes a point only when both its singleton
 * fraction and its occupancy fraction are within singleton_max /
 * occupancy_max.
 *
 * Parameters:
 *   point_count       out: number of points produced
 *   log_radius_array  out: new[]-allocated, log(radius) / log_div
 *   log_count_array   out: new[]-allocated; the raw count when
 *                     exponent == 1, log(count) / log_div otherwise
 *   radius_mult       ratio between consecutive radii; compute()
 *                     only selects this method when it is an integer
 *
 * Returns 0 on success.  If counting fails for any radius, the
 * partial list and any live table are released, point_count is
 * reset to 0, the output arrays are left untouched and -1 is
 * returned.
 * NOTE(review): errno is expected to have been set by the database
 * layer in that case -- confirm.
 */
int BoxCount::compute_all_boxes_two_table(unsigned int& point_count, 
                      double *&log_radius_array,
                      double *&log_count_array,
                      double   radius_mult) {
  DBLayer     *prev_table = NULL;
  BC_dbl_ll_t *head = NULL;
  BC_dbl_ll_t *tail = NULL;
  BC_dbl_ll_t *ptr  = NULL;
  double         radius = 0;
  double         s_frac = 0;
  double         o_frac = 0;
  unsigned int   index  = 0;
  double         count  = 0;
#ifdef PARANOID
  /* If it's paranoid mode, this is ALWAYS true because we
   * checked earlier and complained (returned error) if it 
   * wasn't.
   */
  static const bool mult_is_int = true;
#else
  bool           mult_is_int = (radius_mult == floor(radius_mult));
#endif /* PARANOID */
  point_count = 0;


  /* First, try the middle radius.  The counting calls are made
   * outside of assert() so that they still happen when assertions
   * are compiled out with NDEBUG.
   */
  index  = (unsigned int) floor(radius_count / 2);
  radius = radius_min * pow(radius_mult, index);
  if (count_single_radius(radius, count, s_frac, o_frac)) {
    goto fail;
  }

  if ((s_frac <= singleton_max) && (o_frac <= occupancy_max)) {
    add_ll_head(head, tail, log(radius) / log_div, 
                (exponent == 1) ? count : 
                (log(count) / log_div));
    point_count++;
  }


  /* Reduce the radius until ending condition reached.. */
  while ((s_frac <= singleton_max) &&
         (radius > radius_min)) {
    radius /= radius_mult;
    
    if (count_single_radius(radius, count, s_frac, o_frac)) {
      goto fail;
    }
    
    if ((s_frac <= singleton_max) && (o_frac <= occupancy_max)) {
      add_ll_head(head, tail, log(radius) / log_div, 
                  (exponent == 1) ? count : 
                  (log(count) / log_div));
      point_count++;
    }


    if ((mult_is_int) && (s_frac == 1)) {
      /* If every object is in its own cell, then reducing
       * the radius further is pointless -- at least, if
       * the radius multiplier is an integer.
       */
      break;
    }
  }

  /* Reset to middle * radius_mult... THIS part differs. */
  index  = (unsigned int) floor(radius_count / 2);
  radius = radius_min * pow(radius_mult, index+1);

  if (radius <= radius_max) {
    if (count_single_radius_two_table(radius, radius_mult, 
                                      count, s_frac, o_frac,
                                      prev_table)) {
      goto fail;
    }
    /* Since there was no previous table, csr2t computed a table
     * with the spec'd radius.  The count, s_frac, and o_frac
     * values correspond to THAT table.  prev_table contains a
     * table with radius*radius_mult.
     */
    if ((s_frac <= singleton_max) && (o_frac <= occupancy_max)) {
      add_ll_tail(head, tail, log(radius) / log_div, 
                  (exponent == 1) ? count : 
                  (log(count) / log_div));
      point_count++;
    }

    /* Increasing the radius until ending condition reached.. */
    while ((o_frac <= occupancy_max) &&
           (radius < radius_max)) {
      radius *= radius_mult;
      
      if (count_single_radius_two_table(radius, radius_mult, 
                                        count, s_frac, o_frac,
                                        prev_table)) {
        goto fail;
      }
      
      if ((s_frac <= singleton_max) && (o_frac <= occupancy_max)) {
        add_ll_tail(head, tail, log(radius) / log_div, 
                    (exponent == 1) ? count : 
                    (log(count) / log_div));
        point_count++;
      }

      
      if ((mult_is_int) && (o_frac == 1)) {
        /* If every object is in the same cell, then increasing
         * the radius further is pointless, if monotonicity
         * guarantees exist.
         */
        break;
      }
    }
  }

  /* Allocate and fill arrays. */
  log_radius_array = new double[point_count];
  log_count_array  = new double[point_count];
  assert(log_radius_array);
  assert(log_count_array);

  ptr = head;

  /* Copy the list into the arrays, freeing each node once copied. */
  for (index=0; index < point_count; index++) {
    log_radius_array[index] = ptr->log_radius;
    log_count_array[index]  = ptr->log_count;
    ptr = ptr->next;
    delete head;
    head = ptr;
  }

  /* Delete old table (deleting a NULL pointer is a no-op). */
  delete prev_table;

  return 0;

 fail:
  /* Counting failed part-way:  drop whatever was collected so far. */
  while (head) {
    ptr = head->next;
    delete head;
    head = ptr;
  }

  delete prev_table;

  point_count = 0;
  return -1;
}          



/* A workhorse. */
int BoxCount::count_single_radius(double         radius, 
                                  double        &count,
                                  double        &singleton_frac,
                                  double        &occupancy_frac) {
  DBLayer      *count_db    = NULL;
  double       *vec         = NULL; 
  double       *vec_ignored = NULL;
  double        addition    = 0;
  unsigned int  cell_count  = 0;
  unsigned int  index       = 0;
  unsigned int  occ_cells   = 0; 
  unsigned int  occ_max_seen   = 0;
  unsigned int  singleton_count = 0;

  count    = 0;

  switch(db_type) {
#if (USE_BERKELEY == 1) 
  case DB_Berkeley:
    count_db = new BerkeleyLayer(data_dims, in_memory); 
    break;
#endif

#if (USE_EXTHASH == 1)
  case DB_ExtHash:
    count_db = new ExtHash(data_dims, in_memory);
    break;
#endif
    
  default:
    assert(0);
  }
  assert(count_db);

  /* sequential scan */
  for (index=0; index < data_count; index++) {
    vec = dw->get_vector(index);
    
    divide_by_radius_and_floor(vec, data_dims, radius);
    
    if (count_db->fetch_plusplus(vec) == 0) {
      /* errno should already be set to a sane value by
       * the database package 
       */
      delete count_db;
      return -1;
    }
  }

  /* All data has been processed, so count the (exponent)-th
   * moment, except for when exponent=1, when we do something
   * special.
   */
  vec_ignored = new double[data_dims];    

  while (!(count_db->cursor_next(vec_ignored, cell_count))) {
    /* Update occupancy statistics */
    occ_cells++;

    assert(cell_count > 0);

    if (cell_count == 1) {
      singleton_count++;
    }

    if (cell_count > occ_max_seen) {
      occ_max_seen = cell_count;
    }

    if (exponent == 1) {
      /* special case:  entropy */
      double fraction = ((double) cell_count) / data_count;
      
      addition = fraction * log(fraction); 

      /* should always be the case, since fraction is in (0,1] */
      assert(addition <= 0);
      count += addition;

      /* underflow check */
      assert(count <= 0);
    } else {
      /* the branch usually taken */
      addition = pow(cell_count, exponent);

      /* Positive number, taken to a positive power... basically,
       * overflow check.
       */
      assert(addition > 0);
      count += addition;

      /* overflow check */
      assert(count > 0);
    }
  }


  singleton_frac = ((double) singleton_count) / occ_cells;
  occupancy_frac = ((double) occ_max_seen)    / data_count;

  if (exponent == 1) {
    count = count / log_div;
  }

  delete [] vec_ignored;
  delete count_db;

  return 0;
}


/* Compute occupancies for the next table while calculating
 * stats for the previous one.
 */
int BoxCount::count_single_radius_two_table(double         radius, 
                                  double         radius_mult,
                                  double        &count,
                                  double        &singleton_frac,
                                  double        &occupancy_frac,
                                  DBLayer       *&prev_table) {
  DBLayer      *new_db    = NULL;  /* next table */
  double       *vec         = NULL; 
  double       *cell_id = NULL;
  double        addition=0;
  unsigned int  cell_count  = 0;
  unsigned int  index       = 0;
  unsigned int  occ_cells   = 0; 
  unsigned int  occ_max_seen   = 0;
  unsigned int  singleton_count = 0;

  count    = 0;
  switch(db_type) {

#if (USE_BERKELEY == 1) 
  case DB_Berkeley:
    new_db = new BerkeleyLayer(data_dims, in_memory); 
    break;
#endif

#if (USE_EXTHASH == 1) 
  case DB_ExtHash:
    new_db = new ExtHash(data_dims, in_memory);
    break;
#endif

  default:
    assert(0);
  }
  assert(new_db);

  if (!prev_table) {
    /* We don't have a previous table, so we need to generate it. */
  switch(db_type) {

#if (USE_BERKELEY == 1) 
  case DB_Berkeley:
    prev_table = new BerkeleyLayer(data_dims, in_memory); 
    break;
#endif

#if (USE_EXTHASH == 1)
  case DB_ExtHash:
    prev_table = new ExtHash(data_dims, in_memory);
    break;
#endif
  default:
    assert(0);
  }
  assert(prev_table);


    for (index=0; index < data_count; index++) {
      vec = dw->get_vector(index);
      
      divide_by_radius_and_floor(vec, data_dims, radius);
      
      if (prev_table->fetch_plusplus(vec) == 0) {
        /* errno should already be set to a sane value by
         * the database package 
         */
        delete prev_table;
        prev_table = NULL;
        return -1;
      }
    }
  } 

  /* All data has been processed, so count the (exponent)-th
   * moment, except for when exponent=1, when we do something
   * special.
   */
  cell_id = new double[data_dims];    

  while (!(prev_table->cursor_next(cell_id, cell_count))) {
    unsigned int old_count = 0;

    /* Update occupancy statistics */
    occ_cells++;
  
    if (cell_count == 1) {
      singleton_count++;
    }

    if (cell_count > occ_max_seen) {
      occ_max_seen = cell_count;
    }

    if (exponent == 1) {
      /* special case:  entropy */
      double fraction = ((double) cell_count) / data_count;

      addition = fraction * log(fraction);
      assert(addition <= 0);
      count += addition;
      assert(count <= 0);
    } else {
      /* the branch usually taken */
      addition = pow(cell_count, exponent);
      assert(addition >= 0);
      count += addition;
      assert(count > 0);
    }

    /* Add to the correct cell in new_db. */
    divide_by_radius_and_floor(cell_id, data_dims, radius_mult);
    old_count = new_db->fetch(cell_id);
    cell_count += old_count;
    assert((new_db->store(cell_id, cell_count)) == cell_count);
  }


  singleton_frac = ((double) singleton_count) / occ_cells;
  occupancy_frac = ((double) occ_max_seen)    / data_count;

  if (exponent == 1) {
    count = count / log_div;
  }

  delete [] cell_id;
  delete prev_table;

  prev_table = new_db;

  return 0;
}
