/* File:     CrossCount.cc
 * Purpose:  To implement an API for computing pair counts,
 *           and for setting / getting related parameters.
 *
 * RCS:
 ************************************************************
 * $Id: CrossCount.cc,v 1.1 2002/01/03 10:58:31 lw2j Exp $
 * $Log:	CrossCount.cc,v $
// Revision 1.1  2002/01/03  10:58:31  lw2j
// Initial revision
// 
// Revision 1.6  2001/09/03  19:36:11  lw2j
// Now allows the choice of DBLayer implementation.
//
// Revision 1.5  2001/08/23  16:19:14  lw2j
// Added the zero_translate option.
//
// Revision 1.4  2001/08/22  23:38:45  lw2j
// Fixed odd off-by-one bug using two_tables -- t'was caused by
// the counter trying to fetch_plusplus newdb instead of prev_db,
// (when generating a new prev_db because it had been passed in a
// NULL).
//
// Revision 1.3  2001/08/22  17:20:18  lw2j
// Untabified.
//
// Revision 1.2  2001/08/22  16:50:51  lw2j
// Added the two_tables method, which may give more speed
// when increasing the radius at the cost of storage.
//
 * Revision 1.1  2001/08/22  04:27:01  lw2j
 * Initial revision
 *
 ************************************************************
 */

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <errno.h>
#include <iostream.h>
#include "DBLayer.h"
#include "CrossCount.h"

#if (USE_BERKELEY == 1)
#include "BerkeleyLayer.h"
#endif

#if (USE_EXTHASH == 1)
#include "ExtHash.h"
#endif

#if ((USE_BERKELEY + USE_EXTHASH) == 0)
#error At least one database type for storing counters must be enabled.
#error Check your Makefile.
#endif


#ifndef MAX2
#define MAX2(__DEFMAC_A, __DEFMAC_B) (((__DEFMAC_A) >= (__DEFMAC_B)) ? (__DEFMAC_A) : (__DEFMAC_B))
#endif



/* We need to store a doubly linked list of (log radius, log count)
 * pairs, at least until we figure out how many radii we actually
 * have.  This structure is declared here since there's no
 * reason to export it.
 */
struct BC_dbl_ll_t {
  /* Every field starts out zeroed / NULL, including `prev`, so that a
   * freshly constructed node is never left with a dangling link.
   */
  BC_dbl_ll_t():log_radius(0), log_count(0), prev(NULL), next(NULL) {}

  ~BC_dbl_ll_t() {
    /* Deliberately does NOT delete `next`: the users of this list
     * free nodes one at a time while walking it.
     */
  }

  double              log_radius;  /* log of the grid radius, in the chosen base */
  double              log_count;   /* log of the pair count at that radius       */
  struct BC_dbl_ll_t *prev;        /* neighbour towards the head (NULL at head)  */
  struct BC_dbl_ll_t *next;        /* neighbour towards the tail (NULL at tail)  */
};



/* Prepend a freshly allocated (log_radius, log_count) node to the list.
 *
 * head, tail:  references to the list's end pointers; both are updated
 *              when the list was empty, otherwise only head changes.
 */
static inline void add_ll_head(BC_dbl_ll_t *&head, BC_dbl_ll_t *&tail,
                               double log_radius, double log_count) {
  BC_dbl_ll_t *node = new BC_dbl_ll_t;

  node->log_radius = log_radius;
  node->log_count  = log_count;
  node->prev       = NULL;
  node->next       = head;

  if (head) {
    /* Non-empty list: hook the old head back to the new node. */
    head->prev = node;
  } else {
    /* Empty list: the new node is also the tail. */
    tail = node;
  }

  head = node;
}



/* Append a freshly allocated (log_radius, log_count) node to the list.
 *
 * head, tail:  references to the list's end pointers; both are updated
 *              when the list was empty, otherwise only tail changes.
 */
static inline void add_ll_tail(BC_dbl_ll_t *&head, BC_dbl_ll_t *&tail,
                               double log_radius, double log_count) {
  BC_dbl_ll_t *node = new BC_dbl_ll_t;

  node->log_radius = log_radius;
  node->log_count  = log_count;
  node->prev       = tail;
  node->next       = NULL;

  if (tail) {
    /* Non-empty list: hook the old tail forward to the new node. */
    tail->next = node;
  } else {
    /* Empty list: the new node is also the head. */
    head = node;
  }

  tail = node;
}




/* Map a vector onto grid-cell coordinates, in place: every one of the
 * first `count` entries of `arr` is replaced by floor(entry / radius).
 *
 * No error checking is done (a zero radius is not rejected), since this
 * helper is not exported and is not expected to get bad parameters.
 */
static inline void divide_by_radius_and_floor(double* arr,
                                                  unsigned int count,
                                                  double  radius) {
  for (unsigned int dim = 0; dim < count; ++dim) {
    arr[dim] = floor(arr[dim] / radius);
  }
}




/*******************************************************************/


/* Default configuration: no data sources attached, single-table
 * counting, logarithms in base 2, and 39 radii spanning 2^-20 .. 2^18.
 */
CrossCount::CrossCount()
  : dw0(NULL),
    dw1(NULL),
    in_memory(false),
    two_table(false),
    base(2),
    radius_min(0.00000095367431640625), /* 2^-20 */
    radius_max(262144),                 /* 2^18  */
    occupancy_max(0.95),
    singleton_max(0.95),
    radius_count(39)
{
  /* Pick the default counter store.  Berkeley DB has undergone more
   * testing, and supports on-disk databases, so it wins whenever it
   * has been compiled in.
   */
#if (USE_BERKELEY == 1)
  db_type = DB_Berkeley;
#elif (USE_EXTHASH == 1)
  db_type = DB_ExtHash;
#endif /* USE_BERKELEY */

  /* Divisor that converts natural logs into logs of the chosen base. */
  log_div = log(base);
}


/* Nothing is released here; the body is intentionally empty. */
CrossCount::~CrossCount()
{
}




/* Which cross-counting shall we use?  The standard version --
 * iterate over the data every time -- or a stranger one in
 * which we can iterate over occupied cells?
 */
int CrossCount::compute(unsigned int& point_count,
                      double *&log_radius_array,
                      double *&log_count_array) {

  double radius_mult = 0;

  /* Some basic sanity checking. */
  if ((!dw0) || (!dw1) || (base <= 0) ||
			(radius_min >= radius_max) ||
      (radius_count < 1)   || (occupancy_max < 0)        ||
      (occupancy_max > 1)  || (singleton_max < 0)        ||
      (singleton_max > 1)) {
    errno = EINVAL;
    return -1;
  }

  switch(db_type) {
  case DB_Berkeley:
#if (USE_BERKELEY == 1)
    break;
#else
    errno = EINVAL;
    return -1;
#endif
  case DB_ExtHash:
#if (USE_EXTHASH == 1)
    break;
#else
    errno = EINVAL;
    return -1;
#endif
  default:
    errno = EINVAL;
    return -1;
  }

  radius_mult = exp(log(radius_max / radius_min) /
                    (radius_count - 1));

#ifdef PARANOID
  if (radius_mult != floor(radius_mult)) {
    cerr << "Non-integer radius multiplier of " << radius_mult << " is " << endl;
    cerr << "forbidden when the cross-counting library is built with" << endl;
    cerr << "-DPARANOID." << endl;
    errno = EINVAL;
    return -1;
  }
#endif /* PARANOID */

  if ((two_table) && (radius_mult == floor(radius_mult))) {
    return compute_all_crosses_two_table(point_count, log_radius_array,
                           log_count_array, radius_mult);
  }

  return compute_all_crosses(point_count, log_radius_array,
                           log_count_array, radius_mult);
}



/* The standard approach: every radius is evaluated from scratch with
 * count_single_radius().
 *
 * Starting from the middle radius, the radius is first shrunk step by
 * step, then grown from just above the middle, collecting a
 * (log radius, log count) pair for every radius whose pair count is
 * positive and whose singleton / occupancy fractions are within the
 * configured thresholds.  The pairs are returned in increasing radius
 * order.
 *
 * point_count:       out; number of pairs produced (0 on failure).
 * log_radius_array:  out; new[]-allocated, point_count entries.
 * log_count_array:   out; new[]-allocated, point_count entries.
 * radius_mult:       ratio between successive radii.
 *
 * Returns 0 on success.  Returns -1 if counting fails at any radius;
 * the output arrays are then left untouched and errno is whatever the
 * counting layer left behind.
 */
int CrossCount::compute_all_crosses(unsigned int& point_count,
                      double *&log_radius_array,
                      double *&log_count_array,
                      double   radius_mult) {
  BC_dbl_ll_t *head = NULL;
  BC_dbl_ll_t *tail = NULL;
  BC_dbl_ll_t *ptr  = NULL;
  double         radius = 0;
  double         s_frac = 0;
  double         o_frac = 0;
  unsigned int   index  = 0;
  double         count  = 0;
  int            saved_errno = 0;

#ifdef PARANOID
  /* If it's paranoid mode, this is ALWAYS true because we
   * checked earlier and complained (returned error) if it
   * wasn't.
   */
  static const bool mult_is_int = true;
#else
  bool           mult_is_int = (radius_mult == floor(radius_mult));
#endif /* PARANOID */
  point_count = 0;


  /* First, try the middle radius.  The call must NOT live inside an
   * assert(): with NDEBUG defined the whole expression, and with it
   * the actual counting, would be compiled out.
   */
  index  = (unsigned int) floor(radius_count / 2);
  radius = radius_min * pow(radius_mult, index);
  if (count_single_radius(radius, count, s_frac, o_frac)) {
    goto fail;
  }

  if ((count > 0) && (s_frac <= singleton_max) &&
      (o_frac <= occupancy_max)) {
    add_ll_head(head, tail, log(radius) / log_div,
                (log(count) / log_div));
    point_count++;
  }


  /* Reduce the radius until ending condition reached.. */
  while ((count > 0) && (s_frac <= singleton_max) &&
         (radius > radius_min)) {
    radius /= radius_mult;

    if (count_single_radius(radius, count, s_frac, o_frac)) {
      goto fail;
    }

    /* A zero count has no logarithm; skip it, exactly as the
     * two-table variant does.
     */
    if ((count > 0) && (s_frac <= singleton_max) &&
        (o_frac <= occupancy_max)) {
      add_ll_head(head, tail, log(radius) / log_div,
                  (log(count) / log_div));
      point_count++;
    }


    if ((mult_is_int) && (s_frac == 1)) {
      /* If every object is in its own cell, then reducing
       * the radius further is pointless -- at least, if
       * the radius multiplier is an integer.
       */
      break;
    }
  }

  /* Reset to middle * radius_mult... */
  index  = (unsigned int) floor(radius_count / 2);
  radius = radius_min * pow(radius_mult, index+1);

  if (radius <= radius_max) {
    if (count_single_radius(radius, count, s_frac, o_frac)) {
      goto fail;
    }

    if ((count > 0) && (s_frac <= singleton_max) &&
        (o_frac <= occupancy_max)) {
      add_ll_tail(head, tail, log(radius) / log_div,
                  (log(count) / log_div));
      point_count++;
    }

    /* Increasing the radius until ending condition reached.. */
    while ((o_frac <= occupancy_max) &&
           (radius < radius_max)) {
      radius *= radius_mult;

      if (count_single_radius(radius, count, s_frac, o_frac)) {
        goto fail;
      }

      if ((count > 0) &&
          (s_frac <= singleton_max) && (o_frac <= occupancy_max)) {
        add_ll_tail(head, tail, log(radius) / log_div,
                    (log(count) / log_div));
        point_count++;
      }


      if ((mult_is_int) && (o_frac == 1)) {
        /* If every object is in the same cell, then increasing
         * the radius further is pointless, if monotonicity
         * guarantees exist.
         */
        break;
      }
    }
  }

  /* Allocate and fill arrays, releasing the list as we go.  The list
   * holds exactly point_count nodes, so it is empty afterwards.
   */
  log_radius_array = new double[point_count];
  log_count_array  = new double[point_count];
  assert(log_radius_array);
  assert(log_count_array);

  ptr = head;

  for (index=0; index < point_count; index++) {
    log_radius_array[index] = ptr->log_radius;
    log_count_array[index]  = ptr->log_count;

    ptr = ptr->next;
    delete head;
    head = ptr;
  }

  return 0;

fail:
  /* Counting failed: release whatever was collected so far and report
   * the error, keeping the errno value the failing call left behind.
   */
  saved_errno = errno;

  while (head) {
    ptr = head->next;
    delete head;
    head = ptr;
  }

  point_count = 0;
  errno       = saved_errno;
  return -1;
}




/* The two_table approach.
 *
 * The downward sweep (middle radius and below) is identical to the
 * standard approach and uses count_single_radius().  The upward sweep
 * uses count_single_radius_two_table(), which evaluates the tables for
 * the current radius while building the tables for the next, coarser
 * radius from them.
 *
 * point_count:       out; number of pairs produced (0 on failure).
 * log_radius_array:  out; new[]-allocated, point_count entries.
 * log_count_array:   out; new[]-allocated, point_count entries.
 * radius_mult:       ratio between successive radii.
 *
 * Returns 0 on success.  Returns -1 if counting fails at any radius;
 * the output arrays are then left untouched and errno is whatever the
 * counting layer left behind.
 */
int CrossCount::compute_all_crosses_two_table(unsigned int& point_count,
                      double *&log_radius_array,
                      double *&log_count_array,
                      double   radius_mult) {
  DBLayer     *prev_table0 = NULL;
  DBLayer     *prev_table1 = NULL;
  BC_dbl_ll_t *head = NULL;
  BC_dbl_ll_t *tail = NULL;
  BC_dbl_ll_t *ptr  = NULL;
  double         radius = 0;
  double         s_frac = 0;
  double         o_frac = 0;
  unsigned int   index  = 0;
  double         count  = 0;
  int            saved_errno = 0;
#ifdef PARANOID
  /* If it's paranoid mode, this is ALWAYS true because we
   * checked earlier and complained (returned error) if it
   * wasn't.
   */
  static const bool mult_is_int = true;
#else
  bool           mult_is_int = (radius_mult == floor(radius_mult));
#endif /* PARANOID */
  point_count = 0;


  /* First, try the middle radius.  The call must NOT live inside an
   * assert(): with NDEBUG defined the whole expression, and with it
   * the actual counting, would be compiled out.
   */
  index  = (unsigned int) floor(radius_count / 2);
  radius = radius_min * pow(radius_mult, index);
  if (count_single_radius(radius, count, s_frac, o_frac)) {
    goto fail;
  }

  if ((count > 0) && (s_frac <= singleton_max) &&
      (o_frac <= occupancy_max)) {
    add_ll_head(head, tail, log(radius) / log_div,
                (log(count) / log_div));
    point_count++;
  }


  /* Reduce the radius until ending condition reached.. */
  while ((count > 0) &&
         (s_frac <= singleton_max) &&
         (radius > radius_min)) {
    radius /= radius_mult;

    if (count_single_radius(radius, count, s_frac, o_frac)) {
      goto fail;
    }

    if ((count > 0) && (s_frac <= singleton_max) &&
        (o_frac <= occupancy_max)) {
      add_ll_head(head, tail, log(radius) / log_div,
                  (log(count) / log_div));
      point_count++;
    }


    if ((mult_is_int) && (s_frac == 1)) {
      /* If every object is in its own cell, then reducing
       * the radius further is pointless -- at least, if
       * the radius multiplier is an integer.
       */
      break;
    }
  }

  /* Reset to middle * radius_mult... THIS part differs. */
  index  = (unsigned int) floor(radius_count / 2);
  radius = radius_min * pow(radius_mult, index+1);

  if (radius <= radius_max) {
    if (count_single_radius_two_table(radius, radius_mult,
                                      count, s_frac, o_frac,
                                      prev_table0,
                                      prev_table1)) {
      goto fail;
    }

    /* Since there was no previous table, csr2t computed a table
     * with the spec'd radius.  The count, s_frac, and o_frac
     * values correspond to THAT table.  prev_table contains a
     * table with radius*radius_mult.
     */
    if ((count > 0) && (s_frac <= singleton_max) &&
        (o_frac <= occupancy_max)) {
      add_ll_tail(head, tail, log(radius) / log_div,
                  (log(count) / log_div));
      point_count++;
    }

    /* Increasing the radius until ending condition reached.. */
    while ((o_frac <= occupancy_max) &&
           (radius < radius_max)) {
      radius *= radius_mult;

      if (count_single_radius_two_table(radius, radius_mult,
                                        count, s_frac, o_frac,
                                        prev_table0,
                                        prev_table1)) {
        goto fail;
      }

      if ((count > 0) && (s_frac <= singleton_max) &&
          (o_frac <= occupancy_max)) {
        add_ll_tail(head, tail, log(radius) / log_div,
                    (log(count) / log_div));
        point_count++;
      }


      if ((mult_is_int) && (o_frac == 1)) {
        /* If every object is in the same cell, then increasing
         * the radius further is pointless, if monotonicity
         * guarantees exist.
         */
        break;
      }
    }
  }

  /* Allocate and fill arrays, releasing the list as we go.  The list
   * holds exactly point_count nodes, so it is empty afterwards.
   */
  log_radius_array = new double[point_count];
  log_count_array  = new double[point_count];
  assert(log_radius_array);
  assert(log_count_array);

  ptr = head;

  for (index=0; index < point_count; index++) {
    log_radius_array[index] = ptr->log_radius;
    log_count_array[index]  = ptr->log_count;
    ptr = ptr->next;
    delete head;
    head = ptr;
  }

  /* Delete the look-ahead tables left over from the last step
   * (deleting a NULL pointer is a no-op).
   */
  delete prev_table0;
  delete prev_table1;

  return 0;

fail:
  /* Counting failed: release the collected pairs and any tables that
   * are still alive, keeping the errno value of the failing call.
   */
  saved_errno = errno;

  while (head) {
    ptr = head->next;
    delete head;
    head = ptr;
  }

  delete prev_table0;
  delete prev_table1;

  point_count = 0;
  errno       = saved_errno;
  return -1;
}



/* A workhorse: evaluate one grid radius from scratch.
 *
 * Both data sources are scanned; each vector is mapped to its grid cell
 * (floor(coordinate / radius) per dimension) and that cell's counter is
 * incremented in a per-source table.  The pair count is then the sum,
 * over cells, of (count in source 0) * (count in source 1).
 *
 * radius:          grid cell width.
 * count:           out; the pair count at this radius.
 * singleton_frac:  out; the larger, over the two sources, of
 *                  (cells holding exactly one vector) / (occupied cells).
 * occupancy_frac:  out; the larger, over the two sources, of
 *                  (largest cell population) / (number of vectors).
 *
 * Returns 0 on success, -1 if a counter could not be incremented (errno
 * should then have been set by the database package).
 *
 * NOTE(review): the vector returned by get_vector() is modified in
 * place and never freed here -- confirm the data wrapper's ownership
 * contract.
 */
int CrossCount::count_single_radius(double         radius,
                                  double        &count,
                                  double        &singleton_frac,
                                  double        &occupancy_frac) {
  DBLayer      *count_db0   = NULL;
  DBLayer      *count_db1   = NULL;
  double       *vec         = NULL;
  double       *vec_key     = NULL;
  double        singfrac0   = 0;
  double        singfrac1   = 0;
  double        occfrac0    = 0;
  double        occfrac1    = 0;

  unsigned int  index       = 0;
  unsigned int  occ0        = 0;
  unsigned int  occ1        = 0;
  unsigned int  occ_max0    = 0;
  unsigned int  occ_max1    = 0;
  unsigned int  single0     = 0;
  unsigned int  single1     = 0;
  unsigned int  cell_count  = 0;
  int           saved_errno = 0;

  count    = 0;

  switch(db_type) {
#if (USE_BERKELEY == 1)
  case DB_Berkeley:
    count_db0 = new BerkeleyLayer(data_dims0, in_memory);
    count_db1 = new BerkeleyLayer(data_dims1, in_memory);
    break;
#endif

#if (USE_EXTHASH == 1)
  case DB_ExtHash:
    count_db0 = new ExtHash(data_dims0, in_memory);
    count_db1 = new ExtHash(data_dims1, in_memory);
    break;
#endif

  default:
    assert(0);
  }

  assert(count_db0);
  assert(count_db1);

  /* sequential scan on first data source */
  for (index=0; index < data_count0; index++) {
    vec = dw0->get_vector(index);

    divide_by_radius_and_floor(vec, data_dims0, radius);

    cell_count = count_db0->fetch_plusplus(vec);
    if (cell_count == 0) {
      /* errno should already be set to a sane value by
       * the database package; keep it across the cleanup,
       * and release BOTH tables.
       */
      saved_errno = errno;
      delete count_db0;
      delete count_db1;
      errno = saved_errno;
      return -1;
    } else if (cell_count == 1) {
      /* First vector in this cell, in this pass. */
      occ0++;
      single0++;
    } else if (cell_count == 2) {
      /* Second.  It's no longer a singleton. */
      single0--;
    }

    if (cell_count > occ_max0) {
      occ_max0 = cell_count;
    }
  }


  /* sequential scan on second data source */
  for (index=0; index < data_count1; index++) {
    vec = dw1->get_vector(index);

    divide_by_radius_and_floor(vec, data_dims1, radius);

    cell_count = count_db1->fetch_plusplus(vec);
    if (cell_count == 0) {
      /* Same error handling as for the first source. */
      saved_errno = errno;
      delete count_db0;
      delete count_db1;
      errno = saved_errno;
      return -1;
    } else if (cell_count == 1) {
      /* First vector in this cell, in this pass. */
      occ1++;
      single1++;
    } else if (cell_count == 2) {
      /* Second.  It's no longer a singleton. */
      single1--;
    }

    if (cell_count > occ_max1) {
      occ_max1 = cell_count;
    }
  }



  /* Compute statistics for thresholds.  These are floating-point
   * divisions, so an empty data source produces NaN rather than a trap.
   */

  singfrac0 = ((double) single0)  / occ0;
  singfrac1 = ((double) single1)  / occ1;
  occfrac0  = ((double) occ_max0) / data_count0;
  occfrac1  = ((double) occ_max1) / data_count1;


  /* We'll return the maximum fractions, so that if either case
   * triggers thresholds we'll stop immediately.
   */
  singleton_frac = MAX2(singfrac0, singfrac1);
  occupancy_frac = MAX2(occfrac0, occfrac1);


  /* All data has been processed.  Each vector has incremented exactly
   * one cell in the corresponding table.  In this part, we'll iterate
   * over the smaller database using the cursor and look up
   * corresponding values in the larger.
   */

  if (occ0 > occ1) {
    /* The second table is smaller.  Swap them. */
    DBLayer* pdb_temp = count_db0;

    count_db0 = count_db1;
    count_db1 = pdb_temp;
  }


  /* Buffer space.  After a swap the cursor may be walking the table
   * built with data_dims1, so size the key buffer for the wider of the
   * two.  NOTE(review): looking a key up in the other table only makes
   * sense when both sources have the same dimensionality.
   */
  vec_key = new double[MAX2(data_dims0, data_dims1)];
  while (!(count_db0->cursor_next(vec_key, cell_count))) {
    unsigned int other_count = 0;

    /* Only occupied cells should ever be stored. */
    assert(cell_count > 0);


    /* Check in corresponding table. */
    other_count = count_db1->fetch(vec_key);

    if (other_count > 0) {
      /* Multiply as doubles: the product of two unsigned ints can
       * wrap around long before the double accumulator is at risk.
       */
      count += ((double) cell_count) * ((double) other_count);
    }
  }


  delete [] vec_key;
  delete count_db0;
  delete count_db1;

  return 0;
}




/* Compute occupancies for the next table while calculating
 * stats for the previous one.
 */
int CrossCount::count_single_radius_two_table(double         radius,
                                  double         radius_mult,
                                  double        &count,
                                  double        &singleton_frac,
                                  double        &occupancy_frac,
                                  DBLayer       *&prev_table0,
																	DBLayer       *&prev_table1) {
  DBLayer      *new_db0    = NULL;  /* next table */
	DBLayer      *new_db1    = NULL;  /* Ditto.     */

  double       *vec        = NULL;
  double       *vec_key     = NULL;
	double        singfrac0   = 0;
	double        singfrac1   = 0;
	double        occfrac0    = 0;
	double        occfrac1    = 0;

  unsigned int  index       = 0;
	unsigned int  occ0        = 0;
  unsigned int  occ1        = 0;
  unsigned int  occ_max0    = 0;
  unsigned int  occ_max1    = 0;
  unsigned int  single0     = 0;
	unsigned int  single1     = 0;
	unsigned int  cell_count  = 0;
	unsigned int  old_count   = 0;



  count    = 0;
  switch(db_type) {

#if (USE_BERKELEY == 1)
  case DB_Berkeley:
    new_db0 = new BerkeleyLayer(data_dims0, in_memory);
		new_db1 = new BerkeleyLayer(data_dims1, in_memory);
    break;
#endif

#if (USE_EXTHASH == 1)
  case DB_ExtHash:
    new_db0 = new ExtHash(data_dims0, in_memory);
		new_db1 = new ExtHash(data_dims1, in_memory);
    break;
#endif

  default:
    assert(0);
  }
  assert(new_db0);
  assert(new_db1);


  if (!prev_table0) {
		assert(!prev_table1);

    /* We don't have previous tables, so we need to generate it. */
		switch(db_type) {

#if (USE_BERKELEY == 1)
		case DB_Berkeley:
			prev_table0 = new BerkeleyLayer(data_dims0, in_memory);
			prev_table1 = new BerkeleyLayer(data_dims1, in_memory);
			break;
#endif

#if (USE_EXTHASH == 1)
		case DB_ExtHash:
			prev_table0 = new ExtHash(data_dims0, in_memory);
			prev_table1 = new ExtHash(data_dims1, in_memory);
			break;
#endif
		default:
			assert(0);
		}


		assert(prev_table0);
		assert(prev_table1);


		for (index=0; index < data_count0; index++) {
			vec = dw0->get_vector(index);

			divide_by_radius_and_floor(vec, data_dims0, radius);
			cell_count = prev_table0->fetch_plusplus(vec);

			if (!cell_count) {
				/* errno should already be set to a sane value by
				 * the database package
				 */
				delete prev_table0;
				delete prev_table1;

				prev_table0 = NULL;
				prev_table1 = NULL;
				return -1;
			}
		}

		for (index=0; index < data_count1; index++) {
			vec = dw1->get_vector(index);

			divide_by_radius_and_floor(vec, data_dims1, radius);
			cell_count = prev_table1->fetch_plusplus(vec);

			if (!cell_count) {
				/* errno should already be set to a sane value by
				 * the database package
				 */
				delete prev_table0;
				delete prev_table1;

				prev_table0 = NULL;
				prev_table1 = NULL;
				return -1;
			}
		}
  }



  /* The 'previous' tables have been seeded.  Iterate over them to
	 * compute the cross-product, as well as seed the new tables.
	 * Note that we need to iterate over both tables for seeding,
	 * because cells that have non-zero cross-products with a larger
	 * resolution do not necessarily translate into cells with
	 * non-zero cross-products with a smaller resolution.
	 *
	 *   __ __
	 *  |  |  |  If one set has non-zero count in A, and another in
	 *  |__|A_|  B, then their cross-product in each individual cell
	 *  | B|  |  is still zero, but it won't be when all four cells
	 *  |__|__|  are combined.  Ergo, we need two passes and cannot
	 *
	 * take the superficially plausible short-cut.
	 *
	 * We also need to compute occupancy and singleton statistics
	 * during these passes.
   */


	/* Buffer space. */
  vec_key = new double[data_dims0];

  while (!(prev_table0->cursor_next(vec_key, cell_count))) {
		unsigned int other_count = 0;

    /* Update occupancy statistics */
    assert(cell_count > 0);

		occ0++;
		if (cell_count == 1) {
			single0++;
		}

		if (cell_count > occ_max0) {
			occ_max0 = cell_count;
		}

		/* Check in corresponding table. */
		other_count = prev_table1->fetch(vec_key);

		if (other_count > 0) {
			count += (cell_count * other_count);
			/* overflow check */
			assert(count > 0);
		}

		/* Add to the correct cell in new_db. */
		divide_by_radius_and_floor(vec_key, data_dims0, radius_mult);
		old_count = new_db0->fetch(vec_key);
		cell_count += old_count;
		assert((new_db0->store(vec_key, cell_count)) == cell_count);
	}



  while (!(prev_table1->cursor_next(vec_key, cell_count))) {
    /* Update occupancy statistics */
    assert(cell_count > 0);

		occ1++;
		if (cell_count == 1) {
			single1++;
		}

		if (cell_count > occ_max1) {
			occ_max1 = cell_count;
		}

		/* Add to the correct cell in new_db. */
		divide_by_radius_and_floor(vec_key, data_dims1, radius_mult);
		old_count = new_db1->fetch(vec_key);
		cell_count += old_count;
		assert((new_db1->store(vec_key, cell_count)) == cell_count);
	}


	/* Compute statistics for thresholds. */
	singfrac0 = ((double) single0)  / occ0;
	singfrac1 = ((double) single1)  / occ1;
	occfrac0  = ((double) occ_max0) / data_count0;
	occfrac1  = ((double) occ_max1) / data_count1;


	/* We'll return the maximum fractions, so that if either case
	 * triggers thresholds we'll stop immediately.
	 */
	singleton_frac = MAX2(singfrac0, singfrac1);
	occupancy_frac = MAX2(occfrac0, occfrac1);

  delete [] vec_key;

	delete prev_table0;
	delete prev_table1;

	prev_table0 = new_db0;
	prev_table1 = new_db1;

  return 0;
}
