/* File:     BoxCount.h
 * Purpose:  To provide an API for computing pair counts,
 *           and for setting / getting related parameters.
 *
 * RCS:
 ************************************************************
 * $Id: BoxCount.h,v 1.4 2001/09/03 19:36:11 lw2j Exp $
 * $Log:	BoxCount.h,v $
 * Revision 1.4  2001/09/03  19:36:11  lw2j
 * Now knows about different DBLayer implementations.
 * 
 * Revision 1.3  2001/08/22  17:20:18  lw2j
 * Untabified.
 * 
 * Revision 1.2  2001/08/22  16:50:51  lw2j
 * Added support for two_tables.
 * 
 * Revision 1.1  2001/08/22  04:27:01  lw2j
 * Initial revision
 * 
 *
 *
 ************************************************************
 */


#ifndef BOXCOUNT_H
#define BOXCOUNT_H

/* I/O is expected to follow the DataWrapper virtual class.
 * For a non-seekable source such as a stream, or for small
 * data, MemoryWrapper might be either necessary (for the
 * former) or faster (for the latter).  For data that is too
 * large to easily fit in memory, DiskWrapper may be 
 * necessary.  BoxCount will restrict itself to the 
 * DataWrapper generic API in any case.
 *
 * This class will neither allocate nor deallocate the
 * wrapper -- that's the user's job.
 */

#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include "DBLayer.h"
#include "DataWrapper.h"

class BoxCount {
 public:
  BoxCount();
  ~BoxCount();

  /* Which database layer to use?  Select one of the following. */
  static const unsigned int DB_Berkeley = 0;
  static const unsigned int DB_ExtHash  = 1;

  /* accessors and mutators */
  DataWrapper *get_wrapper(void) const { return dw; };
  bool   get_in_memory(void) const { return in_memory; };
  bool   get_two_table(void) const { return two_table; };
  double get_base(void) const { return base; };
  double get_exponent(void) const { return exponent; };
  double get_radius_min(void) const { return radius_min; };
  double get_radius_max(void) const { return radius_max; };
  double get_occupancy_max(void) const { return occupancy_max; };
  double get_singleton_max(void) const { return singleton_max; };
  unsigned int get_radius_count(void) const { return radius_count; };
  unsigned int get_db_type(void) const { return db_type; };

  void set_wrapper(DataWrapper *dw_new) { 
    dw = dw_new;
    assert(dw);  /* why use a NULL? */
    data_dims  = dw->get_dims();
    data_count = dw->get_count();
  };

  void set_in_memory(bool in_memory_new) { in_memory = in_memory_new; };
  void set_two_table(bool two_table_new) { two_table = two_table_new; };
  void set_base(double base_new) {base = base_new; log_div = log(base); };
  void set_exponent(double exponent_new) {exponent = exponent_new;};
  void set_radius_min(double radius_min_new) {radius_min = radius_min_new;};
  void set_radius_max(double radius_max_new) {radius_max = radius_max_new;};
  void set_occupancy_max(double occupancy_max_new) 
    { occupancy_max = occupancy_max_new;};
  void set_singleton_max(double singleton_max_new) 
    { singleton_max = singleton_max_new;};
  void set_radius_count(unsigned int radius_count_new) 
    { radius_count = radius_count_new; };
  void set_db_type(unsigned int db_type_new) 
    { db_type = db_type_new; };

  /* Given the above parameters, if nothing goes wrong --
   *
   * - Fill in point_count with the number of log-log points actually
   *   generated.  This may be substantially less than radius_count.
   * - Fill in log_radius_array with a new()'d pointer to enough
   *   heap space containing exactly that many radii, each having
   *   already been subjected to log_base.
   * - Fill in log_count_array, again with a new()'d pointer, to
   *   counts that have been subjected to log().
   *
   * And return 0.
   *
   * On failure, return -1, and try to set errno appropriately.
   */
  int compute(unsigned int& point_count, 
              double *&log_radius_array,
              double *&log_count_array);
  
 protected:
  /* methods */
  
  /* The standard approach:  each iteration takes a full pass 
   * over the wrapper, and ignores previously computed grids.
   */
  int compute_all_boxes(unsigned int& point_count, 
              double *&log_radius_array,
              double *&log_count_array,
              double   radius_mult);

  /* The two_table approach, which differs when increasing the
   * radius -- we can build off the previous table.
   */
  int compute_all_boxes_two_table(unsigned int& point_count, 
              double *&log_radius_array,
              double *&log_count_array,
              double   radius_mult);


  /* Compute statistics for just one radius, discarding the
   * individual counts later.  The count has not yet been
   * subjected to the logarithm, except for exponent=1.
   *
   * Return 0 on success, -1 && set errno otherwise.
   */
  int count_single_radius(double         radius, 
                          double        &count,
                          double        &singleton_frac,
                          double        &occupancy_frac);


  /* If prev_table is not NULL, use the stored occupancies; 
   * otherwise, compute afresh from the data wrapper.
   *
   * Count and fractions are returned for these occupancies.
   *
   * prev_table is then updated to be a pointer to a new()'d 
   * object which stores the occupancies for the *next*
   * radius, and the old table handed over to delete().
   *
   * Return 0 on success, -1 && set errno otherwise.
   */
  int count_single_radius_two_table
    (double         radius, 
     double         radius_mult,
     double        &count,
     double        &singleton_frac,
     double        &occupancy_frac,
     DBLayer       *&prev_table);

  /* data */
  DataWrapper *dw;

  bool    in_memory;             /* Store counters in memory? [ FALSE ]
                                  *
                                  * WARNING:  This can take a LOT of
                                  * memory, since there may be many
                                  * database cells with large keys.
                                  */
  bool    two_table;             /* Use the 'two table' algorithm? [FALSE]
                                  * 
                                  * When increasing the radius by an
                                  * integer multiplier, it's possible to
                                  * derive on table referring only to 
                                  * the previous instead of going back to
                                  * the data.  This may or may not be 
                                  * faster.
                                  *
                                  * This is ignored if the radius ratio
                                  * is NOT an integer, even with PARANOID
                                  * off.
                                  */
  double  base;                  /* Base for logarithms.  Merely a 
                                  * scaling factor, really.  [2]
                                  */
  double  exponent;              /* 0=Hausdorff fractal dimension, 
                                  * 1=information fractal dimension,
                                  * 2=correlation fractal dimension
                                  * [2]
                                  */
  double  radius_min;            /* Minimum radius.  [2^-20] */
  double  radius_max;            /* Maximum radius.  [2^18]  */

  double  occupancy_max;         /* Maximum fraction allowed to be 
                                  * in one cell.  [0.95] */
  double  singleton_max;         /* Maximum fraction allowed to be
                                  * singletons.  [0.95] */
  unsigned int  radius_count;    /* Maximum number of radii.  [39] */  

  /* Type of the count database -- namely, which DBLayer 
   * implementation to use.
   */
  unsigned int db_type;      

  /* No direct accessors or mutators for these two, since they're
   * derived from the wrapper. */
  unsigned int data_dims;        /* From the wrapper. */
  unsigned int data_count;       /* Likewise.         */

  /* Again, derived. */
  double  log_div;                /* log(base) */
};

#endif /* BOXCOUNT_H */
