/* File:     datawrapper.h
 * Purpose:  To supply a generic interface for a flat-file 
 *           handler.  Most methods are meant to be
 *           overridden, and overriding them is mandatory.
 *
 *           Subclasses should handle text loosely:
 *              - Treat commas as whitespace.
 *              - Any amount of whitespace is allowed as a separator.
 *              - Blank lines are ignored.
 *              - A semicolon indicates comment to end-of-line.
 *           To do so, they're encouraged to make use of
 *           process_string.
 *
 *           Input is expected to be a FILE*, which may be
 *           stdin.
 *
 * RCS:
 ************************************************************
 * $Id: DataWrapper.h,v 1.3 2001/08/23 16:19:14 lw2j Exp $
 * $Log:	DataWrapper.h,v $
 * Revision 1.3  2001/08/23  16:19:14  lw2j
 * Added the zero_translate option.
 * 
 * Revision 1.2  2001/08/22  17:20:18  lw2j
 * Untabified.
 * 
 * Revision 1.1  2001/08/21  03:40:34  lw2j
 * Initial revision
 * 
 *
 ************************************************************
 */

#ifndef DATAWRAPPER_H
#define DATAWRAPPER_H

#include <stdio.h>
#include <stdlib.h>


class DataWrapper {
 public:
  /* Construct the wrapper, optionally handing it an input
   * stream.
   *
   * NOTE: while a base-class constructor is running, C++
   * binds virtual calls to the base-class version.  The call
   * below therefore always reaches DataWrapper::load_file
   * (which simply returns 0), never a subclass override; the
   * qualified name only makes that explicit.  A subclass that
   * wants to load at construction time has to invoke its own
   * load_file from its own constructor.
   */
  DataWrapper(FILE* fh_in = NULL) {
    if (fh_in != NULL) {
      (void) DataWrapper::load_file(fh_in);
    }
  }

  /* Virtual so that subclasses can be destroyed through a
   * DataWrapper pointer.
   */
  virtual ~DataWrapper() {}

  /* Upper bound, in bytes, on the length of an input line. */
  static const unsigned int MAX_LINE_SIZE = 131072;


  /* In-place clean-up of one '\0'-terminated line.
   *
   * Contract:
   *   - The string is cut (by writing '\0') at the first
   *     semicolon, if there is one.
   *   - In what remains, every comma and every character
   *     that 'isspace' calls whitespace becomes a blank.
   *   - Trailing whitespace is cut off.
   *   - A NULL pointer makes the call return immediately.
   *
   * Declared only; it is involved enough that it is defined
   * out of line rather than inlined here.
   */
  static void process_string(char *instr);


  /* Break an already-filtered, '\0'-terminated string down
   * into doubles.
   *
   * Contract:
   *   - expected != 0:  try to find exactly that many values.
   *   - expected == 0 (or omitted):  find as many values as
   *     possible (slower).  In this case dbl_array is treated
   *     as NULL, to reduce the risk of seg faults.
   *   - dbl_array == NULL:  the array is allocated here with
   *     new.  When 'expected' is given, room for that many
   *     doubles is allocated even if none are found.
   *   - Both given:  the caller is assumed to have allocated
   *     enough space already.
   *   - No doubles found at all:  dbl_array is left unchanged
   *     and 0 is returned.
   *
   * Returns the number of doubles actually read.
   */
  static unsigned int find_doubles(double *&dbl_array,
           const char *instr, unsigned int expected = 0);



  /**********************************
   *   OVERRIDE ALL METHODS BELOW   *
   **********************************/


  /* Load (or at least validate) the data behind fh_in.
   *
   * Must be done once, to verify dimensionality, cardinality
   * and ability to parse.  Memory-resident implementations
   * may also want to read the entire file into RAM here.
   *
   * Returns 0 on failure, non-zero on success.  This base
   * version always reports failure.
   */
  virtual int load_file(FILE *fh_in) {
    (void) fh_in;
    return 0;
  }



  /* Zero-translation switch.
   *
   * When enabled, the vectors handed out by get_vector() and
   * get_vector_dynamic() should already have had the least
   * value of each attribute subtracted.  In other words, the
   * entire set is linearly translated along each axis so
   * that it lies within (R+ U {0}, R+ U {0}, R+ U {0}...),
   * and no negative values are returned.
   *
   * The setting should default to 'false' upon object
   * initialization.  This base version ignores the setter
   * and always reports 'false'.
   */
  virtual void set_zero_translation(bool new_zero_translation) {
    (void) new_zero_translation;
  }

  virtual bool get_zero_translation() const {
    return false;
  }


  /* Dimensionality and vector count of the loaded set.
   *
   * These should be determined when loading, which is why
   * the accessors can be const.  0 serves as the error
   * report, since 0-dimensional or 0-vector sets aren't
   * meaningful here.  This base version always returns 0.
   */
  virtual unsigned int get_dims() const {
    return 0;
  }

  virtual unsigned int get_count() const {
    return 0;
  }


  /* Fetch the vector at a zero-based index, using static
   * storage.  That should be faster than constant new/free,
   * but it is not thread-safe.
   *
   * NULL is returned when the index is out of range.  This
   * base version always returns NULL.
   */
  virtual double* get_vector(unsigned int index) {
    (void) index;
    return NULL;
  }


  /* Fetch the vector at a zero-based index, using new/delete
   * heap storage; the end user is responsible for deleting
   * it.  NULL may be returned upon an error condition.
   *
   * This is only safer in the sense that the returned array
   * shouldn't be overwritten by a subsequent call.  Two
   * simultaneous calls on the same DataWrapper object (or on
   * two objects sharing a file handle) are still off limits,
   * because shared state would be corrupted.
   *
   * This base version always returns NULL.
   */
  virtual double* get_vector_dynamic(unsigned int index) {
    (void) index;
    return NULL;
  }
};

#endif  /* DATAWRAPPER_H */
