/*
* Author:     Leejay Wu
* Started:    2001-Feb-01
* File:       wrapper.cc
* Purpose:    To provide a very basic wrapper for handling
*             simple flat-files.
*
***************************************************************
* Id:  $Id: wrapper.cc,v 1.2 2003/09/01 15:21:28 lw2j Exp $
*
* $Log:	wrapper.cc,v $
// Revision 1.2  2003/09/01  15:21:28  lw2j
// Indentified.
// 
// Revision 1.1  2003/09/01  14:57:15  lw2j
// Initial revision
//
*
*/

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>


#include <assert.h>

#include "wrapper.h"



/* Capacity, in bytes, of the buffer each input line is read into (1 MiB). */
static const int LINEBUFSIZE = 1024 * 1024;



/* Build a Wrapper around the stream handed in by the caller.
*
* The object starts out with dimensionality and count both set to -1,
* and idx at 0 (no vectors consumed), after which reset() is invoked.
*
* mask is deliberately absent from the initializer list; its own
* constructor is expected to take care of it.
*/
Wrapper::Wrapper(FILE *inf)
  : fh(inf),
    dimensionality(-1),
    count(-1),
    idx(0)
{
  reset();
}



/* Tear-down: empties the mask.  Nothing else is released here; in
* particular the FILE handle given to the constructor is not closed.
*/
Wrapper::~Wrapper() {
  mask.clear();
}



/* True when c separates two fields: any whitespace character or a comma. */
static int is_field_separator(char c) {
  /* The <ctype.h> classifiers require an argument representable as
  * unsigned char (or EOF).  Where plain char is signed, a byte >= 0x80
  * would be passed as a negative value, which is undefined behaviour,
  * hence the cast.
  */
  return isspace((unsigned char) c) || (c == ',');
}


/* Fetch one vector from the flat file by position.
*
* Both positions are 0-based:
*   idx   - number of vectors consumed so far; 0 means we are at the
*           beginning (have not read any vectors).
*   index - which vector the caller wants; 0 means the first vector.
*
* Parameters:
*   index - position of the requested vector; asserted to be >= 0.
*   err   - out: 0 on success, 1 when the vector could not be read
*           (index is at or beyond a known count, or fgets() reported
*           EOF/error before reaching it).
*
* Returns the parsed vector.  When mask is non-empty the result holds
* only the attributes the mask names, in mask order.  On error the
* still default-constructed Array is returned.
*
* Line format, as parsed below:
*   - fields are separated by whitespace and/or commas;
*   - the first ';' or newline ends the useful part of the line;
*   - a line without any digit before that point is skipped and does
*     not count as a vector.
*
* Side effects: advances idx; records count when EOF is first hit;
* records dimensionality from the first vector parsed; prints a
* warning on stderr when a later vector has a different field count
* or the mask names an attribute out of range.
*/
Array<double> Wrapper::get_object(long index, int &err) {
  /* This is called a LOT, and the buffer is big enough that it
  * should neither live on the stack nor be malloc()ed / free()d on
  * every read, so it is static.  That wastes memory while idle; if
  * no line will ever approach LINEBUFSIZE the constant can be
  * shrunk to something like 4096 or 8192.
  *
  * fgets() always NUL-terminates, so the zero-initialization is
  * not strictly needed.
  */
  static char   buf[LINEBUFSIZE] = { 0 };
  Array<double> dbl_vec;

  err = 0;

  assert(fh);
  assert(index >= 0);

  /* Already past the requested vector: start over via reset(). */
  if (idx > index) {
    reset();
  }

  /* A known count lets us reject out-of-range requests without I/O. */
  if ((count >= 0) && (count <= index)) {
    err = 1;
    return dbl_vec;
  }

  /* Read lines until vector number `index` is sitting in buf. */
  while (idx <= index) {
    char *s = fgets(buf, LINEBUFSIZE, fh);

    if (!s) {
      /* EOF or other error, probably means the caller tried to
      * read out of range.
      */
      if (count <= 0) {
        count = idx;
      } else {
        /* if this fails, it's NOT EOF and something's weird */
        assert(count == idx);
      }

      err = 1;
      return dbl_vec;
    }

    /* One pass over the line: cut it at the first ';' or newline,
    * turn commas into spaces, and note whether any digit shows up
    * before the cut.
    */
    int digit_found = 0;

    for (; *s; s++) {
      const char c = *s;

      if ((c == ';') || (c == '\n')) {
        *s = (char) 0;
        break;
      }

      if ((c >= '0') && (c <= '9')) {
        digit_found = 1;
      } else if (c == ',') {
        *s = ' ';
      }
    }

    /* Only lines carrying at least one digit count as vectors;
    * anything else is skipped and the next line is read.
    */
    if (digit_found) {
      idx++;
    }
  }


  /* The requested vector is now stored, as text, in buf. */

  if (dimensionality > 0) {
    /* Knowing the dimensionality up front lets us size the array
    * once rather than letting it resize on the fly, possibly
    * several times.  (0-dimensional vectors are never recognized,
    * since they contain no digits.)
    */
    dbl_vec.resize(dimensionality);
  }

  int   i     = 0;
  char *field = buf;

  while (*field) {
    /* skip the separators in front of the next field */
    while (*field && is_field_separator(*field)) field++;

    if (!(*field)) {
      /* ran off the end of the line */
      break;
    }

    double val = 0;

    if (sscanf(field, "%lf", &val) != 1) {
      /* A token that does not parse as a number is stored as 0,
      * which keeps the field count of the line intact.
      */
      val = 0;
    }

    dbl_vec[i++] = val;

    /* step over the token we just consumed */
    while (*field && !is_field_separator(*field)) field++;
  }


  if (dimensionality < 0) {
    /* first vector parsed: it defines the dimensionality */
    dimensionality = i;
  } else if (dimensionality != i) {
    fprintf(stderr, "Warning:  Dimensionality %d, vector %ld, conflicts with previous dimensionality %d.\n", i, index, dimensionality);
  }

  if (mask.getCount() > 0) {
    /* Project the vector onto the attributes listed in mask. */
    unsigned int  ct = mask.getCount();
    Array<double> masked_vec;

    masked_vec.resize(ct);

    for (unsigned int j = 0; j < ct; j++) {
      int attr = mask[j];

      if ((attr < 0) || (attr >= dimensionality)) {
        /* out-of-range entry: warn, and leave slot j as resize()
        * left it
        */
        fprintf(stderr, "Warning:  Dimensionality %d, attribute requested by mask %d\n", dimensionality, attr);
      } else {
        masked_vec[j] = dbl_vec[attr];
      }
    }

    dbl_vec = masked_vec;
  }

  return dbl_vec;
}


/* Sequential access for callers that just want to cycle through the
* vectors without choosing positions themselves.
*
* Parameters:
*   index - out: set to the position (the current idx) of the vector
*           being returned; left untouched when the end was already
*           reached.
*   err   - out: 0 on success; -1 when count is known and every
*           vector has been consumed; otherwise whatever get_object()
*           reports.
*
* Returns the vector from get_object(), or a default-constructed
* Array when the end was already reached.
*/
Array<double> Wrapper::get_next_object(long &index, int &err) {
  Array<double> result;
  const bool    count_known = (count >= 0);

  err = 0;

  if (count_known && !(idx < count)) {
    /* nothing left to hand out */
    err = -1;
  } else {
    index  = idx;
    result = get_object(index, err);
  }

  return result;
}


/* Report the dimensionality of the vectors in the file.
*
* If it is already known (dimensionality > 0) the cached value is
* returned.  Otherwise the current file position and idx are saved,
* the first vector is read as a sample, and position and idx are
* put back the way they were.
*
* Returns -1 when there is no file handle; otherwise the cached
* dimensionality or the element count of the sample vector.
*
* NOTE(review): the sample comes from get_next_object(), so with a
* non-empty mask its element count is the masked length, whereas
* later calls return the cached unmasked dimensionality -- confirm
* which of the two callers expect.
*/
int Wrapper::get_dimensionality(void) {
  if (!fh) {
    return -1;
  }

  if (dimensionality > 0) {
    return dimensionality;
  }

  const long current_fpos = ftell(fh);
  const long current_idx  = idx;

  assert(current_fpos != -1);

  long          ignored_idx  = 0;
  int           error_status = 0;
  Array<double> sample_vec;

  /* there should be at least ONE vector ready at the beginning --
  * we go there in case we're just at the end
  */
  reset();
  sample_vec = get_next_object(ignored_idx, error_status);

  assert(!error_status);

  /* Return to the original position.  The fseek() call must stay
  * outside assert(): with NDEBUG defined the assert expression is
  * not evaluated at all, which would leave the stream just past the
  * first vector while idx is restored below.
  */
  const int seek_failed = fseek(fh, current_fpos, SEEK_SET);

  assert(!seek_failed);
  (void) seek_failed;  /* otherwise unused when NDEBUG is defined */

  idx = current_idx;

  return sample_vec.getCount();
}



/* Number of vectors in the file.
*
* Returns -1 when there is no file handle, and the cached count when
* it is already known (count >= 0).  Otherwise the whole file is
* walked once from a reset() state -- ouch -- and reset() is called
* again afterwards.
*/
long Wrapper::get_count(void) {
  if (!fh) {
    return -1;
  }

  if (count >= 0) {
    return count;
  }

  long scratch_idx = 0;
  int  status      = 0;

  reset();

  /* The vectors themselves are thrown away; we only read until
  * get_next_object() reports an error.
  */
  for (;;) {
    get_next_object(scratch_idx, status);

    if (status) {
      break;
    }
  }

  reset();

  /* count should have been set upon hitting EOF */
  assert(count >= 0);

  return count;
}


