/* Copyright (c) 1993 by The Johns Hopkins University */



/* READERS.C:  Routines for reading input data for a variety of formats */


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "config.h"
#include "pebls.h"

/* Instance table filled in by the readers in this file; indexed by
   instance number. */
extern instance_type data[INSTANCES_MAX];
/* Run configuration: the readers take the data file name and the feature
   settings from it and store the resulting instance counts back into it. */
extern config_type CONFIG;





/* ------------------------------------------------------------ */
/* DISCRETIZE:  Convert a continuous value into a discrete      */
/*    (symbolic) value based upon the number of intervals       */
/*    configured for the feature.                               */
/* INPUTS:  feature = index of the feature (forced to 0 when    */
/*                    CONFIG.common_values is TRUE)             */
/*          symbol  = text of the continuous value              */
/* RETURNS: interval index in 0 .. discretes-1.  Values outside */
/*    the configured [min,max] range are clamped to the first   */
/*    or last interval; a degenerate configuration (fewer than  */
/*    one interval, or min == max) yields 0.                    */

int discretize(int feature, char *symbol)
{
    float value, min, max, discretes, scaled;
    int bins;

    if (CONFIG.common_values==TRUE) feature = 0;
    value = atof(symbol);
    min = CONFIG.cont[feature].min;
    max = CONFIG.cont[feature].max;
    bins = CONFIG.cont[feature].discretes;
    discretes = (float) bins;

    /* Nothing to scale against: avoid dividing by zero below. */
    if (bins < 1 || max == min) return (0);

    /* The upper bound belongs to the last interval, not to a new one. */
    if (value == max) return (bins - 1);

    scaled = discretes * (value - min) / (max - min);

    /* Clamp before converting: an out-of-range (or NaN) float-to-int  */
    /* conversion is undefined, and the result is used as an index.    */
    if (!(scaled >= 0)) return (0);
    if (scaled >= discretes) return (bins - 1);

    return ((int) scaled);
}




/* ------------------------------------------------------------ */
/* STANDARD_READER:  Loads data from file in "STANDARD" format  */
/*   <Class-Name>,  <Instance-Id>,  <feature1> <feature2> ....  */
/*                                                              */
/* Lines whose first character is '#' are skipped.  A line      */
/* whose first token is TRAIN or TEST (case-insensitive) only   */
/* switches the current section.  Every other non-empty line    */
/* becomes the next entry of data[]; it is counted as a         */
/* training or test instance according to the current section,  */
/* and as neither if no section keyword has been seen yet.      */
/* When CONFIG.value_spacing is set each token after the id is  */
/* one feature value; otherwise each character of such a token  */
/* is a separate feature value.                                 */
/* On return CONFIG.instances, CONFIG.training_instances and    */
/* CONFIG.test_instances hold the counts.                       */
/*                                                              */
/* NOTE(review): unlike subunit_to_instances, no check against  */
/* INSTANCES_MAX is made here, and the number of features per   */
/* line is not bounded either -- confirm the limits upstream.   */

void standard_reader(void)
{
    char delim[5];
    char line[LINE_MAX];
    char value[2];
    char *symbol;
    FILE *fptr;
    int  icount = -1;
    int  train_count = 0, test_count = 0;
    int  fcount;
    int  scount;
    int  i, len;
    int  training = FALSE, testing = FALSE;


    strcpy(delim, " ,\t\n");
    if ((fptr = fopen(CONFIG.data_file, "r")) == NULL)
      error(DATAFILE_ERR, NULL);

    while (fgets(line, LINE_MAX, fptr) != NULL)
    {
      if (line[0] != '#')
      {
        scount = 0; fcount = 0;
        symbol = strtok(line, delim);
        if (symbol != NULL)
        {
            if (strcasecmp(symbol,"TRAIN")==0)
                training = TRUE;
            else if (strcasecmp(symbol,"TEST")==0)
            {
                training = FALSE;
                testing = TRUE;
            }
            else
            {
                /* First token of a data line: start a new instance. */
                icount++;
                if (training) train_count++;
                else if (testing)  test_count++;
            }
        }

        /* Walk the tokens of a data line; a section keyword ends it. */
        while ((symbol != NULL) &&
               (strcasecmp(symbol,"TRAIN") != 0) &&
               (strcasecmp(symbol,"TEST") != 0))
        {
            if (scount==0) data[icount].class_true = classtab_lookup(symbol);
            else if (scount==1) strcpy(data[icount].id, symbol);
            else if (CONFIG.value_spacing)
            {
                /* The value is stored in, and looked up for, the same  */
                /* feature index; the index is advanced afterwards so   */
                /* that it is never read and modified in one expression. */
                if ((CONFIG.cont[fcount].enable == TRUE)  ||
                    ((CONFIG.common_values == TRUE) &&
                     (CONFIG.cont[0].enable == TRUE)))
                    data[icount].value[fcount] = discretize(fcount, symbol);
                else
                    data[icount].value[fcount] = symtab_lookup(fcount, symbol);
                fcount++;
            }
            else
            {
                /* No spacing between values: one character per feature. */
                len = strlen(symbol);
                value[1] = '\0';
                for (i=0; i<len; i++)
                {
                    value[0] = symbol[i];
                    data[icount].value[fcount] = symtab_lookup(fcount, value);
                    fcount++;
                }
            }
            symbol = strtok(NULL, delim);
            scount++;
        }
      }
    }

    CONFIG.instances = icount + 1;
    CONFIG.training_instances = train_count;
    CONFIG.test_instances = test_count;
    fclose(fptr);
}







/* ------------------------------------------------------------ */
/* SUBUNIT_TO_INSTANCES: Convert a segment to a sequence of     */
/* instances, one per position of the segment.                  */
/* INPUTS:  subunit = the subunit                               */
/*          length =  length of the segment                     */
/*          training = flag indicating train or test instance   */
/*                                                              */
/* The instance for position p receives the subunit id, the     */
/* class at p, offset p, and the values at positions            */
/* p-half .. p+half, where half = (CONFIG.features-1)/2.        */
/* Positions that fall outside the segment are stored as 0.     */
/* New instances are appended to data[] after the existing      */
/* CONFIG.instances entries and the CONFIG counters are written */
/* back once the whole segment has been processed.              */


void subunit_to_instances(subunit_type *subunit, int length, int training)
{
    int pos, src, slot;
    int total   = CONFIG.instances;
    int n_train = CONFIG.training_instances;
    int n_test  = CONFIG.test_instances;
    int half    = (CONFIG.features-1) / 2;

    pos = 0;
    while (pos < length)
    {
        /* Refuse to write past the end of the instance table. */
        if (total >= INSTANCES_MAX) error(INSTANCES_ERR, NULL);

        strcpy(data[total].id, subunit->id);
        data[total].offset = pos;
        data[total].class_true = subunit->class[pos];

        /* Copy the window centred on pos, padding with 0 at the edges. */
        slot = 0;
        for (src = pos - half; src <= pos + half; src++)
        {
            if ((src >= 0) && (src < length))
                data[total].value[slot] = subunit->value[src];
            else
                data[total].value[slot] = 0;
            slot++;
        }

        if (training) n_train++;
        else n_test++;

        total++;
        pos++;
    }

    CONFIG.instances = total;
    CONFIG.training_instances = n_train;
    CONFIG.test_instances = n_test;
}




/* ------------------------------------------------------------ */
/* SUBUNIT_READER:  Loads data from file in "SUBUNITS" format   */
/*  (See Documentation)                                         */
/*                                                              */
/* Each line is split into at most two tokens that matter:      */
/*   # ...          ignored (first token begins with '#')       */
/*   TRAIN / TEST   following subunits are training / test data */
/*   BEGIN [id]     start a new subunit with the given id       */
/*   END            hand the subunit to subunit_to_instances    */
/*   <value> <class>  append one position to the subunit        */
/* Keywords are matched case-insensitively.                     */
/*                                                              */
/* The reader starts with an empty subunit (id "", length 0) in */
/* test mode, so an END or a data line that precedes any BEGIN, */
/* TRAIN or TEST line no longer reads uninitialized variables.  */
/* The initial mode FALSE matches standard_reader's initial     */
/* "training = FALSE".                                          */
/*                                                              */
/* NOTE(review): a data line with a single token passes a NULL  */
/* class name to classtab_lookup, and "length" is not checked   */
/* against the capacity of the subunit arrays -- confirm both.  */

void subunit_reader(void)
{
    FILE *fptr;
    char line[LINE_MAX];
    char delim[5];
    char *token1, *token2;
    int length = 0;
    subunit_type subunit;
    int  training = FALSE;

    strcpy(delim, " \t\n,");
    strcpy(subunit.id, "");

    if ((fptr = fopen(CONFIG.data_file, "r")) == NULL)
      error(DATAFILE_ERR, NULL);

    while (fgets(line, LINE_MAX, fptr) != NULL)
    {

        token1 = strtok(line, delim);
        if (token1)
        {
            token2=strtok(NULL, delim);
            if (*token1 == '#') /* do nothing */ ;
            else if (strcasecmp(token1, "BEGIN")==0)
            {
                if (token2) strcpy(subunit.id, token2);
                else strcpy(subunit.id,"");
                length = 0;
            }

            else if (strcasecmp(token1, "END")==0)
                subunit_to_instances(&subunit, length, training);

            else if (strcasecmp(token1, "TRAIN")==0)
                training = TRUE;

            else if (strcasecmp(token1, "TEST")==0)
                training = FALSE;

            else
            {
                subunit.value[length] = symtab_lookup(length, token1);
                subunit.class[length] = classtab_lookup(token2);
                length++;
            }
        }
    }
    fclose(fptr);
}




