/*
 * VqLBG.c
 *
 * Program to generate the vq codebook using the LBG algorithm
 *
 * Author: Johan Schalkwyk at OGI, April 1993
 *
 */

/* Standard C Library and POSIX include file directives */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Speech library include file directives */
#include <speech.h>


/* global constant definitions */

/* LBG_Codebook stops once the distortion improves by less than this */
#define TINY 1.0e-7

/*
 * Byte-order reversal of 16-bit (SWAPW) and 32-bit (SWAPL) quantities.
 * Every shifted term is masked so the result never carries bits outside
 * the swapped width.  Arguments are evaluated more than once: do not
 * pass expressions with side effects.
 */
#define SWAPW(x)	((((x)<<8) & 0xFF00) | (((x)>>8) & 0x00FF))
#define SWAPL(x)	((((x)<<24)&0xFF000000) | (((x)<<8)&0x00FF0000) | \
			 (((x)>>8) & 0x0000FF00) | (((x)>>24)&0x000000FF))

/* default suffixes appended to the base name by get_comline() */
#define FEATURE ".vec"
#define CODE    ".code"

/*
 * Pseudo random double in the closed interval [0, 1] (1.0 is returned
 * when rand() == RAND_MAX).  The whole expansion is parenthesized so it
 * can be combined safely with any neighbouring operator.
 */
#define rand_val()      (((double) rand())/((double) RAND_MAX))


/* global variable declarations */

/* headorder is the byte-order flag read from a file header; machineorder */
/* is the flag of this machine, assigned from LittleIndian() in get_data  */
char   headorder, machineorder;             /* big or little endian order  */
char   byteswap;                            /* nonzero: swap bytes on read */

/* mean squared norm of the input vectors, computed in get_data; no other */
/* function visible in this file reads it                                  */
float  signal_power;                        /* average input vector power  */

/* NOTE(review): count and previous are not referenced by any function    */
/* visible in this file (LBG_Codebook declares its own local `count`)     */
int    *count;                              /* codebook sized vectors      */
float  **previous;                          /* and matrices                */


/* command line parameter global variables */
/* no_of_inputs and ip_dimension are read from the feature file header;   */
/* Load_Codebook also overwrites numcv and ip_dimension from the codebook */
int  no_of_inputs;                          /* number of feature vectors   */
int  ip_dimension;                          /* dimension of feature vector */
int  numcv = 64;                            /* number of codes to generate */
int  iters = 100;                           /* number of iterations        */
int  trainflag = 0;                         /* 0: start over, 1: continue  */
int  rnd = 123;                             /* random number seed (srand)  */
int  out_flag=0;                            /* print distortion on screen  */
int  test_flag=0;                           /* test only: do not save      */

char  FeatureFile[64] = "";                 /* feature vector file         */
char  CodeFile[64] = "";                    /* code book file name         */
char  BaseName[64] = "";                    /* base name (positional arg)  */


/*
 * eerror(s)
 *
 * Fatal error handler: writes "Error: <s>" followed by a newline to
 * stderr and terminates the program.  This function never returns.
 *
 * s (in): pointer to the message text
 *
 */

void eerror(char *s)
{
  const int failure_status = -1;    /* exit status handed to exit() */

  fprintf(stderr, "Error: %s\n", s);
  exit(failure_status);
}


/*
 * usage()
 *
 * Write the command line synopsis to stderr and leave the program with
 * exit status 1.  The -n and -i defaults shown are the current values
 * of the globals numcv and iters.
 *
 */

void usage() 
{
  /* adjacent literals are concatenated: the whole help text is one format */
  fprintf(stderr,
	  "VQ_LBG [options] {basename}\n"
	  "\n"
	  "{basename}: base name of feature vector and code book\n"
	  "\n"
	  "Where options can be any of the following:\n"
	  "\t -f name of feature vector file     [basename.vec]\n"
	  "\t -c name of code book file          [basename.code]\n"
	  "\t -n number of vectors in codebook   [%d]\n"
	  "\t -i maximum number of iterations    [%d]\n"
	  "\t -s continue training               [start over]\n"
	  "\t -r random number seed              [123]\n"
	  "\t -p display error measure on screen [do not]\n"
	  "\t -t test codebook only              [train]\n"
	  "\t -h This help message\n",
	  numcv, iters);
  exit(1);
}


/*
 * get_comline(int argc, char **argv)
 *
 * read and interpret command line
 *
 */

void get_comline(int argc, char **argv)
{
  int c;
  extern int optind;
  extern char *optarg;

  while( (c = getopt( argc, argv, "f:c:n:i:sr:tph")) != -1 ) {
    switch( c )
      {
      case 'f': /* feature vector file name */
	strcpy(FeatureFile, optarg);
	break;
	
      case 'c': /* code book file name */
	strcpy(CodeFile, optarg);
	break;

      case 'n': /* number of vectors in codebook */
	numcv = atoi(optarg);
	break;

      case 'i': /* maximum number of iterations after splitting */
	iters = atoi(optarg);
	break;

      case 's': /* training flag */
	trainflag = 1;
	break;

      case 'r': /* random number generator */
	rnd = atoi(optarg);
	break;

      case 'p': /* display output error measure on screen */
	out_flag = 1;
	break;
	
      case 't': /* test codebook only */
	test_flag = 1;
	break;

      case 'h': /* help */
      default:
	usage();
	break;
      }
  }
  
  if ((argc - optind) != 1)
    usage();
  
  strcpy(BaseName, argv[optind]);

  /* create default options */
  if (strlen(FeatureFile)==0) {
    strcpy(FeatureFile, BaseName);
    strcat(FeatureFile, FEATURE);
  }

  if (strlen(CodeFile)==0) {
    strcpy(CodeFile, BaseName);
    strcat(CodeFile, CODE);
  }
}


/*
 * float SWAPF(inval)
 *
 * Reverse the byte order of a floating point value, converting between
 * little and big endian representation.  The float is reinterpreted as
 * an unsigned int through a union and its four bytes are mirrored.
 *
 * NOTE(review): assumes float and unsigned int are both 32 bits wide.
 *
 * inval (in): input floating point value
 *
 * On exit returns the byte swapped value
 *
 */

float SWAPF (float inval)
{
  union {
    unsigned int bits;
    float        value;
  } pun;
  unsigned int word;

  pun.value = inval;
  word = pun.bits;

  /* byte 0 <-> byte 3, byte 1 <-> byte 2 */
  pun.bits = ((word << 24) & 0xFF000000) |
             ((word <<  8) & 0x00FF0000) |
             ((word >>  8) & 0x0000FF00) |
             ((word >> 24) & 0x000000FF);

  return (pun.value);
}

/*
 * int winning_cell(input_matrix, vect, diffvect, I, numcv, ip_dimension)
 *
 * Find the codebook vector closest to the I'th input vector.
 *
 * For every code vector v the score  v.x - |v|^2 / 2  is evaluated;
 * the largest score marks the code with the smallest squared Euclidean
 * distance to x, because |x - v|^2 = |x|^2 - 2 (v.x - |v|^2 / 2).  On a
 * tie the lowest index wins.  The running maximum is kept while the
 * scores are computed, so no temporary storage is allocated.
 *
 * input_matrix (in): data matrix
 * vect         (in): current codebook vectors
 * diffvect    (out): *diffvect receives input - winning code vector
 *                    (ip_dimension elements)
 * I            (in): index of the input vector to classify
 * numcv        (in): number of codebook vectors
 * ip_dimension (in): dimension of each vector
 *
 * Returns the index of the winning code vector, or -1 (leaving
 * *diffvect untouched) when numcv is not positive.
 *
 */

int winning_cell(float **input_matrix, float **vect, float **diffvect, 
		 int I, int numcv, int ip_dimension)
{
  int   i,  j,  w_cell = 0;
  float vx, vv, score, max = 0.0;
  const float *x;

  if (numcv <= 0)
    return (-1);

  x = input_matrix[I];

  for (i = 0; i < numcv; i++) {
    vv = 0.0;
    vx = 0.0;
    for (j = 0; j < ip_dimension; j++) {
      vv += vect[i][j] * vect[i][j];
      vx += vect[i][j] * x[j];
    }
    score = vx - (vv / 2.0);

    /* the first code seeds the maximum; later ones must beat it strictly */
    if (i == 0 || score > max) {
      max = score;
      w_cell = i;
    }
  }

  for (j = 0; j < ip_dimension; j++)
    (*diffvect)[j] = x[j] - vect[w_cell][j];

  return (w_cell);
}
 

/*
 * float ** get_data(cvfile)
 *
 * Allocate memory for, and read in, the feature vector file.
 *
 * File layout as read here: two decimal integers (number of vectors,
 * vector dimension), each followed by white space, one byte holding the
 * byte order flag of the writer, then number * dimension raw floats.
 * Floats are byte swapped when the flag differs from the flag of this
 * machine as reported by LittleIndian().
 *
 * Side effects: sets the globals no_of_inputs, ip_dimension, headorder,
 * machineorder, byteswap and signal_power (the mean squared norm of the
 * input vectors).  Any failure - unopenable file, malformed header,
 * allocation failure, truncated data - ends the program via eerror().
 *
 * cvfile (in): name of code book training vector file
 *
 * Returns the no_of_inputs x ip_dimension data matrix obtained from
 * Alloc2d; the caller releases it.
 *
 */

float **get_data(char *cvfile)
{
  int    iin, il;
  float  **data;
  FILE   *fp;
  float  buffer;
    
  if (!(fp = fopen(cvfile, "rb")))
    eerror("getdata: can't open input file\n");

  /* number of vectors in file, then dimension of feature vector */
  if (fscanf(fp, "%d\n", &no_of_inputs) != 1 ||
      fscanf(fp, "%d\n", &ip_dimension) != 1)
    eerror("get_data: malformed feature file header");
  if (no_of_inputs < 1 || ip_dimension < 1)
    eerror("get_data: vector count and dimension must be positive");
  if (fread(&headorder, sizeof(char), 1, fp) != 1)
    eerror("get_data: missing byte order flag");

  machineorder = LittleIndian();
  byteswap = (((headorder) && (!machineorder)) ||
	      ((!headorder) && (machineorder)));
  
  if (!(data = (float **) Alloc2d(no_of_inputs, ip_dimension, sizeof(float))))
    eerror("get_data --> data Alloc2d failed");
  
  for (iin = 0; iin < no_of_inputs; iin++) {
    for (il = 0; il < ip_dimension; il++)	{
      /* fread returns the number of items read, never EOF */
      if (fread(&buffer, sizeof(float), 1, fp) != 1)
	eerror("error: Insufficient data");
      if (byteswap) 
	buffer = SWAPF(buffer);
      data[iin][il] = buffer;
    }
  }

  fclose(fp);
  
  signal_power = 0.0;
  for (iin = 0; iin < no_of_inputs; iin++) {
    for (il = 0; il < ip_dimension; il++) {
      signal_power += data[iin][il] * data[iin][il];
    }
  }
    
  signal_power /= no_of_inputs;
  
  return (data);
}


/*
 * print_op(cvfile, vect, numcv, ip_dimension)
 *
 * Save the code book vectors to disk.
 *
 * The file starts with two text lines (number of vectors, dimension),
 * followed by one byte holding the global machineorder flag and then
 * the vectors as raw floats in the byte order of this machine.  The
 * stream is opened in binary mode so the float payload is never subject
 * to text mode newline translation.
 *
 * Any open, write or close failure ends the program via eerror().
 *
 * cvfile       (in): name of the code book vector file
 * vect         (in): pointer to the codebook vectors
 * numcv        (in): number of vectors in code book
 * ip_dimension (in): dimension of each vector
 *
 */

void print_op(char *cvfile, float **vect, int numcv, int ip_dimension)
{
  int  i, j;                       /* loop counter variables        */
  FILE *cdvec;                     /* output file stream            */
  char errmsg[100];
  
  if (!(cdvec = fopen(cvfile, "wb"))) {
    /* bounded: a long file name is truncated instead of overrunning errmsg */
    snprintf(errmsg, sizeof(errmsg), "Can't open codebook file %s\n", cvfile);
    eerror(errmsg);
  }
  
  if (fprintf(cdvec, "%d\n", numcv) < 0 ||
      fprintf(cdvec, "%d\n", ip_dimension) < 0 ||
      fwrite(&machineorder, sizeof(char), 1, cdvec) != 1)
    eerror("print_op: can't write codebook header");

  for (i = 0; i < numcv; i++) {
    for (j = 0; j < ip_dimension; j++) {
      if (fwrite(&vect[i][j], sizeof(float), 1, cdvec) != 1)
	eerror("print_op: can't write codebook vectors");
    }
  }
  
  /* fclose flushes the buffered data: a failure here means data was lost */
  if (fclose(cdvec) != 0)
    eerror("print_op: can't close codebook file");
}


/*
 * Print_Vec(Vector, dimension)
 *
 * Write the elements of a vector on a single line of stdout, each one
 * formatted as %7.4f and followed by two blanks; the line is ended by
 * a newline.
 *
 * Vector    (in): pointer to vector of type float
 * dimension (in): number of elements to print
 *
 */

void Print_Vec(float *Vector, int dimension)
{
  float *cursor = Vector;          /* next element to print     */
  int   remaining = dimension;     /* elements still to print   */

  while (remaining > 0) {
    printf("%7.4f  ", *cursor);
    cursor++;
    remaining--;
  }

  printf("\n");
}



/*
 * float dotvec(vector1, vector2, dimension)
 *
 * Inner product of two vectors of equal dimension.  The products are
 * accumulated in element order in a float; the result is 0 when
 * dimension is not positive.
 *
 * vector1, vector2 (in): input vectors
 * dimension        (in): number of elements in each vector
 *
 */

float dotvec (float *vector1, float *vector2, int dimension)
{
  float *lhs = vector1;
  float *rhs = vector2;
  float acc = 0.0;
  int   remaining;

  for (remaining = dimension; remaining > 0; remaining--) {
    acc += *lhs * *rhs;
    lhs++;
    rhs++;
  }
  
  return (acc);
}


/*
 * Centroid(centvec, data, labelvec, numdata, dimension, label)
 *
 * Compute the centroid of one class: the mean of all data vectors
 * whose entry in labelvec equals label.  When no vector carries the
 * label a message is written to stderr and the program exits with
 * status 1.
 *
 * centvec  (out): centroid vector pointer of type float
 * data      (in): pointer to 2D array of data vectors
 * labelvec  (in): label of each data vector
 * numdata   (in): number of data points available
 * dimension (in): dimensionality of input vectors
 * label     (in): the label to find the centroid of
 *
 */

void Centroid (float *centvec, float **data, int *labelvec,
	       int numdata, int dimension, int label)
{
  int sample, dim;
  int members = 0;                 /* vectors found with this label */

  /* start from a zero accumulator */
  for (dim = 0; dim < dimension; dim++)
    centvec[dim] = 0.0;

  /* add up every vector that belongs to the class */
  for (sample = 0; sample < numdata; sample++) {
    if (labelvec[sample] != label)
      continue;

    for (dim = 0; dim < dimension; dim++)
      centvec[dim] += data[sample][dim];

    members++;
  }

  if (!members) {
    fprintf(stderr, "Splitting caused an outlier, too much splitting\n");
    exit(1);
  }

  /* turn the sums into means */
  for (dim = 0; dim < dimension; dim++)
    centvec[dim] /= (float) members;
}


/*
 * LBG_Codebook(codebook, data, numcodes, numdata, dimension, iterations)
 *
 * Refine a code book in batch mode.  Each pass
 *   1. labels every data vector with its winning code (winning_cell)
 *      and accumulates the squared quantisation error,
 *   2. normalises the error by numdata * dimension and prints it when
 *      the global out_flag is set,
 *   3. stops when the error improved by less than TINY with respect to
 *      the previous pass,
 *   4. otherwise replaces every code vector by the centroid of the data
 *      labelled with it (Centroid exits the program for an empty class).
 *
 * The previous error starts at infinity so the first pass is always
 * followed by a centroid update, however large the initial distortion.
 *
 * codebook (in/out): matrix of code vectors, refined in place
 * data         (in): data to be vector quantized
 * numcodes     (in): number of code vectors
 * numdata      (in): number of data points in data set
 * dimension    (in): dimension of code vector
 * iterations   (in): maximum number of passes to perform
 *
 */

void LBG_Codebook(float **codebook, float **data, int numcodes, 
		  int numdata, int dimension, int iterations)
{
  int   i, j, k;                          /* loop counter variables      */
  int   pass = 0;                         /* completed passes            */
  int   *labels;                          /* winning code per data point */
  float *dist, sum;                       /* error vector, distortion    */
  float prev = HUGE_VAL;                  /* distortion of previous pass */

  if (!(labels = (int *) malloc(numdata * sizeof(int))))
    eerror("LBG_Codebook --> labels malloc failed");
  if (!(dist   = (float *) malloc(dimension * sizeof(float))))
    eerror("LBG_Codebook --> dist malloc failed");

  /* loop until converged or the pass budget is used up */
  while (pass<iterations) {

    /* compute winning cells */
    sum = 0.0;
    for (j=0; j<numdata; j++) {
      labels[j] = winning_cell(data, codebook, &dist, j, numcodes, dimension);
      for (k=0; k<dimension; k++)
	sum += dist[k] * dist[k];
    }

    sum /= ((float) numdata)*((float) dimension);
    if (out_flag)
      printf("distortion: %f\n", sum);
    if (prev-sum<TINY)
      break;
    
    for (i=0; i<numcodes ; i++) 
      Centroid(codebook[i], data, labels, numdata, dimension, i);
    
    pass++;
    prev = sum;
  }

  free((char *) labels);
  free((char *) dist);
}


/*
 * Load_Codebook(filename, codebook)
 *
 * Load a previously saved codebook for further training.
 *
 * The file layout is the one written by print_op: two text lines
 * (number of codes, dimension), one byte order flag, then the raw
 * floats.  Floats are byte swapped when the flag in the file differs
 * from the global machineorder.
 *
 * The codebook matrix is assumed to have been allocated with the
 * current global numcv rows of ip_dimension floats (as main does), so
 * the header is validated against those globals BEFORE anything is
 * stored: a file holding more codes than allocated, or vectors of
 * another dimension, is rejected instead of overrunning the matrix.
 * A file holding fewer codes is accepted and lowers the global numcv.
 *
 * Side effects: sets the globals numcv, headorder and byteswap.  Every
 * failure prints a message on stderr and exits with status 1.
 *
 * filename  (in): name of codebook file to read in
 * codebook (out): pointer to codebook matrix
 *
 */

void Load_Codebook(char *filename, float **codebook)
{
  int  i, j;                         /* loop counter variables       */
  int  file_codes, file_dim;         /* sizes announced by the file  */
  FILE *cdvec;

  if (!(cdvec = fopen(filename, "rb"))) {
    fprintf(stderr, "Error: Load_Codebook:cdvec\n");
    exit(1);
  }

  if (fscanf(cdvec, "%d\n", &file_codes) != 1 ||
      fscanf(cdvec, "%d\n", &file_dim) != 1) {
    fprintf(stderr, "Error: Load_Codebook: malformed codebook header\n");
    exit(1);
  }
  if (out_flag) {
    printf("Number of codes: %d\n", file_codes);
    printf("Dimensionality : %d\n", file_dim);
  }

  if (file_codes < 1 || file_codes > numcv) {
    fprintf(stderr, "Error: Load_Codebook: file holds %d codes, "
	    "room for %d (see option -n)\n", file_codes, numcv);
    exit(1);
  }
  if (file_dim != ip_dimension) {
    fprintf(stderr, "Error: Load_Codebook: codebook dimension %d does "
	    "not match feature dimension %d\n", file_dim, ip_dimension);
    exit(1);
  }
  numcv = file_codes;

  if (fread(&headorder, sizeof(char), 1, cdvec) != 1) {
    fprintf(stderr, "Error: Load_Codebook: missing byte order flag\n");
    exit(1);
  }
  byteswap = (((headorder) && (!machineorder)) ||
	      ((!headorder) && (machineorder)));

  for (i=0; i<numcv; i++)
    for (j=0; j<ip_dimension; j++) {
      if (fread(&codebook[i][j], sizeof(float), 1, cdvec) != 1) {
	fprintf(stderr, "Error: Load_Codebook: insufficient data\n");
	exit(1);
      }
      if (byteswap) 
	codebook[i][j] = SWAPF(codebook[i][j]);
    }

  fclose(cdvec);
}


/*
 * Generate_RndVec(rndvec, numcodes, numpoints)
 *
 * Fill rndvec with numcodes pairwise distinct pseudo random indices,
 * each in the range 0 .. numpoints-1.  The generator is seeded with the
 * global rnd, so a given seed always yields the same selection.
 *
 * A candidate is compared only with the entries already chosen, which
 * keeps index 0 selectable, and a candidate equal to numpoints (which
 * rand_val() yields when rand() == RAND_MAX) is drawn again.  Asking
 * for more distinct indices than there are points cannot succeed: the
 * program then exits with status 1 instead of looping forever.
 *
 * rndvec  (out): output vector of indices
 * numcodes (in): number of indices to generate
 * numpoints(in): number of points to choose from
 *
 */

void Generate_RndVec (int *rndvec, int numcodes, int numpoints)
{
  int i, j;
  int rejected;
  int index;

  if (numcodes > numpoints) {
    fprintf(stderr, "Error: cannot pick %d distinct vectors out of %d\n",
	    numcodes, numpoints);
    exit(1);
  }

  srand(rnd);
  
  for (j=0; j<numcodes; j++) {
    do {
      index = (int) (rand_val()*((double) numpoints));

      /* out of range, or already used by one of the first j entries? */
      rejected = (index >= numpoints);
      for (i=0; i<j && !rejected; i++)
	rejected = (rndvec[i] == index);
    } while (rejected);

    rndvec[j] = index;
  }
}
  

/*
 * main controlling body
 *
 * 1. parse the command line and read the feature vectors
 * 2. obtain the initial code book: distinct randomly chosen input
 *    vectors when starting over, the saved code book when continuing
 *    (-s) or testing (-t)
 * 3. run the LBG iterations
 * 4. save the code book, unless in test mode
 *
 * Test mode (-t) forces a single pass over the loaded code book with
 * the distortion printed on screen.
 *
 * Returns 0 on success; every failure exits inside the helper that
 * detected it.
 *
 */

int main(int argc,  char **argv)
{
  int    j, k, l;
  float  **codevect;
  float  **input_matrix;
  int    *rand_vect;

  /* initializations */
  get_comline(argc, argv);

  input_matrix = get_data(FeatureFile);
  if (!(codevect = (float **) Alloc2d(numcv, ip_dimension,  sizeof(float))))
    eerror("main: codevect --> Alloc2d failed");
  
  if (!(rand_vect = (int *) calloc(numcv, sizeof(int)))) {
    fprintf(stderr, "error allocating memory rand_vect\n");
    exit(1);
  }
  
  if (test_flag) {
    iters = 1;
    out_flag = 1;
    trainflag = 1;
  }

  if (!trainflag) {
    /* seed the code book with distinct input vectors */
    Generate_RndVec(rand_vect, numcv, no_of_inputs);

    for (j=0; j<numcv; j++) {
      k = rand_vect[j];
      for (l=0; l<ip_dimension; l++)
	codevect[j][l] = input_matrix[k][l];
    }
  } else
    Load_Codebook(CodeFile, codevect);

  LBG_Codebook(codevect, input_matrix, numcv, no_of_inputs,  
	       ip_dimension, iters);

  /* save codebook to disk */
  if (!(test_flag))
    print_op(CodeFile, codevect, numcv, ip_dimension);


  Free2d((char **) input_matrix);
  Free2d((char **) codevect);
  free((char *) rand_vect);

  return (0);
}


