/* Performance testing of matrix multiplies */
#include <stdio.h>
#include <stdlib.h>
#include "csr.h"
#include "ftime.h"

/* Maximum number of tests */
#define MAX_TEST 100

/* Global flags to determine which version of each multiply to use.
   Both are set by set_test and read by matrix_test / dummy_test. */
static int gfast = 0;           /* Forwarded as the last argument of the multiply */
static matrix_t gtype = DENSE;  /* Selects the row of data[][] to multiply */

/* Parameters of the current data set, filled in by gen_data */
static int ntest = 0;     /* Number of matrices of each type */
static int nrow = 0;      /* Row count handed to the generators */
static int nentries = 0;  /* density * nrow * nrow, rounded to nearest */

/* Operand vector and product buffer shared by every multiply */
ftype_t *vec, *prod;
/* Test matrices, indexed by matrix type and then by test number */
csr_ptr data[TCOUNT][MAX_TEST];
/* calloc is declared by <stdlib.h>; no local declaration is needed */

static void gen_data(int acount, int anrow, float density)
{
  int i;
  nrow = anrow;
  ntest = acount;
  nentries = (int) (density * nrow * nrow + 0.5);
  vec = rvec(nrow);
  prod = calloc(nrow, sizeof(ftype_t));
  for (i = 0; i < acount; i++) {
    csr_ptr m = gen_dense_matrix(nrow, nentries);
    data[DENSE][i] = m;
    data[UNSCALED][i] = retype_matrix(m, UNSCALED);
    data[SCALED][i] = retype_matrix(m, SCALED);
    /* Make sure multiply routines are OK */
    if (!test_mult(m, vec))
      fprintf(stderr, "Oops.  Better fix this before benchmarking!\n");
  }
}

/* Choose the matrix type and the multiply variant that the next calls to
   matrix_test / dummy_test will use. */
static void set_test(matrix_t type, int fast)
{
  gtype = type;
  gfast = fast;
}

/* Timed routine: multiply each of the ntest matrices of the currently
   selected type by vec, writing the result into prod. */
static void matrix_test(void)
{
  int k = 0;
  while (k < ntest) {
    csr_mult(data[gtype][k], vec, prod, gfast);
    k++;
  }
}

/* Baseline routine: same loop and arguments as matrix_test, but calling
   csr_dummy_mult.  Its time is subtracted from the real measurement by
   time_funct (presumably to cancel loop/call overhead -- confirm in csr). */
static void dummy_test(void)
{
  int k = 0;
  while (k < ntest) {
    csr_dummy_mult(data[gtype][k], vec, prod, gfast);
    k++;
  }
}
	

/* Tolerance handed to ftime for every measurement */
#define ERRTOL 0.01
/* Compatible with old version of code.
   Measures f, subtracts the measurement of dummy when one is supplied,
   and returns the difference scaled by 1e6 (microseconds, assuming ftime
   reports seconds). */
static float time_funct(test_funct f, test_funct dummy) {
  /* f is always measured before dummy */
  double elapsed = ftime(f, ERRTOL);
  if (dummy != NULL)
    elapsed -= ftime(dummy, ERRTOL);
  return elapsed * 1e6; /* Convert to usecs */
}

main (int argc, char *argv[])
{
  float times[TCOUNT][2];
  float clocks[TCOUNT][2];
  double mhz;
  int acount = 5;
  int anrow = 10;
  matrix_t t;
  int o;
  float adensity = 1.0;
  /* Arguments: count, nrow, density, [mhz] */
  if (argc > 1 && (argc < 4 || argc > 5)) {
    fprintf(stderr, "Usage: %s count nrow density [mhz]\n", argv[0]);
    exit(0);
  }
  if (argc > 1) {
    sscanf(argv[1], "%d", &acount);
    sscanf(argv[2], "%d", &anrow);
    sscanf(argv[3], "%f", &adensity);
  }
  if (argc > 4)
    sscanf(argv[4], "%f", &mhz);
  else
    mhz = freq();
  if (adensity < 0.0 || adensity > 1.0) {
    fprintf(stderr, "Invalid density: %f\n", adensity);
    exit(1);
  }
  printf("%d matrices, each %d X %d and %f density\n",
	 acount, anrow, anrow, adensity);
  printf("Generating and testing data ... "); 
  gen_data(acount, anrow, adensity);
  printf(" ... done\n");
  for (t = 0; t < TCOUNT; t++) {
    printf("Matrix type %d. ", t);  
    for (o = 0; o < 2; o++) {
      set_test(t, o);
      times[t][o] = time_funct(matrix_test, dummy_test);
      printf(" Done. ");
      clocks[t][o] = mhz * times[t][o] / (float) (nentries * acount);
      printf("%fMHZ * %fUS / (%d * %d) = %f\n",
	     mhz, times[t][o], nentries, acount, clocks[t][o]);
    }
    printf("\n");
  }
  /* Display results */
  printf("Multiplying %d X %d matrices with %f density at %f MHZ\n",
	 anrow, anrow, adensity, mhz);
  printf("Times in microseconds:\n");
  printf("\tDense (S) %f (F) %f\n\tUnscaled (S) %f (F) %f\n\tScaled (S) %f (F) %f\n",
	 times[DENSE][0], times[DENSE][1], 
	 times[UNSCALED][0], times[UNSCALED][1], 
	 times[SCALED][0], times[SCALED][1]);
  printf("Approx. clocks per element\n");
  printf("\tDense (S) %f (F) %f\n\tUnscaled (S) %f (F) %f\n\tScaled (S) %f (F) %f\n",
	 clocks[DENSE][0], clocks[DENSE][1], 
	 clocks[UNSCALED][0], clocks[UNSCALED][1], 
	 clocks[SCALED][0], clocks[SCALED][1]);

  printf("Condensed benchmark numbers (MHz, microseconds, cycles)\n");
  printf("RESULT\t%f\t%f\t%f\n", mhz, times[UNSCALED][1], clocks[UNSCALED][1]);
  exit(0);

}
