/* Test of loop unrolling & software pipelining */
#include <stdio.h>
#include "ftime.h"

/* Number of elements in each of the two test arrays. */
#define CNT 256
/* Working arrays: the kernels in this file read b[] and write a[]. */
static double a[CNT], b[CNT];
/* Scalar multiplier applied to every element of b[]. */
static double c = 3.14159;

/* Baseline kernel: a[i] = b[i] * c for every i in [0, CNT), written with
   plain array indexing and no manual unrolling. */
static void loop(void)
{
  int idx = 0;
  while (idx < CNT) {
    a[idx] = b[idx] * c;
    idx++;
  }
}

static void loop1(void)
{
  double *anext = a;
  double *bnext = b;
  double *bdone = b+CNT;
  double tc = c;
  while (bnext < bdone) 
    *anext++ = *bnext++ * tc;
}

/*
 * Kernel unrolled by two: computes a[i] = b[i] * c for every i in
 * [0, CNT), handling two elements per iteration.
 *
 * Each iteration reads bnext[0] and bnext[1] with no remainder handling,
 * so the loop stays in bounds only when CNT is even (CNT is 256 here).
 */
static void loop2(void)
{
  double *anext = a;
  double *bnext = b;
  double *bdone = b+CNT;   /* one past the last element of b[] */
  double tc = c;           /* local copy of the multiplier */
  while (bnext < bdone) {
    /* Both loads are issued before either multiply/store. */
    double b0 = bnext[0];
    double b1 = bnext[1];
    bnext += 2;
    anext[0] = b0 * tc;
    anext[1] = b1 * tc;
    anext += 2;
  }
}

/*
 * Kernel unrolled by four: computes a[i] = b[i] * c for every i in
 * [0, CNT), handling four elements per iteration.
 *
 * Each iteration reads bnext[0..3] with no remainder handling, so the
 * loop stays in bounds only when CNT is a multiple of 4 (CNT is 256 here).
 */
static void loop4(void)
{
  double *anext = a;
  double *bnext = b;
  double *bdone = b+CNT;   /* one past the last element of b[] */
  double tc = c;           /* local copy of the multiplier */
  while (bnext < bdone) {
    /* All four loads are issued before any multiply/store. */
    double b0 = bnext[0];
    double b1 = bnext[1];
    double b2 = bnext[2];
    double b3 = bnext[3];
    bnext += 4;
    anext[0] = b0 * tc;
    anext[1] = b1 * tc;
    anext[2] = b2 * tc;
    anext[3] = b3 * tc;
    anext += 4;
  }
}

/*
 * Hand-pipelined kernel: computes a[i] = b[i] * c for every i in
 * [0, CNT), with the store, multiply and load of three consecutive
 * elements overlapped in one loop iteration.
 *
 * On entry to the loop, `prod` holds the finished product for element 0
 * and `load` holds the raw value of element 1.  Each iteration stores the
 * product for element k, multiplies element k+1, and loads element k+2.
 * The last two elements are finished after the loop.
 *
 * The prologue reads b[0] and b[1] unconditionally and the epilogue
 * writes a[CNT-2] and a[CNT-1], so CNT must be at least 2.
 */
static void pipe(void)
{
  double tc = c;              /* local copy of the multiplier */
  double prod = b[0] * tc;    /* prologue: product for element 0 */
  double load = b[1];         /* prologue: raw value of element 1 */
  double *anext = a;
  double *bnext = b+2;        /* next element still to be loaded */
  double *bdone = b+CNT;      /* one past the last element of b[] */
  while (bnext < bdone) {
    *anext++ = prod;          /* store stage */
    prod = load * tc;         /* multiply stage */
    load = *bnext++;          /* load stage */
  }
  /* Epilogue: drain the two elements still in flight. */
  a[CNT-2] = prod;
  a[CNT-1] = load * tc;
}

/*
 * Hand-pipelined kernel unrolled by two: computes a[i] = b[i] * c for
 * every i in [0, CNT), moving a pair of elements through the store,
 * multiply and load stages in each iteration.
 *
 * On entry to the loop, prod0/prod1 hold the finished products for
 * elements 0 and 1, and load0/load1 hold the raw values of elements 2
 * and 3.  The last four elements are finished after the loop.
 *
 * The prologue reads b[0..3], the loop reads bnext[0] and bnext[1] with
 * no remainder handling, and the epilogue writes a[CNT-4..CNT-1]; the
 * code therefore requires CNT to be even and at least 4 (CNT is 256 here).
 */
static void pipe2(void)
{
  double tc = c;               /* local copy of the multiplier */
  double prod0 = b[0] * tc;    /* prologue: products for elements 0, 1 */
  double prod1 = b[1] * tc;
  double load0 = b[2];         /* prologue: raw values of elements 2, 3 */
  double load1 = b[3];
  double *anext = a;
  double *bnext = b+4;         /* next pair still to be loaded */
  double *bdone = b+CNT;       /* one past the last element of b[] */
  while (bnext < bdone) {
    /* Store stage. */
    anext[0] = prod0;
    anext[1] = prod1;
    anext += 2;
    /* Multiply stage. */
    prod0 = load0 * tc;
    prod1 = load1 * tc;
    /* Load stage. */
    load0 = bnext[0];
    load1 = bnext[1];
    bnext+= 2;
  }
  /* Epilogue: drain the four elements still in flight. */
  a[CNT-4] = prod0;
  a[CNT-3] = prod1; 
  a[CNT-2] = load0 * tc;
  a[CNT-1] = load1 * tc;
}

/* Second argument passed to every ftime() call in main(); presumably the
   acceptable measurement error — confirm against ftime.h. */
#define ERRTOL 0.01

int main()
{
  int i;
  float lt, l1t, l2t, l4t, pt, p2t;
  float wt = freq() * (1e6 / CNT);
  for (i = 0; i < CNT; i++) {
    a[i] = (double) i;
  }
  lt = ftime(loop, ERRTOL);
  printf("Loop: %f clocks\n", lt * wt); 
  l1t = ftime(loop1, ERRTOL);
  printf("Loop1: %f clocks\n", l1t * wt); 
  l2t = ftime(loop2, ERRTOL);
  printf("Loop X2: %f clocks\n", l2t * wt); 
  l4t = ftime(loop4, ERRTOL);
  printf("Loop X4: %f clocks\n", l4t * wt); 
  pt = ftime(pipe, ERRTOL);
  printf("Pipe: %f clocks\n", pt * wt); 
  p2t = ftime(pipe2, ERRTOL);
  printf("Pipe2: %f clocks\n", p2t * wt); 
}
