
#include "defs.h"
#include "conversion.h"


/*
 * Symmetrize G in place, handling two values of j per outer step.
 *
 * For every pair j < i < dim, the elements G[RIDX(j,i,dim)] and
 * G[RIDX(i,j,dim)] are both replaced by their bitwise OR.
 *
 * G   - matrix storage, indexed through RIDX (defined outside this file;
 *       presumably the row-major index i*dim+j -- confirm in defs.h)
 * dim - exclusive upper bound for both index coordinates; for dim <= 1
 *       the loops do not execute and G is left untouched
 */
void good_through_unroll2(int *G, int dim)
// 132 msec 0.71
{
  int i, j;
  int adj0, adj1;
  /* For odd dim the last outer step (j == dim-1) has an empty inner loop,
     so index j+1 == dim is never used. */
  for (j = 0; j < dim; j+=2)
    for (i = j+1; i < dim; i++){
      /* Pair (j, i). */
      adj0 = G[RIDX(j,i,dim)] | G[RIDX(i,j,dim)];
      G[RIDX(i,j,dim)] = adj0;
      G[RIDX(j,i,dim)] = adj0;
      /* Pair (j+1, i).  When i == j+1 both indices name the same diagonal
         element; going through a temporary keeps the two stores in separate
         statements instead of one chained assignment to the same object. */
      adj1 = G[RIDX(j+1,i,dim)] | G[RIDX(i,j+1,dim)];
      G[RIDX(i,j+1,dim)] = adj1;
      G[RIDX(j+1,i,dim)] = adj1;
    }

}

/*
 * Symmetrize G in place by walking the lower triangle row by row.
 *
 * G is indexed as a dim-by-dim array with element (r, c) at G[r*dim + c].
 * For every pair c < r, both G[r*dim + c] and G[c*dim + r] are set to the
 * bitwise OR of the two.  Diagonal elements are never touched.
 */
void
good_throughput_triangle(int *G, int dim){
  int row;
  int row_base = dim;              /* index of element (row, 0) */

  for (row = 1; row < dim; row++, row_base += dim) {
    int *lower = G + row_base;     /* steps along row `row`, left to right */
    int *upper = G + row;          /* steps down column `row`, top to bottom */
    int *lower_end = lower + row;  /* stop just before the diagonal */

    while (lower != lower_end) {
      int merged = *upper | *lower;
      *lower = merged;
      *upper = merged;
      lower++;
      upper += dim;
    }
  }
}

/*
 * Symmetrize G in place by walking the lower triangle row by row, two
 * columns per inner step.
 *
 * G is indexed as a dim-by-dim array with element (i, j) at G[i*dim + j].
 * For every pair j < i, both G[i*dim + j] and G[j*dim + i] are set to the
 * bitwise OR of the two.  Diagonal elements are never read or written.
 */
void
good_throughput_triangle2(int *G, int dim){
  int i, j;
  int in;          /* i*dim: index of element (i, 0) */
  int inpj, jnpi;  /* i*dim + j and j*dim + i */
  int adj;
  in = dim;
  for (i = 1; i < dim; i++) {
    inpj = in;
    jnpi = i;
    /* Unroll by two only while both columns j and j+1 are strictly below the
       diagonal, so the second pair can never be the diagonal element
       (i, i) stored to twice within one chained assignment. */
    for (j = 0; j + 1 < i; j+=2) {
      adj = G[jnpi] | G[inpj];
      G[inpj] = adj;
      G[jnpi] = adj;
      adj = G[jnpi+dim] | G[inpj+1];
      G[inpj+1] = adj;
      G[jnpi+dim] = adj;
      inpj+=2;
      jnpi+= 2*dim;
    }
    /* Odd i leaves exactly one pair (i, i-1). */
    if (j < i) {
      adj = G[jnpi] | G[inpj];
      G[inpj] = adj;
      G[jnpi] = adj;
    }
    in += dim;
  }
}
/*
 * Simple unrolling.
 *
 * Symmetrize G in place: for every pair j < i < dim, G[RIDX(j,i,dim)] and
 * G[RIDX(i,j,dim)] are both replaced by their bitwise OR.  The inner loop
 * is unrolled four times, with the loads of the next pair interleaved
 * between the stores of the previous one.
 *
 * G   - matrix storage, indexed through RIDX (defined outside this file;
 *       presumably the row-major index i*dim+j -- confirm in defs.h)
 * dim - exclusive upper bound for both index coordinates; any value is
 *       accepted, it does not have to be a multiple of 4
 */
void
good_throughput_unroll_4(int *G, int dim){
  int i, j;
  for (i = 1; i < dim; i++) {
    /* Take the 4-wide path only while j..j+3 are all below i.  That keeps
       every index below dim; a plain `j < i` bound would let j+3 run past
       dim when dim is not a multiple of 4. */
    for (j = 0; j + 3 < i; j+=4) {
      int adj0, adj1, adj2, adj3;
      adj0 = G[RIDX(j,i,dim)] | G[RIDX(i,j,dim)];
      adj1 = G[RIDX(j+1,i,dim)] | G[RIDX(i,j+1,dim)];
      G[RIDX(j,i,dim)] = G[RIDX(i,j,dim)] = adj0;
      adj2 = G[RIDX(j+2,i,dim)] | G[RIDX(i,j+2,dim)];
      G[RIDX(j+1,i,dim)] = G[RIDX(i,j+1,dim)] = adj1;
      adj3 = G[RIDX(j+3,i,dim)] | G[RIDX(i,j+3,dim)];
      G[RIDX(j+2,i,dim)] = G[RIDX(i,j+2,dim)] = adj2;
      G[RIDX(j+3,i,dim)] = G[RIDX(i,j+3,dim)] = adj3;
    }
    /* Up to three leftover columns below the diagonal. */
    for (; j < i; j++) {
      int adj = G[RIDX(j,i,dim)] | G[RIDX(i,j,dim)];
      G[RIDX(j,i,dim)] = G[RIDX(i,j,dim)] = adj;
    }
  }
}

/*
 * T1(i,j): combine elements [i,j] and [j,i].
 *
 * Expands to a single chained-assignment expression (no trailing semicolon)
 * that ORs G[RIDX(j,i,dim)] with G[RIDX(i,j,dim)] and stores the result in
 * both.  The expansion refers to `G` and `dim` by name, so both must be in
 * scope wherever the macro is used.  Each argument appears four times in the
 * expansion and is handed to RIDX without extra parentheses, so arguments
 * with side effects must be avoided.
 */
#define T1(i,j) \
  G[RIDX(j,i,dim)] = G[RIDX(i,j,dim)] = G[RIDX(j,i,dim)] | G[RIDX(i,j,dim)]



/*
 * T4(i,j): do a 4x4 block.
 *
 * Applies T1 to the 16 index pairs (i..i+3, j..j+3); each line of the
 * expansion fixes one j offset and steps i through its four values.
 * Expands to a brace-enclosed compound statement, so it is written as a
 * statement of its own and needs no trailing semicolon.  Because it is built
 * from T1, `G` and `dim` must be in scope at the point of use.
 */
#define T4(i,j) \
{ T1(i,j); T1(i+1,j); T1(i+2,j); T1(i+3,j); \
  T1(i,j+1); T1(i+1,j+1); T1(i+2,j+1); T1(i+3,j+1); \
  T1(i,j+2); T1(i+1,j+2); T1(i+2,j+2); T1(i+3,j+2); \
  T1(i,j+3); T1(i+1,j+3); T1(i+2,j+3); T1(i+3,j+3); }

/*
 * Symmetrize G in place, working in 8x8 blocks.
 *
 * For every pair j < i < dim, G[RIDX(j,i,dim)] and G[RIDX(i,j,dim)] are both
 * replaced by their bitwise OR.  Blocks on the diagonal are handled with a
 * scalar triangular loop; full 8x8 blocks below the diagonal are handled as
 * four fully unrolled 4x4 tiles (T4).
 *
 * G   - matrix storage, indexed through RIDX (defined outside this file;
 *       presumably the row-major index i*dim+j -- confirm in defs.h)
 * dim - exclusive upper bound for both index coordinates.  When dim is a
 *       multiple of 8 only the unrolled tiles are used below the diagonal;
 *       otherwise the trailing partial block falls back to scalar loops so
 *       that no index reaches dim.
 */
void
good_throughput_block(int *G, int dim){
  int i, j;
  int ii, jj;
  for (ii = 0; ii < dim; ii+= 8) {
    /* Diagonal Blocks */
    /* One past the last row of this block, clamped to dim for a partial
       final block. */
    int iend = (dim - ii < 8) ? dim : ii + 8;
    jj = ii;
    for (i = iend-1; i >= ii+1; i--)
      for (j = jj; j < i; j++) {
        int adj = G[RIDX(j,i,dim)] | G[RIDX(i,j,dim)];
        G[RIDX(j,i,dim)] = G[RIDX(i,j,dim)] = adj;
      }
  }

  /* Other Blocks */
  for (ii = 8; ii < dim; ii+= 8) {
    if (dim - ii >= 8) {
      /* Rows ii..ii+7 all exist: use the unrolled 4x4 tiles.  jj < ii and
         both are multiples of 8, so columns jj..jj+7 always exist too. */
      for (jj = 0; jj < ii; jj+= 8) {
        /* UL */
        T4(ii, jj)
        /* UR */
        T4(ii,jj+4)
        /* LR */
        T4(ii+4,jj+4)
        /* LL */
        T4(ii+4,jj)
      }
    } else {
      /* Partial final strip of rows (dim not a multiple of 8): pair each
         remaining row with every column left of this block. */
      for (i = ii; i < dim; i++)
        for (j = 0; j < ii; j++) {
          int adj = G[RIDX(j,i,dim)] | G[RIDX(i,j,dim)];
          G[RIDX(j,i,dim)] = G[RIDX(i,j,dim)] = adj;
        }
    }
  }
}  

/*
 * Entry point: pick an implementation based on dim.
 *
 * dim == 64 is special-cased to the 8x8-blocked version; every other size
 * goes through the unroll-by-2 version.  G and dim are passed through
 * unchanged.
 */
void
good_throughput(int *G, int dim){
  if (dim != 64) {
    good_through_unroll2(G, dim);
    return;
  }
  good_throughput_block(G, dim);
}
