// #include "transpose.h"
// #include "cache.h"
#include "defs.h"
#include "conversion.h"
#include<stdio.h>

/*
 * Team identification record. team_t is declared outside this file; the
 * per-field comments below describe what each initializer is meant to hold.
 */
team_t team = {
    /* Team name to be displayed on webpage */
    "Bianca",
    /* First member full name */
    "Bianca Schroeder",
    /* First member email address */
    "bianca@cs.cmu.edu",
    /* Second member full name (leave blank if none) */
    "",
    /* Second member email address (blank if none) */
    ""
};

/*
 * naive_cache - baseline in-place conversion of G.
 *
 * For every ordered index pair (i, j) with 0 <= i, j < dim, the element at
 * RIDX(j,i,dim) is replaced by the logical OR of itself and the element at
 * RIDX(i,j,dim); every stored result is therefore 0 or 1.
 *
 * The return type is now spelled out as void. The original definition
 * omitted it (relying on "implicit int", which is not valid since C99) and
 * never returned a value.
 */
void
naive_cache(int *G, int dim){
  int i, j;
  for (i = 0; i < dim; i++)
    for (j = 0; j < dim; j++)
      G[RIDX(j,i,dim)] = G[RIDX(j,i,dim)] || G[RIDX(i,j,dim)];
}

/*
 * good_cache_triangle - convert G in place, visiting each unordered index
 * pair once.
 *
 * For every (row, col) with col < row, the bitwise OR of the elements at
 * RIDX(col,row,dim) and RIDX(row,col,dim) is stored into both locations.
 * Pairs with row == col are never visited.
 */
void
good_cache_triangle(int *G, int dim){
  for (int row = 0; row < dim; row++) {
    for (int col = 0; col < row; col++) {
      const int merged = G[RIDX(col,row,dim)] | G[RIDX(row,col,dim)];
      G[RIDX(row,col,dim)] = merged;
      G[RIDX(col,row,dim)] = merged;
    }
  }
}

/*
 * good_cache_b8 - triangular in-place conversion of G, processed in 8x8
 * blocks.
 *
 * For each band of 8 consecutive first indices, the diagonal block is
 * handled first (pairs with col < row only), followed by every full block
 * whose column base lies below the band base. Each visited pair gets the
 * bitwise OR of its two mirrored elements written to both locations.
 *
 * Indices up to band+7 are used without checking them against dim, so dim
 * is expected to be a multiple of 8.
 */
void
good_cache_b8(int *G, int dim){
  enum { BLOCK = 8 };

  for (int band = 0; band < dim; band += BLOCK) {
    const int band_end = band + BLOCK;

    /* Diagonal block: strictly-below-diagonal pairs only */
    for (int row = band; row < band_end; row++) {
      for (int col = band; col < row; col++) {
        const int merged = G[RIDX(col,row,dim)] | G[RIDX(row,col,dim)];
        G[RIDX(row,col,dim)] = merged;
        G[RIDX(col,row,dim)] = merged;
      }
    }

    /* Remaining blocks of this band, column bases 0 .. band-8 */
    for (int cbase = 0; cbase < band; cbase += BLOCK) {
      const int cend = cbase + BLOCK;
      for (int row = band; row < band_end; row++) {
        for (int col = cbase; col < cend; col++) {
          const int merged = G[RIDX(col,row,dim)] | G[RIDX(row,col,dim)];
          G[RIDX(row,col,dim)] = merged;
          G[RIDX(col,row,dim)] = merged;
        }
      }
    }
  }
}

/*
 * good_cache_b4 - triangular in-place conversion of G, processed in 4x4
 * blocks.
 *
 * Same scheme as the 8x8 variant with a block edge of 4: per band, the
 * diagonal block (pairs with col < row) and then every full block with a
 * smaller column base. Each visited pair gets the bitwise OR of its two
 * mirrored elements written to both locations.
 *
 * Indices up to band+3 are used without checking them against dim, so dim
 * is expected to be a multiple of 4.
 */
void
good_cache_b4(int *G, int dim){
  enum { BLOCK = 4 };

  for (int band = 0; band < dim; band += BLOCK) {
    const int band_end = band + BLOCK;

    /* Diagonal block: strictly-below-diagonal pairs only */
    for (int row = band; row < band_end; row++) {
      for (int col = band; col < row; col++) {
        const int merged = G[RIDX(col,row,dim)] | G[RIDX(row,col,dim)];
        G[RIDX(row,col,dim)] = merged;
        G[RIDX(col,row,dim)] = merged;
      }
    }

    /* Remaining blocks of this band, column bases 0 .. band-4 */
    for (int cbase = 0; cbase < band; cbase += BLOCK) {
      const int cend = cbase + BLOCK;
      for (int row = band; row < band_end; row++) {
        for (int col = cbase; col < cend; col++) {
          const int merged = G[RIDX(col,row,dim)] | G[RIDX(row,col,dim)];
          G[RIDX(row,col,dim)] = merged;
          G[RIDX(col,row,dim)] = merged;
        }
      }
    }
  }
}

/*
 * hybrid_block - pick a blocked conversion by problem size:
 * 8x8 blocks when dim < 1024, 4x4 blocks otherwise.
 */
void
hybrid_block(int *G, int dim){
  if (dim >= 1024) {
    good_cache_b4(G, dim);
    return;
  }
  good_cache_b8(G, dim);
}

/*
 * good_cache_8b - 8x8 blocked in-place conversion of G that stages each
 * block in local buffers before writing results back.
 *
 * Diagonal blocks: the whole 8x8 block is copied into `near`, then for
 * every pair with c < r the bitwise OR of the two buffered mirror entries
 * is written to both locations in G.
 * Other blocks (column base below the band base): the block and its mirror
 * block are buffered in `near` and `far`, then all 64 pairs are OR-ed and
 * written back to both locations.
 *
 * Indices up to base+7 are used without checking them against dim, so dim
 * is expected to be a multiple of 8.
 */
void
good_cache_8b(int *G, int dim){
  enum { BLOCK = 8 };
  int near[BLOCK][BLOCK];
  int far[BLOCK][BLOCK];

  for (int band = 0; band < dim; band += BLOCK) {
    /* Diagonal block: buffer it, then merge the strictly-lower pairs */
    for (int r = 0; r < BLOCK; r++) {
      for (int c = 0; c < BLOCK; c++) {
        const int row = band + r;
        const int col = band + c;
        near[r][c] = G[RIDX(row,col,dim)];
      }
    }
    for (int r = 0; r < BLOCK; r++) {
      for (int c = 0; c < r; c++) {
        const int merged = near[r][c] | near[c][r];
        const int row = band + r;
        const int col = band + c;
        G[RIDX(row,col,dim)] = merged;
        G[RIDX(col,row,dim)] = merged;
      }
    }

    /* Off-diagonal blocks of this band */
    for (int cbase = 0; cbase < band; cbase += BLOCK) {
      for (int r = 0; r < BLOCK; r++) {
        for (int c = 0; c < BLOCK; c++) {
          const int row = band + r;
          const int col = cbase + c;
          near[r][c] = G[RIDX(row,col,dim)];
          far[c][r] = G[RIDX(col,row,dim)];
        }
      }
      for (int r = 0; r < BLOCK; r++) {
        for (int c = 0; c < BLOCK; c++) {
          const int merged = near[r][c] | far[c][r];
          const int row = band + r;
          const int col = cbase + c;
          G[RIDX(row,col,dim)] = merged;
          G[RIDX(col,row,dim)] = merged;
        }
      }
    }
  }
}

void
hybrid_cache(int *G, int dim){
    if (dim < 1024)
      good_cache_b8(G, dim);
    else
      good_cache_b4(G, dim);
}


/*
 * good_cache_randy - 8x8 blocked in-place conversion of G with each
 * off-diagonal block split into four 4x4 quadrants.
 *
 * Pass 1 handles all diagonal blocks: rows are taken from base+7 down to
 * base+1 and, for each, the pairs with col < row are merged.
 * Pass 2 handles every block whose column base is below its row base,
 * visiting its quadrants in the order UL, UR, LR, LL.
 * "Merged" means the bitwise OR of the two mirrored elements is stored to
 * both locations.
 *
 * Indices up to base+7 are used without checking them against dim, so dim
 * is expected to be a multiple of 8.
 */
void
good_cache_randy(int *G, int dim){
  /* (row offset, column offset) of each quadrant, in visiting order */
  static const int quadrant[4][2] = {
    {0, 0},   /* UL */
    {0, 4},   /* UR */
    {4, 4},   /* LR */
    {4, 0}    /* LL */
  };

  /* Diagonal blocks */
  for (int base = 0; base < dim; base += 8) {
    for (int row = base + 7; row > base; row--) {
      for (int col = base; col < row; col++) {
        const int merged = G[RIDX(col,row,dim)] | G[RIDX(row,col,dim)];
        G[RIDX(row,col,dim)] = merged;
        G[RIDX(col,row,dim)] = merged;
      }
    }
  }

  /* Other blocks */
  for (int rbase = 8; rbase < dim; rbase += 8) {
    for (int cbase = 0; cbase < rbase; cbase += 8) {
      for (int q = 0; q < 4; q++) {
        const int r0 = rbase + quadrant[q][0];
        const int c0 = cbase + quadrant[q][1];
        const int r_end = r0 + 4;
        const int c_end = c0 + 4;
        for (int row = r0; row < r_end; row++) {
          for (int col = c0; col < c_end; col++) {
            const int merged = G[RIDX(col,row,dim)] | G[RIDX(row,col,dim)];
            G[RIDX(row,col,dim)] = merged;
            G[RIDX(col,row,dim)] = merged;
          }
        }
      }
    }
  }
}
/*
 * good_cache_randy_2 - same scheme as good_cache_randy, but the quadrants
 * of each off-diagonal 8x8 block are visited in the order UL, UR, LL, LR.
 *
 * Pass 1 handles all diagonal blocks: rows are taken from base+7 down to
 * base+1 and, for each, the pairs with col < row are merged.
 * Pass 2 handles every block whose column base is below its row base.
 * "Merged" means the bitwise OR of the two mirrored elements is stored to
 * both locations.
 *
 * Indices up to base+7 are used without checking them against dim, so dim
 * is expected to be a multiple of 8.
 */
void
good_cache_randy_2(int *G, int dim){
  /* (row offset, column offset) of each quadrant, in visiting order */
  static const int quadrant[4][2] = {
    {0, 0},   /* UL */
    {0, 4},   /* UR */
    {4, 0},   /* LL */
    {4, 4}    /* LR */
  };

  /* Diagonal blocks */
  for (int base = 0; base < dim; base += 8) {
    for (int row = base + 7; row > base; row--) {
      for (int col = base; col < row; col++) {
        const int merged = G[RIDX(col,row,dim)] | G[RIDX(row,col,dim)];
        G[RIDX(row,col,dim)] = merged;
        G[RIDX(col,row,dim)] = merged;
      }
    }
  }

  /* Other blocks */
  for (int rbase = 8; rbase < dim; rbase += 8) {
    for (int cbase = 0; cbase < rbase; cbase += 8) {
      for (int q = 0; q < 4; q++) {
        const int r0 = rbase + quadrant[q][0];
        const int c0 = cbase + quadrant[q][1];
        const int r_end = r0 + 4;
        const int c_end = c0 + 4;
        for (int row = r0; row < r_end; row++) {
          for (int col = c0; col < c_end; col++) {
            const int merged = G[RIDX(col,row,dim)] | G[RIDX(row,col,dim)];
            G[RIDX(row,col,dim)] = merged;
            G[RIDX(col,row,dim)] = merged;
          }
        }
      }
    }
  }
}



/*
 * hybrid_cache_2 - use the plain triangular conversion for dim <= 64 and
 * the quadrant-split blocked conversion (good_cache_randy) above that.
 */
void
hybrid_cache_2(int *G, int dim){
    if (dim > 64) {
      good_cache_randy(G, dim);
      return;
    }
    good_cache_triangle(G, dim);
}

/*
 * row_convert - full scan, outer loop over i, inner loop over j.
 * Each element at RIDX(j,i,dim) becomes the logical OR (0 or 1) of itself
 * and the element at RIDX(i,j,dim).
 * Original measurement note: 230 msec, .624
 */
void row_convert(int *G, int dim)
{
  for (int outer = 0; outer < dim; outer++) {
    for (int inner = 0; inner < dim; inner++) {
      G[RIDX(inner,outer,dim)] =
          (G[RIDX(inner,outer,dim)] != 0) || (G[RIDX(outer,inner,dim)] != 0);
    }
  }
}

/*
 * row_convert_arith - full scan, outer loop over i, inner loop over j,
 * using bitwise OR: the element at RIDX(j,i,dim) is OR-ed with the element
 * at RIDX(i,j,dim).
 * Original measurement note: 230 msec, .624
 */
void row_convert_arith(int *G, int dim)
{
  for (int outer = 0; outer < dim; outer++) {
    for (int inner = 0; inner < dim; inner++) {
      G[RIDX(inner,outer,dim)] |= G[RIDX(outer,inner,dim)];
    }
  }
}



/*
 * col_convert - full scan, outer loop over j, inner loop over i.
 * Each element at RIDX(j,i,dim) becomes the logical OR (0 or 1) of the
 * element at RIDX(i,j,dim) and itself.
 * Original measurement notes: 263 msec, .71; 1,046,223 references
 */
void col_convert(int *G, int dim)
{
  for (int outer = 0; outer < dim; outer++) {
    for (int inner = 0; inner < dim; inner++) {
      G[RIDX(outer,inner,dim)] =
          (G[RIDX(inner,outer,dim)] != 0) || (G[RIDX(outer,inner,dim)] != 0);
    }
  }
}

/*
 * col_convert_arith - full scan, outer loop over j, inner loop over i,
 * using bitwise OR: the element at RIDX(j,i,dim) is OR-ed with the element
 * at RIDX(i,j,dim).
 * Original measurement notes: 263 msec, .71; 1,046,223 references
 */
void col_convert_arith(int *G, int dim)
{
  for (int outer = 0; outer < dim; outer++) {
    for (int inner = 0; inner < dim; inner++) {
      G[RIDX(outer,inner,dim)] |= G[RIDX(inner,outer,dim)];
    }
  }
}


/*
 * col_convert_2 - triangular scan: for each j, only i > j is visited, and
 * the logical OR (0 or 1) of the two mirrored elements is stored to both
 * RIDX(i,j,dim) and RIDX(j,i,dim).
 * Original measurement notes: 126 msec, 0.718; 525,086
 */
void col_convert_2(int *G, int dim)
{
  for (int lo = 0; lo < dim; lo++) {
    for (int hi = lo + 1; hi < dim; hi++) {
      const int linked = G[RIDX(lo,hi,dim)] || G[RIDX(hi,lo,dim)];
      G[RIDX(hi,lo,dim)] = linked;
      G[RIDX(lo,hi,dim)] = linked;
    }
  }
}

/*
 * col_convert_2_arith - triangular scan: for each j, only i > j is
 * visited, and the bitwise OR of the two mirrored elements is stored to
 * both RIDX(i,j,dim) and RIDX(j,i,dim).
 * Original measurement notes: 126 msec, 0.718; 525,086
 */
void col_convert_2_arith(int *G, int dim)
{
  for (int lo = 0; lo < dim; lo++) {
    for (int hi = lo + 1; hi < dim; hi++) {
      const int linked = G[RIDX(lo,hi,dim)] | G[RIDX(hi,lo,dim)];
      G[RIDX(hi,lo,dim)] = linked;
      G[RIDX(lo,hi,dim)] = linked;
    }
  }
}



/*
 * col_convert_unroll_2 - full scan over i with two j values (j, j+1)
 * handled per inner iteration. For each pair the logical OR (0 or 1) of
 * the two mirrored elements is stored to both locations.
 * j+1 is used without checking it against dim, so dim is expected to be
 * even.
 * Original measurement note: 132 msec 0.71
 */
void col_convert_unroll_2(int *G, int dim)
{
  for (int c0 = 0; c0 < dim; c0 += 2) {
    const int c1 = c0 + 1;
    for (int r = 0; r < dim; r++) {
      int linked;

      linked = G[RIDX(c0,r,dim)] || G[RIDX(r,c0,dim)];
      G[RIDX(r,c0,dim)] = linked;
      G[RIDX(c0,r,dim)] = linked;

      linked = G[RIDX(c1,r,dim)] || G[RIDX(r,c1,dim)];
      G[RIDX(r,c1,dim)] = linked;
      G[RIDX(c1,r,dim)] = linked;
    }
  }
}

/*
 * col_convert_unroll_2_arith - full scan over i with two j values
 * (j, j+1) handled per inner iteration. For each pair the bitwise OR of
 * the two mirrored elements is stored to both locations.
 * j+1 is used without checking it against dim, so dim is expected to be
 * even.
 * Original measurement note: 132 msec 0.71
 */
void col_convert_unroll_2_arith(int *G, int dim)
{
  for (int c0 = 0; c0 < dim; c0 += 2) {
    const int c1 = c0 + 1;
    for (int r = 0; r < dim; r++) {
      int linked;

      linked = G[RIDX(c0,r,dim)] | G[RIDX(r,c0,dim)];
      G[RIDX(r,c0,dim)] = linked;
      G[RIDX(c0,r,dim)] = linked;

      linked = G[RIDX(c1,r,dim)] | G[RIDX(r,c1,dim)];
      G[RIDX(r,c1,dim)] = linked;
      G[RIDX(c1,r,dim)] = linked;
    }
  }
}



/*
 * col_convert_2_unroll_2 - triangular scan (i starts at j+1) with two j
 * values (j, j+1) handled per inner iteration. For each pair the logical
 * OR (0 or 1) of the two mirrored elements is stored to both locations.
 * j+1 is used without checking it against dim, so dim is expected to be
 * even.
 * Original measurement note: 81 msec, 0.8126
 */
void col_convert_2_unroll_2(int *G, int dim)
{
  for (int c0 = 0; c0 < dim; c0 += 2) {
    const int c1 = c0 + 1;
    for (int r = c0 + 1; r < dim; r++) {
      int linked;

      linked = G[RIDX(c0,r,dim)] || G[RIDX(r,c0,dim)];
      G[RIDX(r,c0,dim)] = linked;
      G[RIDX(c0,r,dim)] = linked;

      linked = G[RIDX(c1,r,dim)] || G[RIDX(r,c1,dim)];
      G[RIDX(r,c1,dim)] = linked;
      G[RIDX(c1,r,dim)] = linked;
    }
  }
}

/*
 * col_convert_2_unroll_2_arith - triangular scan (i starts at j+1) with
 * two j values (j, j+1) handled per inner iteration. For each pair the
 * bitwise OR of the two mirrored elements is stored to both locations.
 * j+1 is used without checking it against dim, so dim is expected to be
 * even.
 * Original measurement note: 81 msec, 0.8126
 */
void col_convert_2_unroll_2_arith(int *G, int dim)
{
  for (int c0 = 0; c0 < dim; c0 += 2) {
    const int c1 = c0 + 1;
    for (int r = c0 + 1; r < dim; r++) {
      int linked;

      linked = G[RIDX(c0,r,dim)] | G[RIDX(r,c0,dim)];
      G[RIDX(r,c0,dim)] = linked;
      G[RIDX(c0,r,dim)] = linked;

      linked = G[RIDX(c1,r,dim)] | G[RIDX(r,c1,dim)];
      G[RIDX(r,c1,dim)] = linked;
      G[RIDX(c1,r,dim)] = linked;
    }
  }
}




/*
 * col_convert_unroll_4 - full scan over i with four j values (j .. j+3)
 * handled per inner iteration. Each element at RIDX(j+k,i,dim) becomes the
 * logical OR (0 or 1) of the element at RIDX(i,j+k,dim) and itself.
 * j+3 is used without checking it against dim, so dim is expected to be a
 * multiple of 4.
 * Original measurement notes: 65 msec, factor 4 faster than col_convert,
 * hit ratio .71; much smaller total number of references;
 * total number of references: 261,873
 */
void col_convert_unroll_4(int *G, int dim)
{
  for (int c0 = 0; c0 < dim; c0 += 4) {
    const int c1 = c0 + 1;
    const int c2 = c0 + 2;
    const int c3 = c0 + 3;
    for (int r = 0; r < dim; r++) {
      G[RIDX(c0,r,dim)] = (G[RIDX(r,c0,dim)] != 0) || (G[RIDX(c0,r,dim)] != 0);
      G[RIDX(c1,r,dim)] = (G[RIDX(r,c1,dim)] != 0) || (G[RIDX(c1,r,dim)] != 0);
      G[RIDX(c2,r,dim)] = (G[RIDX(r,c2,dim)] != 0) || (G[RIDX(c2,r,dim)] != 0);
      G[RIDX(c3,r,dim)] = (G[RIDX(r,c3,dim)] != 0) || (G[RIDX(c3,r,dim)] != 0);
    }
  }
}

/*
 * col_convert_unroll_4_arith - full scan over i with four j values
 * (j .. j+3) handled per inner iteration, using bitwise OR: the element at
 * RIDX(j+k,i,dim) is OR-ed with the element at RIDX(i,j+k,dim).
 * j+3 is used without checking it against dim, so dim is expected to be a
 * multiple of 4.
 * Original note: best throughput
 */
void col_convert_unroll_4_arith(int *G, int dim)
{
  for (int c0 = 0; c0 < dim; c0 += 4) {
    const int c1 = c0 + 1;
    const int c2 = c0 + 2;
    const int c3 = c0 + 3;
    for (int r = 0; r < dim; r++) {
      G[RIDX(c0,r,dim)] |= G[RIDX(r,c0,dim)];
      G[RIDX(c1,r,dim)] |= G[RIDX(r,c1,dim)];
      G[RIDX(c2,r,dim)] |= G[RIDX(r,c2,dim)];
      G[RIDX(c3,r,dim)] |= G[RIDX(r,c3,dim)];
    }
  }
}





/*
 * col_convert_2_unroll_4 - triangular scan with four j values (j .. j+3)
 * handled per inner iteration.
 *
 * For each j (step 4) and each i from j+1 up to dim-1, the logical OR
 * (0 or 1) of the elements at RIDX(j+k,i,dim) and RIDX(i,j+k,dim) is
 * stored to both locations, for k = 0..3. Because i starts at j+1 for all
 * four k values, a few pairs near the diagonal are processed more than
 * once and the elements with i == j+k (k = 1..3) are normalized to 0/1.
 *
 * j+3 is used without checking it against dim, so dim is expected to be a
 * multiple of 4.
 *
 * The unused local `sum` of the original has been removed.
 *
 * Original measurement notes: 165 msec, 0.948;
 * total number of references: 957,974
 */
void col_convert_2_unroll_4(int *G, int dim)
{
  int i, j;
  for (j = 0; j < dim; j+=4)
    for (i = j+1; i < dim; i++) {
      G[RIDX(i,j,dim)] = (G[RIDX(j,i,dim)] = G[RIDX(j,i,dim)] || G[RIDX(i,j,dim)]);
      G[RIDX(i,j+1,dim)] = (G[RIDX(j+1,i,dim)] = G[RIDX(j+1,i,dim)] || G[RIDX(i,j+1,dim)]);
      G[RIDX(i,j+2,dim)] = (G[RIDX(j+2,i,dim)] = G[RIDX(j+2,i,dim)] || G[RIDX(i,j+2,dim)]);
      G[RIDX(i,j+3,dim)] = (G[RIDX(j+3,i,dim)] = G[RIDX(j+3,i,dim)] || G[RIDX(i,j+3,dim)]);
    }
}

/*
 * col_convert_2_unroll_4_arith - triangular scan with four j values
 * (j .. j+3) handled per inner iteration, using bitwise OR.
 *
 * For each j (step 4) and each i from j+1 up to dim-1, the bitwise OR of
 * the elements at RIDX(j+k,i,dim) and RIDX(i,j+k,dim) is stored to both
 * locations, for k = 0..3. Because i starts at j+1 for all four k values,
 * a few pairs near the diagonal are processed more than once.
 *
 * j+3 is used without checking it against dim, so dim is expected to be a
 * multiple of 4.
 *
 * The unused local `sum` of the original has been removed.
 *
 * Original measurement notes: 165 msec, 0.948;
 * total number of references: 957,974
 */
void col_convert_2_unroll_4_arith(int *G, int dim)
{
  int i, j;
  for (j = 0; j < dim; j+=4)
    for (i = j+1; i < dim; i++) {
      G[RIDX(i,j,dim)] = (G[RIDX(j,i,dim)] = G[RIDX(j,i,dim)] | G[RIDX(i,j,dim)]);
      G[RIDX(i,j+1,dim)] = (G[RIDX(j+1,i,dim)] = G[RIDX(j+1,i,dim)] | G[RIDX(i,j+1,dim)]);
      G[RIDX(i,j+2,dim)] = (G[RIDX(j+2,i,dim)] = G[RIDX(j+2,i,dim)] | G[RIDX(i,j+2,dim)]);
      G[RIDX(i,j+3,dim)] = (G[RIDX(j+3,i,dim)] = G[RIDX(j+3,i,dim)] | G[RIDX(i,j+3,dim)]);
    }
}



/*
 * col_convert_unroll_8 - full scan over i with eight j values (j .. j+7)
 * handled per inner iteration.
 *
 * Each element at RIDX(i,j+k,dim) becomes the logical OR (0 or 1) of the
 * element at RIDX(j+k,i,dim) and itself, for k = 0..7.
 *
 * j+7 is used without checking it against dim, so dim is expected to be a
 * multiple of 8.
 *
 * The unused local `sum` of the original has been removed.
 *
 * Original measurement note: 357. msec 0.9656
 */
void col_convert_unroll_8(int *G, int dim)
{
  int i, j;
  for (j = 0; j < dim; j+=8)
    for (i = 0; i < dim; i++) {
      G[RIDX(i,j,dim)] = G[RIDX(j,i,dim)] || G[RIDX(i,j,dim)];
      G[RIDX(i,j+1,dim)] = G[RIDX(j+1,i,dim)] || G[RIDX(i,j+1,dim)];
      G[RIDX(i,j+2,dim)] = G[RIDX(j+2,i,dim)] || G[RIDX(i,j+2,dim)];
      G[RIDX(i,j+3,dim)] = G[RIDX(j+3,i,dim)] || G[RIDX(i,j+3,dim)];
      G[RIDX(i,j+4,dim)] = G[RIDX(j+4,i,dim)] || G[RIDX(i,j+4,dim)];
      G[RIDX(i,j+5,dim)] = G[RIDX(j+5,i,dim)] || G[RIDX(i,j+5,dim)];
      G[RIDX(i,j+6,dim)] = G[RIDX(j+6,i,dim)] || G[RIDX(i,j+6,dim)];
      G[RIDX(i,j+7,dim)] = G[RIDX(j+7,i,dim)] || G[RIDX(i,j+7,dim)];
    }
}

/*
 * col_convert_unroll_8_arith - full scan over i with eight j values
 * (j .. j+7) handled per inner iteration, using bitwise OR.
 *
 * Each element at RIDX(i,j+k,dim) becomes the bitwise OR of the element at
 * RIDX(j+k,i,dim) and itself, for k = 0..7.
 *
 * j+7 is used without checking it against dim, so dim is expected to be a
 * multiple of 8.
 *
 * The unused local `sum` of the original has been removed.
 *
 * Original measurement note: 357. msec 0.9656
 */
void col_convert_unroll_8_arith(int *G, int dim)
{
  int i, j;
  for (j = 0; j < dim; j+=8)
    for (i = 0; i < dim; i++) {
      G[RIDX(i,j,dim)] = G[RIDX(j,i,dim)] | G[RIDX(i,j,dim)];
      G[RIDX(i,j+1,dim)] = G[RIDX(j+1,i,dim)] | G[RIDX(i,j+1,dim)];
      G[RIDX(i,j+2,dim)] = G[RIDX(j+2,i,dim)] | G[RIDX(i,j+2,dim)];
      G[RIDX(i,j+3,dim)] = G[RIDX(j+3,i,dim)] | G[RIDX(i,j+3,dim)];
      G[RIDX(i,j+4,dim)] = G[RIDX(j+4,i,dim)] | G[RIDX(i,j+4,dim)];
      G[RIDX(i,j+5,dim)] = G[RIDX(j+5,i,dim)] | G[RIDX(i,j+5,dim)];
      G[RIDX(i,j+6,dim)] = G[RIDX(j+6,i,dim)] | G[RIDX(i,j+6,dim)];
      G[RIDX(i,j+7,dim)] = G[RIDX(j+7,i,dim)] | G[RIDX(i,j+7,dim)];
    }
}




/*
 * col_convert_2_unroll_8 - triangular scan with eight j values (j .. j+7)
 * handled per inner iteration.
 *
 * For each j (step 8) and each i from j+1 up to dim-1, the logical OR
 * (0 or 1) of the elements at RIDX(j+k,i,dim) and RIDX(i,j+k,dim) is
 * stored to both locations, for k = 0..7. Because i starts at j+1 for all
 * eight k values, some pairs near the diagonal are processed more than
 * once and the elements with i == j+k (k = 1..7) are normalized to 0/1.
 *
 * j+7 is used without checking it against dim, so dim is expected to be a
 * multiple of 8.
 *
 * The unused local `sum` of the original has been removed.
 *
 * Original measurement note: 167 msec, 0.94
 */
void col_convert_2_unroll_8(int *G, int dim)
{
  int i, j;
  for (j = 0; j < dim; j+=8)
    for (i = j+1; i < dim; i++) {
      G[RIDX(i,j,dim)] = (G[RIDX(j,i,dim)] = G[RIDX(j,i,dim)] || G[RIDX(i,j,dim)]);
      G[RIDX(i,j+1,dim)] = (G[RIDX(j+1,i,dim)] = G[RIDX(j+1,i,dim)] || G[RIDX(i,j+1,dim)]);
      G[RIDX(i,j+2,dim)] = (G[RIDX(j+2,i,dim)] = G[RIDX(j+2,i,dim)] || G[RIDX(i,j+2,dim)]);
      G[RIDX(i,j+3,dim)] = (G[RIDX(j+3,i,dim)] = G[RIDX(j+3,i,dim)] || G[RIDX(i,j+3,dim)]);
      G[RIDX(i,j+4,dim)] = (G[RIDX(j+4,i,dim)] = G[RIDX(j+4,i,dim)] || G[RIDX(i,j+4,dim)]);
      G[RIDX(i,j+5,dim)] = (G[RIDX(j+5,i,dim)] = G[RIDX(j+5,i,dim)] || G[RIDX(i,j+5,dim)]);
      G[RIDX(i,j+6,dim)] = (G[RIDX(j+6,i,dim)] = G[RIDX(j+6,i,dim)] || G[RIDX(i,j+6,dim)]);
      G[RIDX(i,j+7,dim)] = (G[RIDX(j+7,i,dim)] = G[RIDX(j+7,i,dim)] || G[RIDX(i,j+7,dim)]);
    }
}

/*
 * col_convert_2_unroll_8_arith - triangular scan with eight j values
 * (j .. j+7) handled per inner iteration, using bitwise OR.
 *
 * For each j (step 8) and each i from j+1 up to dim-1, the bitwise OR of
 * the elements at RIDX(j+k,i,dim) and RIDX(i,j+k,dim) is stored to both
 * locations, for k = 0..7. Because i starts at j+1 for all eight k values,
 * some pairs near the diagonal are processed more than once.
 *
 * j+7 is used without checking it against dim, so dim is expected to be a
 * multiple of 8.
 *
 * The unused local `sum` of the original has been removed.
 *
 * Original measurement note: 167 msec, 0.94
 */
void col_convert_2_unroll_8_arith(int *G, int dim)
{
  int i, j;
  for (j = 0; j < dim; j+=8)
    for (i = j+1; i < dim; i++) {
      G[RIDX(i,j,dim)] = (G[RIDX(j,i,dim)] = G[RIDX(j,i,dim)] | G[RIDX(i,j,dim)]);
      G[RIDX(i,j+1,dim)] = (G[RIDX(j+1,i,dim)] = G[RIDX(j+1,i,dim)] | G[RIDX(i,j+1,dim)]);
      G[RIDX(i,j+2,dim)] = (G[RIDX(j+2,i,dim)] = G[RIDX(j+2,i,dim)] | G[RIDX(i,j+2,dim)]);
      G[RIDX(i,j+3,dim)] = (G[RIDX(j+3,i,dim)] = G[RIDX(j+3,i,dim)] | G[RIDX(i,j+3,dim)]);
      G[RIDX(i,j+4,dim)] = (G[RIDX(j+4,i,dim)] = G[RIDX(j+4,i,dim)] | G[RIDX(i,j+4,dim)]);
      G[RIDX(i,j+5,dim)] = (G[RIDX(j+5,i,dim)] = G[RIDX(j+5,i,dim)] | G[RIDX(i,j+5,dim)]);
      G[RIDX(i,j+6,dim)] = (G[RIDX(j+6,i,dim)] = G[RIDX(j+6,i,dim)] | G[RIDX(i,j+6,dim)]);
      G[RIDX(i,j+7,dim)] = (G[RIDX(j+7,i,dim)] = G[RIDX(j+7,i,dim)] | G[RIDX(i,j+7,dim)]);
    }
}





/*
 * Row-wise scan, performed in place on dest: outer loop over i, inner loop
 * over j. Despite the "transpose" name, each element at RIDX(j,i,dim) is
 * set to the logical OR (0 or 1) of the element at RIDX(i,j,dim) and
 * itself; nothing is copied from a separate source matrix.
 */
char row_descr[] = "Row-wise scan";
void row_transpose(int *dest, int dim)
{
  for (int outer = 0; outer < dim; outer++) {
    for (int inner = 0; inner < dim; inner++) {
      dest[RIDX(inner,outer,dim)] =
          (dest[RIDX(outer,inner,dim)] != 0) || (dest[RIDX(inner,outer,dim)] != 0);
    }
  }
}


char col_descr[] = "Column-wise scan";
/*
 * Matrix transpose using a column-wise scan of the source: outer loop over
 * j, inner loop over i, with dest at RIDX(j,i,dim) receiving src at
 * RIDX(i,j,dim).
 */
void col_transpose(int *dest, int *src, int dim)
{
  for (int col = 0; col < dim; col++) {
    for (int row = 0; row < dim; row++) {
      dest[RIDX(col,row,dim)] = src[RIDX(row,col,dim)];
    }
  }
}



char col4_descr[] = "4-wide Column-wise scan";
/*
 * Matrix transpose using a column-wise scan of the source, four j values
 * (j .. j+3) per inner iteration. Each element is moved with the COPY
 * macro, which is defined outside this file.
 * j+3 is used without checking it against dim, so dim is expected to be a
 * multiple of 4.
 */
void col4_transpose(int *dest, int *src, int dim)
{
  for (int col = 0; col < dim; col += 4) {
    for (int row = 0; row < dim; row++) {
      COPY(&dest[RIDX(col,row,dim)], &src[RIDX(row,col,dim)]);
      COPY(&dest[RIDX(col+1,row,dim)], &src[RIDX(row,col+1,dim)]);
      COPY(&dest[RIDX(col+2,row,dim)], &src[RIDX(row,col+2,dim)]);
      COPY(&dest[RIDX(col+3,row,dim)], &src[RIDX(row,col+3,dim)]);
    }
  }
}




/* Cache optimizations */
/* Transpose 4x4 array */

/*
 * c4 - in-place conversion of the 4x4 sub-array that starts at G, with
 * dim passed through to RIDX. Each element at RIDX(j,i,dim) becomes the
 * logical OR (0 or 1) of the element at RIDX(i,j,dim) and itself.
 */
void c4(int *G, int dim)
{
  for (int row = 0; row < 4; row++) {
    for (int col = 0; col < 4; col++) {
      G[RIDX(col,row,dim)] =
          (G[RIDX(row,col,dim)] != 0) || (G[RIDX(col,row,dim)] != 0);
    }
  }
}


/*
 * t4 - combine a 4x4 source tile into a 4x4 destination tile: dest at
 * RIDX(j,i,dim) becomes the logical OR (0 or 1) of src at RIDX(i,j,dim)
 * and its own previous value. dim is passed through to RIDX.
 */
void t4(int *dest, int *src, int dim)
{
  for (int row = 0; row < 4; row++) {
    for (int col = 0; col < 4; col++) {
      dest[RIDX(col,row,dim)] =
          (src[RIDX(row,col,dim)] != 0) || (dest[RIDX(col,row,dim)] != 0);
    }
  }
}

/*
 * t4_arith - combine a 4x4 source tile into a 4x4 destination tile using
 * bitwise OR: dest at RIDX(j,i,dim) is OR-ed with src at RIDX(i,j,dim).
 * dim is passed through to RIDX.
 */
void t4_arith(int *dest, int *src, int dim)
{
  for (int row = 0; row < 4; row++) {
    for (int col = 0; col < 4; col++) {
      dest[RIDX(col,row,dim)] |= src[RIDX(row,col,dim)];
    }
  }
}


/*
 * c4o - in-place conversion of the 4x4 sub-array at G with a rotated
 * column order: for row i the columns are visited as i+1, i+2, i+3, i
 * (mod 4), so the i == column element of each row comes last. Each element
 * at RIDX(col,i,dim) becomes the logical OR (0 or 1) of the element at
 * RIDX(i,col,dim) and itself.
 */
void c4o(int *G, int dim)
{
  for (int row = 0; row < 4; row++) {
    for (int step = 1; step <= 4; step++) {
      const int col = (row + step) % 4;
      G[RIDX(col,row,dim)] =
          (G[RIDX(row,col,dim)] != 0) || (G[RIDX(col,row,dim)] != 0);
    }
  }
}


/*
 * t4o - 4x4 tile combine with the diagonal element of each row handled
 * last: for row i the columns are visited as i+1, i+2, i+3, i (mod 4).
 * dest at RIDX(col,i,dim) becomes the logical OR (0 or 1) of src at
 * RIDX(i,col,dim) and its own previous value.
 */
void t4o(int *dest, int *src, int dim)
{
  for (int row = 0; row < 4; row++) {
    for (int step = 1; step <= 4; step++) {
      const int col = (row + step) % 4;
      dest[RIDX(col,row,dim)] =
          (src[RIDX(row,col,dim)] != 0) || (dest[RIDX(col,row,dim)] != 0);
    }
  }
}

/*
 * t4o_arith - 4x4 tile combine using bitwise OR, with the diagonal element
 * of each row handled last: for row i the columns are visited as i+1, i+2,
 * i+3, i (mod 4). dest at RIDX(col,i,dim) is OR-ed with src at
 * RIDX(i,col,dim).
 */
void t4o_arith(int *dest, int *src, int dim)
{
  for (int row = 0; row < 4; row++) {
    for (int step = 1; step <= 4; step++) {
      const int col = (row + step) % 4;
      dest[RIDX(col,row,dim)] |= src[RIDX(row,col,dim)];
    }
  }
}


/*
 * Process a size x size region as 8x8 blocks, each split into four 4x4
 * quadrants. Per block the quadrants are handled in a fixed order:
 * (i, j+4) with t4, (i, j) with t4o, (i+4, j) with t4, (i+4, j+4) with
 * t4o, where (i, j) is the source offset of the block and the destination
 * offset is the mirrored one. dim is passed through to RIDX and the tile
 * helpers.
 */
void row8qo_transpose_helper(int *dest, int *src, int size, int dim)
{
  for (int r = 0; r < size; r += 8) {
    const int r_hi = r + 4;
    for (int c = 0; c < size; c += 8) {
      const int c_hi = c + 4;

      t4(dest + RIDX(c_hi,r,dim), src + RIDX(r,c_hi,dim), dim);
      t4o(dest + RIDX(c,r,dim), src + RIDX(r,c,dim), dim);
      t4(dest + RIDX(c,r_hi,dim), src + RIDX(r_hi,c,dim), dim);
      t4o(dest + RIDX(c_hi,r_hi,dim), src + RIDX(r_hi,c_hi,dim), dim);
    }
  }
}

/*
 * Bitwise-OR counterpart of row8qo_transpose_helper: process a size x size
 * region as 8x8 blocks, each split into four 4x4 quadrants handled in the
 * order (i, j+4) with t4_arith, (i, j) with t4o_arith, (i+4, j) with
 * t4_arith, (i+4, j+4) with t4o_arith. dim is passed through to RIDX and
 * the tile helpers.
 */
void row8qo_transpose_helper_arith(int *dest, int *src, int size, int dim)
{
  for (int r = 0; r < size; r += 8) {
    const int r_hi = r + 4;
    for (int c = 0; c < size; c += 8) {
      const int c_hi = c + 4;

      t4_arith(dest + RIDX(c_hi,r,dim), src + RIDX(r,c_hi,dim), dim);
      t4o_arith(dest + RIDX(c,r,dim), src + RIDX(r,c,dim), dim);
      t4_arith(dest + RIDX(c,r_hi,dim), src + RIDX(r_hi,c,dim), dim);
      t4o_arith(dest + RIDX(c_hi,r_hi,dim), src + RIDX(r_hi,c_hi,dim), dim);
    }
  }
}



/*
 * Recursive quadrant scan. A size x size region is split into four
 * half-size quadrants until size <= size_cutoff, at which point the region
 * is handed to row8qo_transpose_helper.
 *
 * toggle selects the quadrant visiting order (i = first source offset
 * index, j = second):
 *   toggle != 0: (0,1) (0,0) (1,1) (1,0)
 *   toggle == 0: (1,0) (1,1) (0,0) (0,1)
 *
 * This code is intentionally broken for sizes that are not powers of 2.
 * It can be fixed by changing the base-case condition to:
 *    size <= size_cutoff || s2 % 8 != 0
 * where s2 is size >> 1.
 */
void quado_transpose_helper(int *dest, int *src, int size,
                            int dim, int size_cutoff, int toggle)
{
  const int half = size >> 1;

  if (size <= size_cutoff) {
    /* Stop the recursion */
    row8qo_transpose_helper(dest, src, size, dim);
    return;
  }

  /* First value taken by each quadrant index; the second is its complement */
  const int i_first = toggle ? 0 : 1;
  const int j_first = toggle ? 1 : 0;

  for (int a = 0; a < 2; a++) {
    const int i_off = (a == 0 ? i_first : 1 - i_first) * half;
    for (int b = 0; b < 2; b++) {
      const int j_off = (b == 0 ? j_first : 1 - j_first) * half;
      quado_transpose_helper(dest + RIDX(j_off,i_off,dim),
                             src + RIDX(i_off,j_off,dim),
                             half, dim, size_cutoff, toggle);
    }
  }
}

/*
 * Bitwise-OR counterpart of quado_transpose_helper. A size x size region
 * is split into four half-size quadrants until size <= size_cutoff, at
 * which point the region is handed to row8qo_transpose_helper_arith.
 *
 * toggle selects the quadrant visiting order (i = first source offset
 * index, j = second):
 *   toggle != 0: (0,1) (0,0) (1,1) (1,0)
 *   toggle == 0: (1,0) (1,1) (0,0) (0,1)
 *
 * This code is intentionally broken for sizes that are not powers of 2.
 * It can be fixed by changing the base-case condition to:
 *    size <= size_cutoff || s2 % 8 != 0
 * where s2 is size >> 1.
 */
void quado_transpose_helper_arith(int *dest, int *src, int size,
                            int dim, int size_cutoff, int toggle)
{
  const int half = size >> 1;

  if (size <= size_cutoff) {
    /* Stop the recursion */
    row8qo_transpose_helper_arith(dest, src, size, dim);
    return;
  }

  /* First value taken by each quadrant index; the second is its complement */
  const int i_first = toggle ? 0 : 1;
  const int j_first = toggle ? 1 : 0;

  for (int a = 0; a < 2; a++) {
    const int i_off = (a == 0 ? i_first : 1 - i_first) * half;
    for (int b = 0; b < 2; b++) {
      const int j_off = (b == 0 ? j_first : 1 - j_first) * half;
      quado_transpose_helper_arith(dest + RIDX(j_off,i_off,dim),
                                   src + RIDX(i_off,j_off,dim),
                                   half, dim, size_cutoff, toggle);
    }
  }
}



char quadrec_descr[] = "Recursive Quadrant Scan, toggling, Diagonal last";
/*
 * Entry point for the recursive quadrant scan with a cutoff of 8. A static
 * flag (initially 1) is passed as the traversal order and flipped after
 * every call, so consecutive calls alternate between the two orders.
 */
void quadrec_transpose(int *dest, int *src, int dim)
{
  static int toggle = 1;
  const int cutoff = 8;

  quado_transpose_helper(dest, src, dim, dim, cutoff, toggle);
  toggle ^= 1;
}

/*
 * Bitwise-OR entry point for the recursive quadrant scan with a cutoff of
 * 8. A static flag (initially 1) is passed as the traversal order and
 * flipped after every call.
 */
void quadrec_transpose_arith(int *dest, int *src, int dim)
{
  static int toggle = 1;
  const int cutoff = 8;

  quado_transpose_helper_arith(dest, src, dim, dim, cutoff, toggle);
  toggle ^= 1;
}

char row8_descr[] = "8 Row Scan";
/*
 * Transpose handling 8 source rows at a time: for each band of 8
 * consecutive i values and each j, the 8 elements src at RIDX(i+k,j,dim)
 * are moved to dest at RIDX(j,i+k,dim) with the COPY macro (defined
 * outside this file).
 * i+7 is used without checking it against dim, so dim is expected to be a
 * multiple of 8.
 */
void row8_transpose(int *dest, int *src, int dim)
{
  for (int band = 0; band < dim; band += 8) {
    for (int col = 0; col < dim; col++) {
      for (int row = band; row < band + 8; row++) {
        COPY(&dest[RIDX(col,row,dim)], &src[RIDX(row,col,dim)]);
      }
    }
  }
}

/*
 * Transpose with the 8-row inner step written out by hand.
 *
 * For each band of 8 consecutive i values and each j, eight COPY
 * operations are issued: the destination is dptr+k (k = 0..7) starting at
 * dest+RIDX(j,i,dim), and the source pointer starts at src+RIDX(i,j,dim)
 * and advances by dim elements between operations.
 *
 * COPY is defined outside this file; presumably it stores *sptr into the
 * destination - confirm against its definition.
 * The band covers i .. i+7 without checking against dim, so dim is
 * expected to be a multiple of 8.
 */
char unroll8_descr[] = "Unrolled 8 Rows";
void unroll8_transpose(int *dest, int *src, int dim)
{
  int i, j;
  for (i = 0; i < dim; i+=8) {
    for (j = 0; j < dim; j++) {
      int *dptr = dest+RIDX(j,i,dim);
      int *sptr = src+RIDX(i,j,dim);
      COPY(dptr, sptr); sptr += dim;
      COPY(dptr+1, sptr); sptr += dim;
      COPY(dptr+2, sptr); sptr += dim;
      COPY(dptr+3, sptr); sptr += dim;
      COPY(dptr+4, sptr); sptr += dim;
      COPY(dptr+5, sptr); sptr += dim;
      COPY(dptr+6, sptr); sptr += dim;
      COPY(dptr+7, sptr);
    }
  }
}

/*
 * Single-array variant with a hand-unrolled 4-row step: both the
 * destination pointer (dest+RIDX(j,i,dim)) and the source pointer
 * (dest+RIDX(i,j,dim)) point into the same array.
 *
 * For each band of 4 consecutive i values and each j, four COPY operations
 * are issued with destination dptr+k (k = 0..3) and a source pointer that
 * advances by dim elements between operations.
 *
 * COPY is defined outside this file; presumably it stores *sptr into the
 * destination - confirm against its definition.
 * NOTE(review): because source and destination alias, elements written
 * early are read again later in the scan - confirm this is intended.
 * The band covers i .. i+3 without checking against dim, so dim is
 * expected to be a multiple of 4.
 */
void unroll4_transpose(int *dest, int dim)
{
  int i, j;
  for (i = 0; i < dim; i+=4) {
    for (j = 0; j < dim; j++) {
      int *dptr = dest+RIDX(j,i,dim);
      int *sptr = dest+RIDX(i,j,dim);
      COPY(dptr, sptr); sptr += dim;
      COPY(dptr+1, sptr); sptr += dim;
      COPY(dptr+2, sptr); sptr += dim;
      COPY(dptr+3, sptr);
    }
  }
}

/*
 * Transpose with the 16-row inner step written out by hand.
 *
 * For each band of 16 consecutive i values and each j, sixteen COPY
 * operations are issued: the destination is dptr+k (k = 0..15) starting at
 * dest+RIDX(j,i,dim), and the source pointer starts at src+RIDX(i,j,dim)
 * and advances by dim elements between operations.
 *
 * COPY is defined outside this file; presumably it stores *sptr into the
 * destination - confirm against its definition.
 * The band covers i .. i+15 without checking against dim, so dim is
 * expected to be a multiple of 16.
 */
char unroll16_descr[] = "Unrolled 16 Rows";
void unroll16_transpose(int *dest, int *src, int dim)
{
  int i, j;
  for (i = 0; i < dim; i+=16) {
    for (j = 0; j < dim; j++) {
      int *dptr = dest+RIDX(j,i,dim);
      int *sptr = src+RIDX(i,j,dim);
      COPY(dptr, sptr); sptr += dim;
      COPY(dptr+1, sptr); sptr += dim;
      COPY(dptr+2, sptr); sptr += dim;
      COPY(dptr+3, sptr); sptr += dim;
      COPY(dptr+4, sptr); sptr += dim;
      COPY(dptr+5, sptr); sptr += dim;
      COPY(dptr+6, sptr); sptr += dim;
      COPY(dptr+7, sptr); sptr += dim;
      COPY(dptr+8, sptr); sptr += dim;
      COPY(dptr+9, sptr); sptr += dim;
      COPY(dptr+10, sptr); sptr += dim;
      COPY(dptr+11, sptr); sptr += dim;
      COPY(dptr+12, sptr); sptr += dim;
      COPY(dptr+13, sptr); sptr += dim;
      COPY(dptr+14, sptr); sptr += dim;
      COPY(dptr+15, sptr);
    }
  }
}

/*
 * 16-row unrolled transpose whose band direction alternates between calls.
 *
 * A static flag `sense` (initially 0) selects the band order: when set,
 * i runs 0, 16, ... upward; otherwise i runs from dim-16 downward. The flag
 * is flipped on every call, so the first call scans downward.
 *
 * The band loop terminates on `i != stop`, so it only ends if dim is a
 * multiple of 16.
 *
 * COPY is defined outside this file; presumably it stores *sptr into the
 * destination - confirm against its definition.
 */
char toggle16_descr[] = "Unrolled 16 Rows, toggle";
void toggle16_transpose(int *dest, int *src, int dim)
{
  static int sense = 0;
  int i, j;
  int start, stop, incr;
  if (sense) {
    start = 0;
    stop = dim;
    incr = 16;
  } else {
    start = dim-16;
    stop = -16;
    incr = -16;
  }
  sense = 1-sense; /* Toggle for next time */
  for (i = start; i != stop; i+=incr) {
    for (j = 0; j < dim; j++) {
      int *dptr = dest+RIDX(j,i,dim);
      int *sptr = src+RIDX(i,j,dim);
      COPY(dptr, sptr); sptr += dim;
      COPY(dptr+1, sptr); sptr += dim;
      COPY(dptr+2, sptr); sptr += dim;
      COPY(dptr+3, sptr); sptr += dim;
      COPY(dptr+4, sptr); sptr += dim;
      COPY(dptr+5, sptr); sptr += dim;
      COPY(dptr+6, sptr); sptr += dim;
      COPY(dptr+7, sptr); sptr += dim;
      COPY(dptr+8, sptr); sptr += dim;
      COPY(dptr+9, sptr); sptr += dim;
      COPY(dptr+10, sptr); sptr += dim;
      COPY(dptr+11, sptr); sptr += dim;
      COPY(dptr+12, sptr); sptr += dim;
      COPY(dptr+13, sptr); sptr += dim;
      COPY(dptr+14, sptr); sptr += dim;
      COPY(dptr+15, sptr);
    }
  }
}

/*
 * In-place 16-row unrolled conversion: src is an alias of dest, and for
 * each band of 16 i values only j >= i is scanned.
 *
 * Each element pair is combined with T_COPY, which is defined outside this
 * file; presumably it merges *sptr into the destination rather than
 * overwriting it - confirm against its definition.
 *
 * `sense` is forced to 1 right before it is tested, so the upward scan
 * (i = 0, 16, ...) is always used, the else branch is never taken, and the
 * toggle further down has no effect on later calls.
 *
 * The band loop terminates on `i != stop`, so it only ends if dim is a
 * multiple of 16.
 */
void toggle16_convert(int *dest, int dim)
{
  static int sense = 0;
  int i, j;
  int *src =dest;
  int start, stop, incr;
  sense =1;
  if (sense) {
    start = 0;
    stop = dim;
    incr = 16;
  } else {
    start = dim-16;
    stop = -16;
    incr = -16;
  }
  sense = 1-sense; /* Toggle for next time */
  for (i = start; i != stop; i+=incr) {
    for (j = i; j < dim; j++) {
      int *dptr = dest+RIDX(j,i,dim);
      int *sptr = src+RIDX(i,j,dim);
      T_COPY(dptr, sptr); sptr += dim;
      T_COPY(dptr+1, sptr); sptr += dim;
      T_COPY(dptr+2, sptr); sptr += dim;
      T_COPY(dptr+3, sptr); sptr += dim;
      T_COPY(dptr+4, sptr); sptr += dim;
      T_COPY(dptr+5, sptr); sptr += dim;
      T_COPY(dptr+6, sptr); sptr += dim;
      T_COPY(dptr+7, sptr); sptr += dim;
      T_COPY(dptr+8, sptr); sptr += dim;
      T_COPY(dptr+9, sptr); sptr += dim;
      T_COPY(dptr+10, sptr); sptr += dim;
      T_COPY(dptr+11, sptr); sptr += dim;
      T_COPY(dptr+12, sptr); sptr += dim;
      T_COPY(dptr+13, sptr); sptr += dim;
      T_COPY(dptr+14, sptr); sptr += dim;
      T_COPY(dptr+15, sptr);
    }
  }
}


/*
 * 16-row by 2-column unrolled transpose.
 *
 * For each band of 16 i values and each pair of j values (j, j+1), the
 * code walks down the 16 source rows for j (source pointer += dim each
 * step), then steps to j+1 (dptr += dim; sptr++) and walks back up the
 * same 16 rows in reverse (source pointer -= dim each step).
 *
 * `sense` is toggled and then immediately forced to 1, so the first branch
 * (bands in increasing i) is always taken and the else branch (bands in
 * decreasing i) is never executed.
 *
 * COPY is defined outside this file; presumably it stores *sptr into the
 * destination - confirm against its definition.
 * Bands cover i .. i+15 and column pairs j, j+1 without checking against
 * dim, so dim is expected to be a multiple of 16.
 */
char toggle16x2_descr[] = "Unrolled 16 Rows X 2 Cols, toggle";
void toggle16x2_transpose(int *dest, int *src, int dim)
{
  static int sense = 0;
  int i, j;
  sense = 1-sense; /* Toggle for next time */
  sense =1;
  if (sense) {
    for (i = 0; i < dim; i+=16) {
      for (j = 0; j < dim; j+=2) {
	int *dptr = dest+RIDX(j,i,dim);
	int *sptr = src+RIDX(i,j,dim);
	COPY(dptr, sptr); sptr += dim;
	COPY(dptr+1, sptr); sptr += dim;
	COPY(dptr+2, sptr); sptr += dim;
	COPY(dptr+3, sptr); sptr += dim;
	COPY(dptr+4, sptr); sptr += dim;
	COPY(dptr+5, sptr); sptr += dim;
	COPY(dptr+6, sptr); sptr += dim;
	COPY(dptr+7, sptr); sptr += dim;
	COPY(dptr+8, sptr); sptr += dim;
	COPY(dptr+9, sptr); sptr += dim;
	COPY(dptr+10, sptr); sptr += dim;
	COPY(dptr+11, sptr); sptr += dim;
	COPY(dptr+12, sptr); sptr += dim;
	COPY(dptr+13, sptr); sptr += dim;
	COPY(dptr+14, sptr); sptr += dim;
	COPY(dptr+15, sptr);
	/* Now go back up */
	dptr += dim; sptr++;
	COPY(dptr+15, sptr); sptr -= dim;
	COPY(dptr+14, sptr); sptr -= dim;
	COPY(dptr+13, sptr); sptr -= dim;
	COPY(dptr+12, sptr); sptr -= dim;
	COPY(dptr+11, sptr); sptr -= dim;
	COPY(dptr+10, sptr); sptr -= dim;
	COPY(dptr+9, sptr); sptr -= dim;
	COPY(dptr+8, sptr); sptr -= dim;
	COPY(dptr+7, sptr); sptr -= dim;
	COPY(dptr+6, sptr); sptr -= dim;
	COPY(dptr+5, sptr); sptr -= dim;
	COPY(dptr+4, sptr); sptr -= dim;
	COPY(dptr+3, sptr); sptr -= dim;
	COPY(dptr+2, sptr); sptr -= dim;
	COPY(dptr+1, sptr); sptr -= dim;
	COPY(dptr, sptr);
      }
    }
  } else {
    for (i = dim-16; i >= 0; i-=16) {
      for (j = 0; j < dim; j+=2) {
	int *dptr = dest+RIDX(j,i,dim);
	int *sptr = src+RIDX(i,j,dim);
	COPY(dptr, sptr); sptr += dim;
	COPY(dptr+1, sptr); sptr += dim;
	COPY(dptr+2, sptr); sptr += dim;
	COPY(dptr+3, sptr); sptr += dim;
	COPY(dptr+4, sptr); sptr += dim;
	COPY(dptr+5, sptr); sptr += dim;
	COPY(dptr+6, sptr); sptr += dim;
	COPY(dptr+7, sptr); sptr += dim;
	COPY(dptr+8, sptr); sptr += dim;
	COPY(dptr+9, sptr); sptr += dim;
	COPY(dptr+10, sptr); sptr += dim;
	COPY(dptr+11, sptr); sptr += dim;
	COPY(dptr+12, sptr); sptr += dim;
	COPY(dptr+13, sptr); sptr += dim;
	COPY(dptr+14, sptr); sptr += dim;
	COPY(dptr+15, sptr);
	/* Now go back up */
	dptr += dim; sptr++;
	COPY(dptr+15, sptr); sptr -= dim;
	COPY(dptr+14, sptr); sptr -= dim;
	COPY(dptr+13, sptr); sptr -= dim;
	COPY(dptr+12, sptr); sptr -= dim;
	COPY(dptr+11, sptr); sptr -= dim;
	COPY(dptr+10, sptr); sptr -= dim;
	COPY(dptr+9, sptr); sptr -= dim;
	COPY(dptr+8, sptr); sptr -= dim;
	COPY(dptr+7, sptr); sptr -= dim;
	COPY(dptr+6, sptr); sptr -= dim;
	COPY(dptr+5, sptr); sptr -= dim;
	COPY(dptr+4, sptr); sptr -= dim;
	COPY(dptr+3, sptr); sptr -= dim;
	COPY(dptr+2, sptr); sptr -= dim;
	COPY(dptr+1, sptr); sptr -= dim;
	COPY(dptr, sptr);
      }
    }
  }
}

/*
 * In-place 16-row by 2-column unrolled conversion: src is an alias of
 * dest.
 *
 * For each band of 16 i values and each pair of j values (j, j+1), the
 * code walks down the 16 rows for j (source pointer += dim each step),
 * then steps to j+1 (dptr += dim; sptr++) and walks back up in reverse.
 * In the branch that actually runs, j starts at i, so only j >= i is
 * scanned.
 *
 * Each element pair is combined with T_COPY, which is defined outside this
 * file; presumably it merges *sptr into the destination rather than
 * overwriting it - confirm against its definition.
 *
 * `sense` is toggled and then immediately forced to 1, so the first branch
 * is always taken; the else branch (decreasing i, j starting at 0) is
 * never executed.
 *
 * Bands cover i .. i+15 and column pairs j, j+1 without checking against
 * dim, so dim is expected to be a multiple of 16.
 */
void toggle16x2_convert(int *dest, int dim)
{
  static int sense = 0;
  int i, j;
  int *src = dest;
  sense = 1-sense; /* Toggle for next time */
  sense =1;
  if (sense) {
    for (i = 0; i < dim; i+=16) {
      for (j = i; j < dim; j+=2) {
        int *dptr = dest+RIDX(j,i,dim);
        int *sptr = src+RIDX(i,j,dim);
        T_COPY(dptr, sptr); sptr += dim;
        T_COPY(dptr+1, sptr); sptr += dim;
        T_COPY(dptr+2, sptr); sptr += dim;
        T_COPY(dptr+3, sptr); sptr += dim;
        T_COPY(dptr+4, sptr); sptr += dim;
        T_COPY(dptr+5, sptr); sptr += dim;
        T_COPY(dptr+6, sptr); sptr += dim;
        T_COPY(dptr+7, sptr); sptr += dim;
        T_COPY(dptr+8, sptr); sptr += dim;
        T_COPY(dptr+9, sptr); sptr += dim;
        T_COPY(dptr+10, sptr); sptr += dim;
        T_COPY(dptr+11, sptr); sptr += dim;
        T_COPY(dptr+12, sptr); sptr += dim;
        T_COPY(dptr+13, sptr); sptr += dim;
        T_COPY(dptr+14, sptr); sptr += dim;
        T_COPY(dptr+15, sptr);
        /* Now go back up */
        dptr += dim; sptr++;
        T_COPY(dptr+15, sptr); sptr -= dim;
        T_COPY(dptr+14, sptr); sptr -= dim;
        T_COPY(dptr+13, sptr); sptr -= dim;
        T_COPY(dptr+12, sptr); sptr -= dim;
        T_COPY(dptr+11, sptr); sptr -= dim;
        T_COPY(dptr+10, sptr); sptr -= dim;
        T_COPY(dptr+9, sptr); sptr -= dim;
        T_COPY(dptr+8, sptr); sptr -= dim;
        T_COPY(dptr+7, sptr); sptr -= dim;
        T_COPY(dptr+6, sptr); sptr -= dim;
        T_COPY(dptr+5, sptr); sptr -= dim;
        T_COPY(dptr+4, sptr); sptr -= dim;
        T_COPY(dptr+3, sptr); sptr -= dim;
        T_COPY(dptr+2, sptr); sptr -= dim;
        T_COPY(dptr+1, sptr); sptr -= dim;
        T_COPY(dptr, sptr);
      }
    }
  } else {
    for (i = dim-16; i >= 0; i-=16) {
      for (j = 0; j < dim; j+=2) {
        int *dptr = dest+RIDX(j,i,dim);
        int *sptr = src+RIDX(i,j,dim);
        T_COPY(dptr, sptr); sptr += dim;
        T_COPY(dptr+1, sptr); sptr += dim;
        T_COPY(dptr+2, sptr); sptr += dim;
        T_COPY(dptr+3, sptr); sptr += dim;
        T_COPY(dptr+4, sptr); sptr += dim;
        T_COPY(dptr+5, sptr); sptr += dim;
        T_COPY(dptr+6, sptr); sptr += dim;
        T_COPY(dptr+7, sptr); sptr += dim;
        T_COPY(dptr+8, sptr); sptr += dim;
        T_COPY(dptr+9, sptr); sptr += dim;
        T_COPY(dptr+10, sptr); sptr += dim;
        T_COPY(dptr+11, sptr); sptr += dim;
        T_COPY(dptr+12, sptr); sptr += dim;
        T_COPY(dptr+13, sptr); sptr += dim;
        T_COPY(dptr+14, sptr); sptr += dim;
        T_COPY(dptr+15, sptr);
        /* Now go back up */
        dptr += dim; sptr++;
        T_COPY(dptr+15, sptr); sptr -= dim;
        T_COPY(dptr+14, sptr); sptr -= dim;
        T_COPY(dptr+13, sptr); sptr -= dim;
        T_COPY(dptr+12, sptr); sptr -= dim;
        T_COPY(dptr+11, sptr); sptr -= dim;
        T_COPY(dptr+10, sptr); sptr -= dim;
        T_COPY(dptr+9, sptr); sptr -= dim;
        T_COPY(dptr+8, sptr); sptr -= dim;
        T_COPY(dptr+7, sptr); sptr -= dim;
        T_COPY(dptr+6, sptr); sptr -= dim;
        T_COPY(dptr+5, sptr); sptr -= dim;
        T_COPY(dptr+4, sptr); sptr -= dim;
        T_COPY(dptr+3, sptr); sptr -= dim;
        T_COPY(dptr+2, sptr); sptr -= dim;
        T_COPY(dptr+1, sptr); sptr -= dim;
        T_COPY(dptr, sptr);
      }
    }
  }
}



/*
 * Transpose with the 32-row inner step written out by hand.
 *
 * For each band of 32 consecutive i values and each j, thirty-two COPY
 * operations are issued: the destination is dptr+k (k = 0..31) starting at
 * dest+RIDX(j,i,dim), and the source pointer starts at src+RIDX(i,j,dim)
 * and advances by dim elements between operations.
 *
 * COPY is defined outside this file; presumably it stores *sptr into the
 * destination - confirm against its definition.
 * The band covers i .. i+31 without checking against dim, so dim is
 * expected to be a multiple of 32.
 */
char unroll32_descr[] = "Unrolled 32 Rows";
void unroll32_transpose(int *dest, int *src, int dim)
{
  int i, j;
  for (i = 0; i < dim; i+=32) {
    for (j = 0; j < dim; j++) {
      int *dptr = dest+RIDX(j,i,dim);
      int *sptr = src+RIDX(i,j,dim);
      COPY(dptr, sptr); sptr += dim;
      COPY(dptr+1, sptr); sptr += dim;
      COPY(dptr+2, sptr); sptr += dim;
      COPY(dptr+3, sptr); sptr += dim;
      COPY(dptr+4, sptr); sptr += dim;
      COPY(dptr+5, sptr); sptr += dim;
      COPY(dptr+6, sptr); sptr += dim;
      COPY(dptr+7, sptr); sptr += dim;
      COPY(dptr+8, sptr); sptr += dim;
      COPY(dptr+9, sptr); sptr += dim;
      COPY(dptr+10, sptr); sptr += dim;
      COPY(dptr+11, sptr); sptr += dim;
      COPY(dptr+12, sptr); sptr += dim;
      COPY(dptr+13, sptr); sptr += dim;
      COPY(dptr+14, sptr); sptr += dim;
      COPY(dptr+15, sptr); sptr += dim;
      COPY(dptr+16, sptr); sptr += dim;
      COPY(dptr+17, sptr); sptr += dim;
      COPY(dptr+18, sptr); sptr += dim;
      COPY(dptr+19, sptr); sptr += dim;
      COPY(dptr+20, sptr); sptr += dim;
      COPY(dptr+21, sptr); sptr += dim;
      COPY(dptr+22, sptr); sptr += dim;
      COPY(dptr+23, sptr); sptr += dim;
      COPY(dptr+24, sptr); sptr += dim;
      COPY(dptr+25, sptr); sptr += dim;
      COPY(dptr+26, sptr); sptr += dim;
      COPY(dptr+27, sptr); sptr += dim;
      COPY(dptr+28, sptr); sptr += dim;
      COPY(dptr+29, sptr); sptr += dim;
      COPY(dptr+30, sptr); sptr += dim;
      COPY(dptr+31, sptr);
    }
  }
}

/*
 * 32-row unrolled transpose whose scan direction alternates between calls.
 *
 * A static flag `sense` (initially 0) selects the order and is flipped at
 * the end of every call:
 *   sense != 0: bands in increasing i, j from dim-1 down to 0
 *   sense == 0: bands in decreasing i (from dim-32), j from 0 up to dim-1
 * so the first call uses the second order.
 *
 * Within a band, thirty-two COPY operations are issued per j, with
 * destination dptr+k (k = 0..31) and a source pointer advancing by dim
 * elements between operations.
 *
 * COPY is defined outside this file; presumably it stores *sptr into the
 * destination - confirm against its definition.
 * Bands cover i .. i+31 without checking against dim, so dim is expected
 * to be a multiple of 32.
 */
char toggle32_descr[] = "Unrolled 32 Rows, toggle";
void toggle32_transpose(int *dest, int *src, int dim)
{
  static int sense = 0;
  int i, j;
  if (sense) {
    for (i = 0; i < dim; i+=32) {
      for (j = dim-1; j >= 0; j-=1) {
	int *dptr = dest+RIDX(j,i,dim);
	int *sptr = src+RIDX(i,j,dim);
	COPY(dptr, sptr); sptr += dim;
	COPY(dptr+1, sptr); sptr += dim;
	COPY(dptr+2, sptr); sptr += dim;
	COPY(dptr+3, sptr); sptr += dim;
	COPY(dptr+4, sptr); sptr += dim;
	COPY(dptr+5, sptr); sptr += dim;
	COPY(dptr+6, sptr); sptr += dim;
	COPY(dptr+7, sptr); sptr += dim;
	COPY(dptr+8, sptr); sptr += dim;
	COPY(dptr+9, sptr); sptr += dim;
	COPY(dptr+10, sptr); sptr += dim;
	COPY(dptr+11, sptr); sptr += dim;
	COPY(dptr+12, sptr); sptr += dim;
	COPY(dptr+13, sptr); sptr += dim;
	COPY(dptr+14, sptr); sptr += dim;
	COPY(dptr+15, sptr); sptr += dim;
	COPY(dptr+16, sptr); sptr += dim;
	COPY(dptr+17, sptr); sptr += dim;
	COPY(dptr+18, sptr); sptr += dim;
	COPY(dptr+19, sptr); sptr += dim;
	COPY(dptr+20, sptr); sptr += dim;
	COPY(dptr+21, sptr); sptr += dim;
	COPY(dptr+22, sptr); sptr += dim;
	COPY(dptr+23, sptr); sptr += dim;
	COPY(dptr+24, sptr); sptr += dim;
	COPY(dptr+25, sptr); sptr += dim;
	COPY(dptr+26, sptr); sptr += dim;
	COPY(dptr+27, sptr); sptr += dim;
	COPY(dptr+28, sptr); sptr += dim;
	COPY(dptr+29, sptr); sptr += dim;
	COPY(dptr+30, sptr); sptr += dim;
	COPY(dptr+31, sptr);
      }
    }
  } else {
    for (i = dim-32; i >= 0; i-=32) {
      for (j = 0; j < dim; j+=1) {
	int *dptr = dest+RIDX(j,i,dim);
	int *sptr = src+RIDX(i,j,dim);
	COPY(dptr, sptr); sptr += dim;
	COPY(dptr+1, sptr); sptr += dim;
	COPY(dptr+2, sptr); sptr += dim;
	COPY(dptr+3, sptr); sptr += dim;
	COPY(dptr+4, sptr); sptr += dim;
	COPY(dptr+5, sptr); sptr += dim;
	COPY(dptr+6, sptr); sptr += dim;
	COPY(dptr+7, sptr); sptr += dim;
	COPY(dptr+8, sptr); sptr += dim;
	COPY(dptr+9, sptr); sptr += dim;
	COPY(dptr+10, sptr); sptr += dim;
	COPY(dptr+11, sptr); sptr += dim;
	COPY(dptr+12, sptr); sptr += dim;
	COPY(dptr+13, sptr); sptr += dim;
	COPY(dptr+14, sptr); sptr += dim;
	COPY(dptr+15, sptr); sptr += dim;
	COPY(dptr+16, sptr); sptr += dim;
	COPY(dptr+17, sptr); sptr += dim;
	COPY(dptr+18, sptr); sptr += dim;
	COPY(dptr+19, sptr); sptr += dim;
	COPY(dptr+20, sptr); sptr += dim;
	COPY(dptr+21, sptr); sptr += dim;
	COPY(dptr+22, sptr); sptr += dim;
	COPY(dptr+23, sptr); sptr += dim;
	COPY(dptr+24, sptr); sptr += dim;
	COPY(dptr+25, sptr); sptr += dim;
	COPY(dptr+26, sptr); sptr += dim;
	COPY(dptr+27, sptr); sptr += dim;
	COPY(dptr+28, sptr); sptr += dim;
	COPY(dptr+29, sptr); sptr += dim;
	COPY(dptr+30, sptr); sptr += dim;
	COPY(dptr+31, sptr);
      }
    }

  }
  sense = 1-sense; /* Toggle for next time */
}

/*** This is the code you need to write for Part I ***/
char good_cache_descr[] = "Optimized for Cache Performance";
/*
 * Cache-oriented entry point: runs the recursive quadrant scan with a
 * cutoff of 8. A static flag (initially 1) is passed as the traversal
 * order and flipped after every call.
 */
void good_cache_transpose(int *dest, int *src, int dim)
{
  static int toggle = 1;
  const int cutoff = 8;

  quado_transpose_helper(dest, src, dim, dim, cutoff, toggle);
  toggle ^= 1;
}

/*
 * Bitwise-OR cache-oriented entry point: runs the recursive quadrant scan
 * (arith variant) with a cutoff of 8. A static flag (initially 1) is
 * passed as the traversal order and flipped after every call.
 */
void good_cache_transpose_arith(int *dest, int *src, int dim)
{
  static int toggle = 1;
  const int cutoff = 8;

  quado_transpose_helper_arith(dest, src, dim, dim, cutoff, toggle);
  toggle ^= 1;
}


/*** This is the code you need to write for Part II ***/
char good_throughput_descr[] = "Optimized for throughput";
/*
 * Throughput-oriented entry point: sizes up to 128 use the 32-row toggling
 * transpose, larger sizes use the 16-row by 2-column variant.
 */
void good_throughput_transpose(int *dest, int *src, int dim)
{
  if (dim > 128) {
    toggle16x2_transpose(dest, src, dim);
    return;
  }
  toggle32_transpose(dest, src, dim);
}

/*
void good_throughput_transpose_arith(int *dest, int *src, int dim)
{
  if (dim <= 128)
    toggle32_transpose_arith(dest, src, dim);
  else
    toggle16x2_transpose_arith(dest, src, dim);
}
*/


/*
void register_transposers(void)
{
  add_transposer(row_transpose, row_descr);
  add_transposer(col_transpose, col_descr);
*/
  /*  add_transposer(col4_transpose, col4_descr); */
/*
  add_transposer(row8_transpose, row8_descr);
  add_transposer(unroll8_transpose, unroll8_descr);
  add_transposer(unroll16_transpose, unroll16_descr);
  add_transposer(toggle16_transpose, toggle16_descr);  
  add_transposer(toggle16x2_transpose, toggle16x2_descr);  
  add_transposer(unroll32_transpose, unroll32_descr);
  add_transposer(toggle32_transpose, toggle32_descr);  
  add_transposer(good_cache_transpose, good_cache_descr);
  add_transposer(good_throughput_transpose, good_throughput_descr);
}
*/



