#include <stdio.h>
#include "tplib.h"

extern double *shmem_init();

static int tp_schedule[1024];
static int tp_init=0;
static int tp_cid;
static int tp_npes;
static int tp_lbpes;
static int tp_mode;

static int mask(n,p,b) 
    unsigned int n,p,b; 
{   
    return ((n>>p) & ((1<<b)-1));
}
    
static int perm3[8][8]={{1,4,7,2,5,0,3,6},{7,2,5,0,3,6,1,4},
                 {5,0,3,6,1,4,7,2},{3,6,1,4,7,2,5,0},
                 {2,7,4,1,6,3,0,5},{6,3,0,5,2,7,4,1},
                 {4,5,2,3,0,1,6,7},{0,1,6,7,4,5,2,3}};
static int perm2[4][4]={{1,3,0,2},{2,0,3,1},{3,1,2,0},{0,2,1,3}};
static int perm1[2][2]={{0,1},{1,0}};
static int lb[17]={-1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4};

static int perm(bits,phase,cid) 
    int bits,phase,cid;
{
    if (bits==1) return(perm1[phase][cid]);
    if (bits==2) return(perm2[phase][cid]);
    if (bits==3) return(perm3[phase][cid]);
}

int tp_lb(x) 
   int x;
{ 
   int xx,xy,lx;

   xx=x; lx=0; xy=1;
   while(( xx >>= 1) > 0) {lx++; xy=xy*2;}
   if (xy==x) return(lx); else return(-1);
}

int tp_transpose_mode(mode) 
   int mode;
{
   tp_mode=mode; 
}

int tp_transpose_init() 
{
   int nodes[1024];
   int w,x,y,z;
   int xp,yp,zp,wp;
   int xb,yb,zb,wb;
   int i,j,ok,dst;

   tp_mode=0; 

   PSHAPE(&x,&y,&z);
   tp_cid=_my_pe();
   tp_npes=_num_pes();
   w=2; 
   x=x/2;
   wp=0;wb=lb[w];
   xp=wb;xb=lb[x];
   yp=xp+xb;yb=lb[y];
   zp=yp+yb;zb=lb[z];

   if ((tp_lbpes=tp_lb(tp_npes)) < 0) {
      fprintf(stderr, "%d: number of PEs must be a power of 2, 1<=npes<=1024\n",tp_npes);
      exit(1);
   }

   for (i=0; i<tp_npes; i++) {
     nodes[i]=0;
   }

   for (i=0; i<tp_npes; i++) {
      dst=0;
      dst=dst+(perm(wb,mask(i,wp,wb),mask(tp_cid,wp,wb))<<wp); 
      dst=dst+(perm(xb,mask(i,xp,xb),mask(tp_cid,xp,xb))<<xp);
      dst=dst+(perm(yb,mask(i,yp,yb),mask(tp_cid,yp,yb))<<yp);
      dst=dst+(perm(zb,mask(i,zp,zb),mask(tp_cid,zp,zb))<<zp); 
      tp_schedule[i]=dst;
      nodes[dst]=1;
    }
    
    ok=1;
    for (i=0; i<tp_npes; i++) {
        ok=ok && (nodes[i]!=0);
    }

    if (!ok) {
       fprintf(stderr,"Fast transpose, internal schedule error\n");
       return(1);
    } else {
       tp_init=1;
       return(0);
    }        
}

static void tp_doitr(b,a,i,j,LR,NM) 
    double b[],a[];
    int i,j,LR,NM;    
{     
    int m,l;
    double r0,r1,r2,r3,r4,r5,r6,r7;

    j=j*(1<<LR);
    i=i*(1<<LR);
    if (LR<3) {
      for (l=0; l<(1<<LR); l++) {
        for (m=0; m<(1<<LR); m++) {
          r0=a[l*NM+j+m];
          b[m*NM+i+l]=r0;
        }
      }
    } else {
      for (l=0; l<(1<<LR); l++) {
        for (m=0; m<(1<<LR); m+=8) {
          r0=a[l*NM+j+m];
          r1=a[l*NM+j+m+1];
          r2=a[l*NM+j+m+2];
          r3=a[l*NM+j+m+3];
          r4=a[l*NM+j+m+4];
          r5=a[l*NM+j+m+5];
          r6=a[l*NM+j+m+6];
          r7=a[l*NM+j+m+7];
          b[m*NM+i+l]=r0;
          b[(m+1)*NM+i+l]=r1;
          b[(m+2)*NM+i+l]=r2;
          b[(m+3)*NM+i+l]=r3;
          b[(m+4)*NM+i+l]=r4;
          b[(m+5)*NM+i+l]=r5;
          b[(m+6)*NM+i+l]=r6;
          b[(m+7)*NM+i+l]=r7;
        }
      }
    }
}

static void tp_doitc(b,a,i,j,LR,NM) 
    doublecomplex b[],a[];
    int i,j,LR,NM;    
{     
    int m,l;
    double r0,r1,r2,r3,r4,r5,r6,r7;

    j=j*(1<<LR);
    i=i*(1<<LR);
    if (LR<2) {
      for (l=0; l<(1<<LR); l++) {
        for (m=0; m<(1<<LR); m++) {
          r0=a[l*NM+j+m].r;
          r1=a[l*NM+j+m].i;
          b[m*NM+i+l].r=r0;
          b[m*NM+i+l].i=r1;
        }
      }
    } else {
      for (l=0; l<(1<<LR); l++) {
        for (m=0; m<(1<<LR); m+=4) {
          r0=a[l*NM+j+m].r;
          r1=a[l*NM+j+m].i;
          r2=a[l*NM+j+m+1].r;
          r3=a[l*NM+j+m+1].i;
          r4=a[l*NM+j+m+2].r;
          r5=a[l*NM+j+m+2].i;
          r6=a[l*NM+j+m+3].r;
          r7=a[l*NM+j+m+3].i;
          b[m*NM+i+l].r=r0;
          b[m*NM+i+l].i=r1;
          b[(m+1)*NM+i+l].r=r2;
          b[(m+1)*NM+i+l].i=r3;
          b[(m+2)*NM+i+l].r=r4;
          b[(m+2)*NM+i+l].i=r5;
          b[(m+3)*NM+i+l].r=r6;
          b[(m+3)*NM+i+l].i=r7;
        }
      }
    }
}

int tp_transpose_complex(b,a,LN,NM) 
    doublecomplex b[],a[];
    int LN,NM;
{
    int i,j,j0,l,m,m0,o,done,dummy;
    int P,LR,LP;
    double *b0;

    if (!tp_init) {
       fprintf(stderr,"Fast transpose, must initalize first\n");
       return(1);
    }
    P=tp_npes;
    LP=tp_lbpes;
    LR=LN-LP;
    i=tp_cid;
    if (tp_mode == tp_plain) {
      barrier();
      for (j=0; j<P; j++) {
        j0=tp_schedule[j];
        b0 = shmem_init(b,a,1,j0);
        tp_doitc(b0,a,i,j0,LR,NM);
      }
    } else {
      barrier();
      for (j=0; j<P; j++) {
        j0=tp_schedule[j];
        barrier();
        b0 = shmem_init(b,a,1,j0);
        tp_doitc(b0,a,i,j0,LR,NM);
      }
    }
    barrier();
    shmem_udcflush();
}

int tp_transpose_double(b,a,LN,NM) 
    double b[],a[];
    int LN,NM;
{
    int i,j,j0,l,m,m0,o,done,dummy;
    int P,LR,LP;
    double *b0;

    if (!tp_init) {
       fprintf(stderr,"Fast transpose, must initalize first\n");
       return(1);
    }
    P=tp_npes;
    LP=tp_lbpes;
    LR=LN-LP;
    i=tp_cid;
    barrier();
    if (tp_mode == tp_plain) {
      barrier();
      for (j=0; j<P; j++) {
        j0=tp_schedule[j];
        b0 = shmem_init(b,a,1,j0);
        tp_doitr(b0,a,i,j0,LR,NM);
      }
    } else {
      barrier();
      for (j=0; j<P; j++) {
        j0=tp_schedule[j];
        barrier();
        b0 = shmem_init(b,a,1,j0);
        tp_doitr(b0,a,i,j0,LR,NM);
      }
    }
    barrier();
    shmem_udcflush();
}






