/*
 * N x N complex 2 dimensional fast Fourier transform (2DFFT)
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include "tplib.h"

#define EPSILON 0.00001       /* absolute tolerance when comparing computed FFT values against the expected +-N */
#define PI 3.14159265358979   /* pi, i.e. 4*atan(1.0); not referenced in the visible part of this file */

/*
 * Build-time configuration keyed on NPES (expected to be supplied on the
 * compiler command line).  Each supported value selects:
 *   P    - the processor count the binary is built for
 *   MAXN - the largest FFT size exercised for that processor count
 * An unsupported NPES leaves both macros undefined.
 */
#if NPES==1
#define P 1
#define MAXN 1024
#elif NPES==2
#define P 2
#define MAXN 1024
#elif NPES==4
#define P 4
#define MAXN 1024
#elif NPES==8
#define P 8
#define MAXN 2048
#elif NPES==16
#define P 16
#define MAXN 2048
#elif NPES==32
#define P 32
#define MAXN 4096
#elif NPES==64
#define P 64
#define MAXN 4096
#elif NPES==128
#define P 128
#define MAXN 8192
#elif NPES==256
#define P 256
#define MAXN 8192
#elif NPES==512
#define P 512
#define MAXN 8192
#endif

/*
 * Derived dimensions.
 *   NM    - largest FFT size (alias of MAXN)
 *   NMDIM - declared row length of the a/b matrices: NM plus 4 extra
 *           elements.  Parenthesized so it expands safely inside larger
 *           expressions such as 2*NMDIM.
 *   RM    - number of matrix rows held by each processor
 */
#define NM MAXN
#define NMDIM (NM+4)
#define RM (NM/P)

/*
 * File-scope working storage for the benchmark.
 *
 * The pragma below is Cray-specific (_CRI) and names a, b, table and work;
 * presumably it requests cache-line alignment for them -- confirm against
 * the compiler documentation.
 *
 * fill and fill2 are not referenced anywhere in the visible code; they look
 * like padding placed between the large arrays -- confirm before removing
 * or reordering any of these declarations.
 */
#pragma _CRI cache_align a,b,table,work

static doublecomplex a[RM][NMDIM];     /* input matrix: RM local rows, NMDIM elements per row */
static double fill[4];
static doublecomplex b[RM][NMDIM];     /* output matrix (transpose target), same shape as a */
static double fill2[4];
static double table[2*NM+16];    /* twiddle factors, handed to CCFFT as its table argument */
static double work[4*NM+16];     /* scratch storage; presumably meant as CCFFT's work argument -- confirm sizing */
static long isign;               /* CCFFT mode/direction flag: main uses 0 for the setup call, 1 for the transforms */
static long isys;                /* passed by address to CCFFT; always 0 here */
static long n;                   /* current FFT length, passed by address to CCFFT */
static double scale;             /* scale factor passed to CCFFT; always 1.0 here */
static int mypid;                /* this processor's id, obtained from _my_pe() */

main(argc,argv)
int argc;
char **argv;
{
  int nold;        /* FFT size NOW fixed at N*/
  int errors,sign; /* used for error checking */
  int flops;       /* total number of floating point ops */
  float mflops;    /* Mflops/s */
  float sflops;    /* scalar Mflops/s */
  float mbytes;    /* Mbytes/s */
  double fsecs;    /* time spent doing 1D FFT's */

  double tsecs;    /* time spent doing the transpose */
  double secs;     /* total time of the 2D FFT */
  int i,j;         /* index variables */
  int N,R,LR,LN,LP;
  int npes;
  long t0;

  /* get the FFT size and ensure that it's in range */

  mypid=_my_pe();
  npes=_num_pes();

  if (npes!=P) {
     printf("NPES expected %d, pe's alloc %d\n",P,npes);
     exit(1);
  }

  tp_transpose_init();
/*  tp_transpose_mode(tp_plain); */
  
  for (N=(P<4) ? 8 : P; N<=NM; N=N*2) {
    R=N/P;

  if ((LN=tp_lb(N)) < 0) {
    (void)fprintf(stdout, "%ld %ld: fft size must be a power of 2\n",N,LN);
    exit(0);
  }

  if ((LR=tp_lb(R)) < 0) {
    (void)fprintf(stdout, "%ld %ld: fft block must be a power of 2\n",R,LR);
    exit(0);
  }

  if ((LP=tp_lb(P)) < 0) {
    (void)fprintf(stdout, "%ld %ld: number of proc must be a power of 2\n",P,LP);
    exit(0);
  }

  /* initialize the input matrix with a centered point source */
    for (i=0; i<R; i++)
      for (j=0; j<N; j++) 
        a[i][j].r = a[i][j].i = 0.0;
    if (N/2/R==mypid) a[N/2%R][N/2].r =  a[N/2%R][N/2].i = (float)N; 
  
  isign=0;
  scale=1.0;
  isys=0;
  n=N;
  CCFFT(&isign,&n,&scale,&a[0][0],&a[0][0],table,&table[2*N+16],&isys);
  isign=1;

  /* 
   * now do the actual 2D FFT
   */

  barrier();

  /* first do a set of row FFTs */
  t0=rtclock();
    for (i=0; i<R; i++)
       CCFFT(&isign,&n,&scale,&a[i][0],&a[i][0],table,&table[2*N+16],&isys);
  fsecs = (float) (rtclock()-t0)/150000000.0;
  /* then transpose the matrix */

  barrier();  

  /* then do transpose */
  t0=rtclock();
#if P>1
  tp_transpose_complex(b,a,LN,NMDIM); 
#else
  for (i=0; i<N; i++)
    for (j=0; j<N; j++)
      b[j][i]=a[i][j];
#endif
  tsecs = (float) (rtclock()-t0)/150000000.0;

  barrier();  

  /* then do another set of row FFTs */
  t0=rtclock();
    for (i=0; i<R; i++)
       CCFFT(&isign,&n,&scale,&b[i][0],&b[i][0],table,&table[2*N],&isys);

  fsecs+= (float) (rtclock()-t0)/150000000.0;

  barrier();

  /* and transpose again*/

  t0=rtclock();
#if P>1
  tp_transpose_complex(a,b,LN,NMDIM); 
#else
  for (i=0; i<N; i++)
    for (j=0; j<N; j++)
      a[j][i]=b[i][j];
#endif
  tsecs+= (float) (rtclock()-t0)/150000000.0;
  t0=rtclock();

  /* check the answers for an alternating sequence of (+-N,+-N) */
  errors = 0;
  for (i=0; i<R; i++) {
    if (((i+mypid*R+1)/2)*2 == (i+mypid*R)) 
      sign = 1;
    else
      sign = -1;
    for (j=0; j<N; j++) {
      if (a[i][j].r > N*sign+EPSILON ||
	  a[i][j].r < N*sign-EPSILON ||
	  a[i][j].i > N*sign+EPSILON ||
	  a[i][j].i < N*sign-EPSILON) {
	errors++;
      }
      sign *= -1;
    }
  }
  if (errors) { 
    printf("%d errors!!!!!\n", errors);
    exit(0);
  }

  /* summarize the 2d FFT performance */
  if (mypid==P-1) {
  /* summarize the 2d FFT performance */
  printf("%d x %d 2D FFT, umsg, %d processors\n", N, N, P);
  secs = fsecs + tsecs;
  flops = (N*N*LN)*10/P;
  mflops = ((float)flops/1000000.0);
  mflops = mflops/(float)secs;
  sflops = ((float)flops/1000000.0);
  sflops = sflops/(float)fsecs;
  mbytes = 2*N*N*sizeof(doublecomplex) /(float)tsecs/1.0E06/P;
  printf("(pe%d)\n",pvm_get_PE(pvm_mytid()));
  printf(".1D FFTs   : %10.6f secs (%2d%%)\n", fsecs, (int)(fsecs/secs*100));
  printf(".transpose : %10.6f secs (%2d%%)\n", tsecs, (int)(tsecs/secs*100));
  printf("total      : %10.6f secs\n", secs);
  printf("\n");
  printf("%9.6f Mflop/s %9.6f Sflop/s %9.6f MBytes/sec\n", mflops,sflops,mbytes);
  printf("\n");
}
  }
  exit(0);
}






