c c fftc2.fx - 2D fast Fourier transform c c Comments and suggestions to Dave O'Hallaron, droh@cs.cmu.edu cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc c Copyright (c) 1994 by Carnegie Mellon University c c Permission to use, copy, modify, and distribute this software c for any purpose and without fee is hereby granted, provided that c the above copyright notice appear in all copies. cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc #define N 512 #define NDV2 N/2 #define LOGN 9 #define ITERS 20 program fftc2 complex a(N,N) ! Input/Output array complex b(N,N) ! Intermediate array c precomputed constants integer brt(N) ! bit reverse table for 1d fft's complex w(N/2) ! twiddles for 1d fft's c timing variables integer timer_stop external timer_stop integer t1clocks,t2clocks,f1clocks,f2clocks c misc integer k c c precompute the input array and fft constants c template t(N) align a(j,k) with t(k) align b(j,k) with t(k) distribute t(block(64)) call fx_sync() call gen_bit_reverse_table(brt) call gen_w_table(w) c c begin the 2d fft program c t1clocks = 0 t2clocks = 0 f1clocks = 0 f2clocks = 0 do k=1,ITERS call dgen(a) ! generate input data call timer_start() call cffts(a,brt,w) ! step 1: Column FFT f1clocks = f1clocks + timer_stop() call timer_start() idxperm(b,2,1) = a ! step 2: Transpose t1clocks = t1clocks + timer_stop() call timer_start() call cffts(b,brt,w) ! step 3: Column FFT f2clocks = f2clocks + timer_stop() call timer_start() idxperm(a,2,1) = b ! step 4: Transpose t2clocks = t2clocks + timer_stop() call chkmat(a) ! check output data enddo t1clocks = t1clocks/ITERS t2clocks = t2clocks/ITERS f1clocks = f1clocks/ITERS f2clocks = f2clocks/ITERS call prperf(t1clocks,t2clocks,f1clocks,f1clocks) ! Print Performance Results end c c dgen - initialize the vector with a point source c subroutine dgen(a) complex a(N,N), cmplx intrinsic cmplx template t(N) align a(j,k) with t(k) distribute t(block(64)) a = cmplx(0.,0.) a(N/2+1,N/2+1) = cmplx(N,N) end c c gen_bit_reverse_table - initialize bit reverse table c c Postcondition: br(i) = bit-reverse(i-1) + 1 c subroutine gen_bit_reverse_table(brt) integer brt(N) integer i, j, k, nv2 nv2 = N/2 j = 1 brt(1) = j do i = 2, N k = nv2 6 continue if (k .lt. j) then j = j - k k = k/2 goto 6 endif j = j + k brt(i) = j enddo return end c c gen_w_table: generate powers of w. c c postcondition: w(i) = w**(i-1) c subroutine gen_w_table(w) complex w(NDV2), cmplx integer i real pi real wr,wi real ptr, pti real aimag, real, cos, sin intrinsic aimag, real, cos, sin, cmplx pi = 3.141592653589793 wr = cos(pi/NDV2) wi = sin(pi/NDV2) w(1) = cmplx(1.0,0.0) ptr = 1.0 pti = 0.0 do i = 2,NDV2 w(i) = cmplx(ptr*wr - pti*wi, ptr*wi + pti*wr) ptr = real(w(i)) pti = aimag(w(i)) enddo return end c c cffts - perform a 1d fft on each column c subroutine cffts(a,brt,W) integer brt(N) complex a(N,N),w(NDV2) integer col template t(N) align a(j,k) with t(k) distribute t(block(64)) pdo col=1,N pin a(:,col) pout a(:,col) call fft(a(1,col),brt,w) endpdo return end c c Fast Fourier Transform c 1D in-place complex-complex decimation-in-time Cooley-Tukey c subroutine fft(a, brt, W) real a(2,N), W(2,NDV2) integer brt(N) integer i,j integer powerOfW integer sPowerOfW integer ijDiff integer stage integer stride integer first real pwr,pwi, tr, ti real ir, ii, jr, ji nocheck c c bit reverse step c do i = 1,N j = brt(i) if (i .lt. j) then tr = a(1,j) ti = a(2,j) a(1,j) = a(1,i) a(2,j) = a(2,i) a(1,i) = tr a(2,i) = ti endif enddo c c butterfly computations c ijDiff = 1 stride = 2 sPowerOfW = NDV2 do stage = 1,LOGN c Invariant: stride = 2 ** stage c Invariant: ijDiff = 2 ** (stage - 1) first = 1 do powerOfW = 1, NDV2, sPowerOfW pwr = W(1,powerOfW) pwi = W(2,powerOfW) c Invariant: pwr + sqrt(-1)*pwi = W**(powerOfW - 1) do i = first, N, stride j = i + ijDiff jr = a(1,j) ji = a(2,j) ir = a(1,i) ii = a(2,i) tr = jr*pwr - ji*pwi ti = jr*pwi + ji*pwr a(1,j) = ir - tr a(2,j) = ii - ti a(1,i) = ir + tr a(2,i) = ii + ti enddo first = first + 1 enddo ijDiff = stride stride = stride * 2 sPowerOfW = sPowerOfW / 2 enddo return end c c chkmat - check the output matrix for correctness c subroutine chkmat(a) complex a(N,N) integer j,k integer sign,errors real epsilon, aimag, real intrinsic aimag, real template t(N) align a(j,k) with t(k) distribute t(block(64)) epsilon = 0.0001 errors = 0 pdo k=1,N pin a(:,k) sign = 1 if ( (k/2)*2 .eq. k ) sign = (-1) ! if k = even, sign = -1 do j=1,N if (real(a(j,k)) .gt. N*sign+epsilon) errors = errors + 1 if (real(a(j,k)) .lt. N*sign-epsilon) errors = errors + 1 if (aimag(a(j,k)) .gt. N*sign+epsilon) errors = errors + 1 if (aimag(a(j,k)) .lt. N*sign-epsilon) errors = errors + 1 sign = sign * (-1) enddo endpdo if (errors .gt. 0) then print *, 'errors = ', errors endif end c c prperf - print FFT performance c subroutine prperf(t1clocks,t2clocks,f1clocks,f2clocks) integer t1clocks,t2clocks,f1clocks,f2clocks real secs,tsecs,fsecs,flops,mflops,eff integer tpercent,fpercent tsecs = (t1clocks+t2clocks)/2.e7 fsecs = (f1clocks+f2clocks)/2.e7 secs = tsecs+fsecs tpercent = (tsecs/secs)*100 fpercent = (fsecs/secs)*100 flops = N*N*(2*LOGN)*5 mflops = (flops/1000000.0)/secs eff = (fsecs/secs)*100 print *, '+++++++++++++++++++++++++++++++' print *, '2D FFT: ', N, ' X ', N, ' Matrix' print *, '+++++++++++++++++++++++++++++++' print *, 'Time per input vector:' print *, " " print *, 'Step 1: fft : ',f1clocks/2.e7, 'sec ' print *, 'Step 2: transpose : ',t1clocks/2.e7, 'sec ' print *, 'Step 3: fft : ',f2clocks/2.e7, 'sec ' print *, 'Step 4: transpose : ',t2clocks/2.e7, 'sec ' print *, " " print *, 'total transpose : ',tsecs, 'sec ', tpercent, '%' print *, 'total ffts : ',fsecs, 'sec ', fpercent, '%' print *, 'total(s) : ',secs, 'sec' print *, " " print *, 'mflop/s : ',mflops print *, 'efficiency : ',eff print *, " " end