c c fftc1.fx - 1D fast Fourier transform c c Bailey's Six-Step Method, adapted from D. Bailey, c "FFTs in External or Hierarchical Memory", c The Journal of Supercomputing, 4, 23-35 (1990). c c Comments and suggestions to Dave O'Hallaron, droh@cs.cmu.edu cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc c Copyright (c) 1994 by Carnegie Mellon University c c Permission to use, copy, modify, and distribute this software c for any purpose and without fee is hereby granted, provided that c the above copyright notice appear in all copies. cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc #define N 512 #define NDV2 N/2 #define LOGN 9 #define ITERS 20 program fftc1 c array of length N1*N2 is represented by a 2D array of size N1 by N2 complex a(N,N) ! Input/Intermediate array complex b(N,N) ! Intermediate/Output arrary c precomputed constants integer brt(N) ! bit reverse table for 1d fft's complex w(N/2) ! twiddles for 1d fft's complex v(N,N) ! twiddles for 2d fft (Scale Factor) c timing variables integer timer_stop external timer_stop integer t1clocks,t2clocks,t3clocks,s1clocks,f1clocks,f2clocks c misc integer k c c precompute the input array and fft constants c template t(N) align a(j,k) with t(k) align b(j,k) with t(k) align v(j,k) with t(k) distribute t(block(64)) call fx_sync() call gen_bit_reverse_table(brt) call gen_w_table(w) call gen_v_table(v) c c begin the 1d fft program c t1clocks = 0 t2clocks = 0 t3clocks = 0 s1clocks = 0 f1clocks = 0 f2clocks = 0 do k=1,ITERS call dgen(a) ! Generate input Data call timer_start() idxperm(b,2,1) = a ! step 1: Transpose t1clocks = t1clocks + timer_stop() call timer_start() call cffts(b,brt,w) ! step 2: Column FFT f1clocks = f1clocks + timer_stop() call timer_start() call scale(b,v) ! step 3: Scale s1clocks = s1clocks + timer_stop() call timer_start() idxperm(a,2,1) = b ! step 4: Transpose t2clocks = t2clocks + timer_stop() call timer_start() call cffts(a,brt,w) ! step 5: Column FFT f2clocks = f2clocks + timer_stop() call timer_start() idxperm(b,2,1) = a ! step 6: Transpose t3clocks = t3clocks + timer_stop() call chkmat(b) ! check the results enddo t1clocks = t1clocks/ITERS t2clocks = t2clocks/ITERS t3clocks = t3clocks/ITERS s1clocks = s1clocks/ITERS f1clocks = f1clocks/ITERS f2clocks = f2clocks/ITERS call prperf(t1clocks,t2clocks,t3clocks,s1clocks,f1clocks,f2clocks) ! Print Results end c c dgen - initialize the vector with a point source c subroutine dgen(a) complex a(N,N), cmplx intrinsic cmplx template t(N) align a(j,k) with t(k) distribute t(block(64)) a = cmplx(0.,0.) a(1,NDV2+1) = cmplx(N*N,N*N) end c c gen_bit_reverse_table - initialize bit reverse table c c Postcondition: br(i) = bit-reverse(i-1) + 1 c subroutine gen_bit_reverse_table(brt) integer brt(N) integer i, j, k j = 1 brt(1) = j do i = 2, N k = NDV2 6 continue if (k .lt. j) then j = j - k k = k/2 goto 6 endif j = j + k brt(i) = j enddo return end c c gen_w_table: generate powers of w. c c postcondition: w(i) = w**(i-1) c subroutine gen_w_table(w) complex w(NDV2), cmplx integer i real pi real wr,wi real ptr, pti real aimag, real, cos, sin intrinsic aimag, real, cos, sin, cmplx pi = 3.141592653589793 wr = cos(pi/NDV2) wi = -sin(pi/NDV2) w(1) = cmplx(1.0,0.0) ptr = 1.0 pti = 0.0 do i = 2,NDV2 w(i) = cmplx(ptr*wr - pti*wi, ptr*wi + pti*wr) ptr = real(w(i)) pti = aimag(w(i)) enddo return end c c gen_v_table - gen 2d twiddle factors c subroutine gen_v_table(v) complex v(N,N), wn, cmplx real pi integer k,j real cos, sin intrinsic cos, sin, cmplx template t(N) align v(j,k) with t(k) distribute t(block(64)) pi = 3.141592653589793 wn = cmplx(cos((2.0*pi)/(N*N)),-sin((2.0*pi)/(N*N))) pdo k = 1,N pin v(:,k) pout v(:,k) do j = 1,N v(j,k) = cmplx(wn**((j-1)*(k-1))) enddo endpdo end c c scale - multiply each array element by corresponding element of v c subroutine scale(a,v) complex a(N,N),v(N,N) integer j,k template t(N) align a(j,k) with t(k) align v(j,k) with t(k) distribute t(block(64)) pdo k = 1,N pin a(:,k), v(:,k) pout a(:,k) do j = 1,N a(j,k) = a(j,k)*v(j,k) enddo endpdo end c c cffts - perform a 1d fft on each column c subroutine cffts(a,brt,W) integer brt(N) complex a(N,N),w(NDV2) integer col template t(N) align a(j,k) with t(k) distribute t(block(64)) pdo col=1,N pin a(:,col) pout a(:,col) call fft(a(1,col),brt,w) endpdo return end c c Fast Fourier Transform c 1D in-place complex-complex decimation-in-time Cooley-Tukey c subroutine fft(a,brt,W) real a(2,N), W(2,NDV2) integer brt(N) integer i,j integer powerOfW integer sPowerOfW integer ijDiff integer stage integer stride integer first real pwr,pwi, tr, ti real ir, ii, jr, ji nocheck c c bit reverse step c do i = 1,N j = brt(i) if (i .lt. j) then tr = a(1,j) ti = a(2,j) a(1,j) = a(1,i) a(2,j) = a(2,i) a(1,i) = tr a(2,i) = ti endif enddo c c butterfly computations c ijDiff = 1 stride = 2 sPowerOfW = NDV2 do stage = 1,LOGN c Invariant: stride = 2 ** stage c Invariant: ijDiff = 2 ** (stage - 1) first = 1 do powerOfW = 1, NDV2, sPowerOfW pwr = W(1,powerOfW) pwi = W(2,powerOfW) c Invariant: pwr + sqrt(-1)*pwi = W**(powerOfW - 1) do i = first, N, stride j = i + ijDiff jr = a(1,j) ji = a(2,j) ir = a(1,i) ii = a(2,i) tr = jr*pwr - ji*pwi ti = jr*pwi + ji*pwr a(1,j) = ir - tr a(2,j) = ii - ti a(1,i) = ir + tr a(2,i) = ii + ti enddo first = first + 1 enddo ijDiff = stride stride = stride * 2 sPowerOfW = sPowerOfW / 2 enddo return end c c chkmat - check the output matrix for correctness c subroutine chkmat(a) complex a(N,N) integer j,k integer sign,errors real epsilon, aimag, real intrinsic aimag, real template t(N) align a(j,k) with t(k) distribute t(block(64)) epsilon = 0.0001 errors = 0 pdo k=1,N pin a(:,k) sign = 1 do j=1,N if (real(a(j,k)) .gt. N*N*sign+epsilon) errors = errors + 1 if (real(a(j,k)) .lt. N*N*sign-epsilon) errors = errors + 1 if (aimag(a(j,k)) .gt. N*N*sign+epsilon) errors = errors + 1 if (aimag(a(j,k)) .lt. N*N*sign-epsilon) errors = errors + 1 sign = sign * (-1) enddo endpdo if (errors .gt. 0) then print *, 'errors = ', errors endif end c c prperf - print FFT performance c subroutine prperf(t1clks,t2clks,t3clks,s1clks,f1clks,f2clks) integer t1clks,t2clks,t3clks,s1clks,f1clks,f2clks real tsecs,ssecs,fsecs,secs,flops,mflops,eff integer tpercent,spercent,fpercent tsecs = (t1clks+t2clks+t3clks)/2.e7 ssecs = s1clks/2.e7 fsecs = (f1clks+f2clks)/2.e7 secs = tsecs+ssecs+fsecs tpercent = (tsecs/secs)*100 spercent = (ssecs/secs)*100 fpercent = (fsecs/secs)*100 flops = N*N*(2*LOGN)*5 mflops = (flops/1000000.0)/secs eff = (fsecs/secs)*100 print *, '+++++++++++++++++++++++' print *, '1D FFT: Points = ', N*N print *, '+++++++++++++++++++++++' print *, 'Time per input vector:' print *, " " print *, 'Step 1: transpose : ',t1clks/2.e7, 'sec ' print *, 'Step 2: fft : ',f1clks/2.e7, 'sec ' print *, 'Step 3: scale : ',s1clks/2.e7, 'sec ' print *, 'Step 4: transpose : ',t2clks/2.e7, 'sec ' print *, 'Step 5: fft : ',f2clks/2.e7, 'sec ' print *, 'Step 6: transpose : ',t3clks/2.e7, 'sec ' print *, " " print *, 'total transpose : ',tsecs, 'sec ', tpercent, '%' print *, 'total scale : ',ssecs, 'sec ', spercent, '%' print *, 'total ffts : ',fsecs, 'sec ', fpercent, '%' print *, 'total(s) : ',secs, 'sec' print *, " " print *, 'mflop/s : ',mflops print *, 'efficiency : ',eff print *, " " end