c c fftc3.fx - 3D Fast Fourier Transform c c Comments and suggestions to Dave O'Hallaron, droh@cs.cmu.edu cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc c Copyright (c) 1994 by Carnegie Mellon University c c Permission to use, copy, modify, and distribute this software c for any purpose and without fee is hereby granted, provided that c the above copyright notice appear in all copies. cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc #define N 64 #define NDV2 N/2 #define LOGN 6 #define ITERS 20 program fftc3 complex a(N,N,N) ! Input/Intermediate array complex b(N,N,N) ! Intermediate/Output array c precomputed constants integer brt(N) ! bit reverse table for 1d fft's complex w(N/2) ! twiddles for 1d fft's c timing variables integer timer_stop external timer_stop integer t1clks,t2clks,t3clks,f1clks,f2clks,f3clks c misc integer k c c precompute the input array and fft constants c template t(N) align a(i,j,k) with t(k) align b(i,j,k) with t(k) distribute t(block(64)) call fx_sync() call gen_bit_reverse_table(brt) call gen_w_table(w) c c begin the 3d fft program c t1clks = 0 t2clks = 0 t3clks = 0 f1clks = 0 f2clks = 0 f3clks = 0 do k=1,ITERS call dgen(a) ! generate input data ! a(row, col, plane) call timer_start() call cffts(a,brt,w) ! step 1: Column FFT (Column) f1clks = f1clks + timer_stop() call timer_start() ! step 2: Transpose idxperm(b,3,1,2) = idxperm(idxperm(a,2,1,3),3,1,2) t1clks = t1clks + timer_stop() ! b(col, row, plane) call timer_start() call cffts(b,brt,w) ! step 3: Column FFT (Row) f2clks = f2clks + timer_stop() call timer_start() ! step 4: Transpose idxperm(a,3,1,2) = idxperm(idxperm(b,3,1,2),3,1,2) t2clks = t2clks + timer_stop() ! a(plane, col, row) call timer_start() call cffts(a,brt,w) ! step 5: Column FFT (Plane) f3clks = f3clks + timer_stop() call timer_start() ! step 6: Transpose idxperm(b,3,1,2) = idxperm(idxperm(a,3,2,1),3,1,2) t3clks = t3clks + timer_stop() ! b(row, col, plane) call chkmat(b) ! check the results enddo t1clks = t1clks/ITERS t2clks = t2clks/ITERS t3clks = t3clks/ITERS f1clks = f1clks/ITERS f2clks = f2clks/ITERS f3clks = f3clks/ITERS call prperf(t1clks,t2clks,t3clks,f1clks,f2clks,f3clks) ! Print Performance Results end c c dgen - initialize the vector with a point source c subroutine dgen(a) complex a(N,N,N), cmplx intrinsic cmplx template t(N) align a(i,j,k) with t(k) distribute t(block(64)) a = cmplx(0.,0.) a(NDV2+1,NDV2+1,NDV2+1) = cmplx(N,N) end c c gen_bit_reverse_table - initialize bit reverse table c c Postcondition: br(i) = bit-reverse(i-1) + 1 c subroutine gen_bit_reverse_table(brt) integer brt(N) integer i, j, k j = 1 brt(1) = j do i = 2, N k = NDV2 6 continue if (k .lt. j) then j = j - k k = k/2 goto 6 endif j = j + k brt(i) = j enddo return end c c gen_w_table: generate powers of w. c c postcondition: w(i) = w**(i-1) c subroutine gen_w_table(w) complex w(NDV2), cmplx integer i real pi real wr,wi real ptr, pti real aimag, real, cos, sin intrinsic aimag, real, cos, sin, cmplx pi = 3.141592653589793 wr = cos(pi/NDV2) wi = sin(pi/NDV2) w(1) = cmplx(1.0,0.0) ptr = 1.0 pti = 0.0 do i = 2,NDV2 w(i) = cmplx(ptr*wr - pti*wi, ptr*wi + pti*wr) ptr = real(w(i)) pti = aimag(w(i)) enddo return end c c cffts - perform a 1d fft on each column of each plane c subroutine cffts(a,brt,W) integer brt(N) complex a(N,N,N),w(NDV2) integer j,k template t(N) align a(i,j,k) with t(k) distribute t(block(64)) pdo k=1,N pin a(:,:,k) pout a(:,:,k) do j=1,N call fft(a(1,j,k),brt,w) enddo endpdo return end c c Fast Fourier Transform c 1D in-place complex-complex decimation-in-time Cooley-Tukey c subroutine fft(a,brt,W) real a(2,N), W(2,NDV2) integer brt(N) integer i,j integer powerOfW integer sPowerOfW integer ijDiff integer stage integer stride integer first real pwr,pwi, tr, ti real ir, ii, jr, ji nocheck c c bit reverse step c do i = 1,N j = brt(i) if (i .lt. j) then tr = a(1,j) ti = a(2,j) a(1,j) = a(1,i) a(2,j) = a(2,i) a(1,i) = tr a(2,i) = ti endif enddo c c butterfly computations c ijDiff = 1 stride = 2 sPowerOfW = NDV2 do stage = 1,LOGN c Invariant: stride = 2 ** stage c Invariant: ijDiff = 2 ** (stage - 1) first = 1 do powerOfW = 1, NDV2, sPowerOfW pwr = W(1,powerOfW) pwi = W(2,powerOfW) c Invariant: pwr + sqrt(-1)*pwi = W**(powerOfW - 1) do i = first, N, stride j = i + ijDiff jr = a(1,j) ji = a(2,j) ir = a(1,i) ii = a(2,i) tr = jr*pwr - ji*pwi ti = jr*pwi + ji*pwr a(1,j) = ir - tr a(2,j) = ii - ti a(1,i) = ir + tr a(2,i) = ii + ti enddo first = first + 1 enddo ijDiff = stride stride = stride * 2 sPowerOfW = sPowerOfW / 2 enddo return end c c chkmat - check the output matrix for correctness c subroutine chkmat(a) complex a(N,N,N) integer i,j,k integer errors, sign real epsilon, aimag, real intrinsic aimag, real template t(N) align a(i,j,k) with t(k) distribute t(block(64)) epsilon = 0.0001 errors = 0 pdo k=1,N pin a(:,:,k) do j=1,N sign = (-1) if ( ((j+k)/2)*2 .eq. (j+k) ) sign = 1 do i=1,N if (real(a(i,j,k)) .gt. N*sign+epsilon) errors = errors + 1 if (real(a(i,j,k)) .lt. N*sign-epsilon) errors = errors + 1 if (aimag(a(i,j,k)) .gt. N*sign+epsilon) errors = errors + 1 if (aimag(a(i,j,k)) .lt. N*sign-epsilon) errors = errors + 1 sign = sign * (-1) enddo enddo endpdo if (errors .gt. 0) then print *, 'errors = ', errors endif end c c prperf - print FFT performance c subroutine prperf(t1clks,t2clks,t3clks,f1clks,f2clks,f3clks) integer t1clks,t2clks,t3clks,f1clks,f2clks,f3clks real secs,tsecs,fsecs,flops,mflops,eff integer tpercent,fpercent tsecs = (t1clks+t2clks+t3clks)/2.e7 fsecs = (f1clks+f2clks+f3clks)/2.e7 secs = tsecs+fsecs tpercent = (tsecs/secs)*100 fpercent = (fsecs/secs)*100 flops = N*N*N*(3*LOGN)*5 mflops = (flops/1000000.0)/secs eff = (fsecs/secs)*100 print *, '++++++++++++++++++++++++++++++++++' print *, '3D FFT: ', N, ' X ', N, ' X ', N, ' Matrix' print *, '++++++++++++++++++++++++++++++++++' print *, " " print *, 'Step 1: fft : ',f1clks/2.e7, 'sec ' print *, 'Step 2: transpose : ',t1clks/2.e7, 'sec ' print *, 'Step 3: fft : ',f2clks/2.e7, 'sec ' print *, 'Step 4: transpose : ',t2clks/2.e7, 'sec ' print *, 'Step 5: fft : ',f3clks/2.e7, 'sec ' print *, 'Step 6: transpose : ',t3clks/2.e7, 'sec ' print *, " " print *, 'total transpose : ',tsecs, 'sec ', tpercent, '%' print *, 'total ffts : ',fsecs, 'sec ', fpercent, '%' print *, 'total(s) : ',secs, 'sec' print *, " " print *, 'mflop/s : ',mflops print *, 'efficiency : ',eff print *, " " end