c
c     Fx stencil program
c
c     Illustrates the use of overlapped inputs in the PDO.
c
      program main
c
c     Stencil benchmark driver.  For each of iters rounds the grid A
c     is refilled with 1.0, one five-point relaxation step is applied
c     to the interior rows inside a timed region, and the values
c     returned by timer_stop are accumulated in mytime.  After the
c     last round the interior of A is compared against 1.25, the
c     number of mismatches is printed, and prperf reports the
c     performance.
c
c     NOTE(review): t is declared here as the integer round counter
c     and is also the name of the template below - confirm that Fx
c     keeps template names in a separate namespace.
      integer iters,nx,ny,t
      parameter (iters=10)
      parameter (nx=512)
      parameter (ny=512)
      real A(nx, ny)
c     stencil weights: c1/c2 for the x-1/x+1 neighbours, c3/c4 for
c     the y-1/y+1 neighbours, c5 for the centre point
      real c1, c2, c3, c4, c5
c     running sum of the values returned by timer_stop
      integer mytime
      integer x, y, i, j, errors
      external timer_stop
      integer timer_stop
c     NOTE(review): left and right are only used in the pmerge clause
c     on the integer errors, yet they are typed real here - confirm
c     whether Fx needs (or ignores) this declaration.
      real left,right

c     Data layout statements: the first dimension of A is aligned
c     with the one-dimensional template t, which is distributed in
c     blocks (presumably 64 consecutive template elements per block -
c     confirm against the Fx documentation).
      template t(nx)
      distribute t(block(64))
      align a(i,j) with t(i)

c     Initialize the coefficients.  All five weights are .25, so a
c     point whose four neighbours and itself hold 1.0 becomes 1.25.
      c1 = .25
      c2 = .25
      c3 = .25
      c4 = .25
      c5 = .25

      mytime = 0
      do t = 1, iters

c     Reinitialize the input matrix (outside the timed region) so
c     that every round starts from the same all-ones grid.
c     pinout presumably declares row i as both input and output of
c     the iteration - confirm against the Fx documentation.
         pdo i = 1, nx
         pinout a(i,:)
            do j = 1, ny
               A(i,j) = 1.0
            enddo
         endpdo

c     The relaxation step.  Only the interior (rows 2..nx-1, columns
c     2..ny-1) is updated; boundary rows and columns keep 1.0.
c     Each iteration names rows x-1..x+1 in its pin clause (these
c     are the overlapped inputs) and row x in its pout clause.
c     The result check below expects 1.25 at every interior point,
c     which requires every iteration to read the neighbour values
c     from before the step (presumably guaranteed by the pin/pout
c     copy semantics - confirm).
         call timer_start()
         pdo x = 2, nx-1
         pin A(x-1:x+1,:)
         pout A(x,:)
            A(x,2:ny-1) = c1*A(x-1, 2:ny-1) + c2 * A(x+1, 2:ny-1) +
     $           c3*A(x,1:ny-2) + c4*A(x, 3:ny) + c5*A(x,2:ny-1)
         endpdo
         mytime = mytime + timer_stop()
      enddo

c     Check results: count the interior points that differ from 1.25.
c     The exact comparison relies on .25, 1.0 and 1.25 all being
c     exactly representable.  errors is listed in pmvars, reset in
c     pinit, incremented in pbody and combined in pmerge, where the
c     two partial counts left(errors) and right(errors) are added
c     (presumably a pairwise reduction across iterations - confirm).
      pdo x = 2, nx-1
      pin a(x,:)
      pmvars errors
      pinit
         errors = 0
      pbody
         do y = 2, ny-1
            if (a(x,y) .ne. 1.25) errors = errors + 1
         enddo
      pmerge
         errors = left(errors) + right(errors)
      endpdo

c     Report the mismatch count and the accumulated timing.
      print *, "errors = ", errors      
      call prperf(mytime,nx,ny,iters)

      end 

c  
c     prperf - print performance
c
      subroutine prperf(clocks,nx,ny,iters)
c
c     Print the floating-point operation count, the elapsed time in
c     seconds and the resulting MFLOPS rate of the stencil benchmark.
c
c     clocks - elapsed time in timer ticks
c     nx, ny - extents of the grid; only the (nx-2)*(ny-2) interior
c              points are counted
c     iters  - number of timed relaxation steps
c
      integer clocks,nx,ny,iters
      real flops
      real secs,mflops
c     Timer ticks per second.
c     NOTE(review): 20 MHz is the rate the original code assumed -
c     confirm that it matches the timer behind timer_stop.
      real tickhz
      parameter (tickhz=20000000.0)
c     Operations per updated point: the five-point stencil performs
c     5 multiplies and 4 adds.
      real ptflop
      parameter (ptflop=9.0)

      secs = real(clocks)/tickhz
c     Convert every factor to real before multiplying.  The all-
c     integer product (nx-2)*(ny-2)*9*iters overflows a default
c     integer for large grids or iteration counts.
      flops = real(nx-2)*real(ny-2)*ptflop*real(iters)
      print *, flops, "flops"
      print *, secs, "seconds"
c     Guard against a zero (or negative) tick count so the rate is
c     never computed by dividing by zero.
      if (secs .gt. 0.0) then
         mflops = (flops/1000000.0)/secs
         print*, mflops, "mflops"
      else
         print *, "mflops not available: elapsed time is zero"
      endif
      end
