c
c     Fx block matrix multiply program 
c
c     Performs a sequence of matrix multiplications c = c + a*b.
c     Each iteration, it checks the results for correctness and
c     prints a message if there are any problems. At the end, 
c     it reports performance in stdout0000.
c     
      program main
c
c     Benchmark driver.  Repeats the block matrix multiply
c     c = c + a*b iters times.  For every column chunk of b, the
c     copy of the chunk into v and the parallel compute loop are
c     timed separately; the per-iteration averages and a mflops
c     figure are printed at the end.
c
c     m, n, chunks and iters are not declared in this unit; they
c     are expected to be supplied by mm.h.
c
      include 'mm.h'
      real a(m,n),b(n,m),c(m,m)
c     v receives a column chunk of b and is indexed by k=1,n below,
c     so its first extent must be b's row count n.
      real v(n,(m/chunks))

      integer i, j, k, jp, iter
      integer pclocks, cclocks
      real secs, mflops, flops, efficiency, real
c     divisor used to turn timer clock counts into seconds
      real clkhz
      parameter (clkhz = 20000000.0)

      external timer_stop
      integer timer_stop

      template t(m)
      align a(i,j) with t(i)
      align b(i,j) with t(i)
      align c(i,j) with t(i)
      distribute t(block(64))

      call fx_sync()

c
c     main loop
c
c     pclocks accumulates clocks spent in the compute loop and
c     cclocks the clocks spent copying chunks of b into v.
c     NOTE(review): both are default integers summed over all
c     iterations; confirm they cannot overflow for long runs.
c
      pclocks = 0
      cclocks = 0

      do iter=1,iters
         call inita(a)
         call initb(b,5.0)
         c = 0.0

c        NOTE(review): m/chunks looks like integer division; if
c        chunks does not divide m the trailing columns of c are
c        never computed -- confirm against mm.h.
         do jp=0,chunks-1
c           copy column chunk jp of b into v (counted in cclocks)
            call timer_start()
            v=b(:,(m/chunks)*jp+1:(m/chunks)*(jp+1))
            cclocks = cclocks + timer_stop()
c           add a*v into the matching columns of c, one row of c
c           per pdo iteration (counted in pclocks)
            call timer_start()
            pdo i=1,m
            pin a(i,:),c(i,:)
            pout c(i,:)
               do j=(m/chunks)*jp+1 , (m/chunks)*(jp+1)
                  do k=1,n 
                     c(i,j) = c(i,j)+a(i,k)*v(k,j-(m/chunks)*jp)
                  enddo 
               enddo
            endpdo
            pclocks = pclocks + timer_stop()
         enddo
         call verify(c,5.0)
      enddo

c
c     print performance results
c
c     average the accumulated clock counts over the iterations
      pclocks = pclocks/iters
      cclocks = cclocks/iters
      efficiency = (real(pclocks)/real(pclocks+cclocks))*100.0
      print *, 'comp=', pclocks,' comm=', cclocks, 'efficiency= ', 
     $     efficiency
      print *,''

      secs = pclocks+cclocks
      secs = secs/clkhz
c     One multiply and one add for each (i,j,k) with i,j in 1..m
c     and k in 1..n.  The product is formed in double precision
c     because an integer m*m*n overflows for large problem sizes.
      flops = 2.0d0*dble(m)*dble(m)*dble(n)
      mflops = flops/1000000.0
      mflops = mflops/secs
      print *, 'secs  =', secs
      print *, 'flops =', flops
      print *, 'mflops=', mflops

      end
      
c
c     check results
c
      subroutine verify (c,f)
c
c     Counts the entries of c that differ from i*j*f and prints
c     that count when it is nonzero; prints nothing otherwise.
c
c     c  (in) m x m result matrix, rows aligned with template t
c     f  (in) factor in the expected value i*j*f
c
      include 'mm.h'
      real f
      integer i,j
      integer count
      real c(m,m)
      template t(m)
      align c(i,j) with t(i)
      distribute t(block(64))

      count = 0
c     c has only m columns, so the column bound is clamped to
c     min(m,n) to keep every read inside the declared shape.
c     NOTE(review): the comparison is an exact .ne. on reals;
c     confirm the expected products are exactly representable.
c     NOTE(review): count is updated inside the pdo without
c     appearing in a pout clause; confirm the Fx semantics make
c     the updates visible after endpdo.
      pdo i=1,m
      pin c(i,:)
         do j=1,min(m,n)
            if (c(i,j).ne.(i*j*f)) then
               count = count + 1
            endif
         enddo
      endpdo

      if (count .ne. 0) print *,count
      end
      
c
c     initialize the input matrices a and b
c      
      subroutine inita(a)
c
c     Fills the matrix a so that a(i,j) holds the product i*j
c     as a real.  Each pdo iteration writes one row of a.
c
c     a  m x n matrix, rows aligned with template t; every
c        element is overwritten
c
      include 'mm.h'
      real a(m,n)
      integer i,j

      template t(m)
      align a(i,j) with t(i)
      distribute t(block(64))

      pdo i=1,m
      pin a(i,:)
      pout a(i,:)
         do j=1,n
            a(i,j) = real(i*j)
         enddo
      endpdo
      end
      
      subroutine initb(b,f)
c
c     Sets b to zero everywhere except the diagonal, where
c     b(i,i) = f.
c
c     b  n x m matrix, declared with the same shape as the
c        actual argument in main; every element is overwritten
c     f  (in) value stored on the diagonal
c
      include 'mm.h'
      real f, b(n,m)
c     j is kept because the align directive below names it
      integer i,j

      template t(m)
      align b(i,j) with t(i)
      distribute t(block(64))

      b=0.0

c     Only the diagonal entries differ from zero, so visit them
c     directly instead of scanning every (i,j) pair.
      do i=1,min(m,n)
         b(i,i)=f
      enddo
      end
