c c A data-parallel implementation of Okutomi/Kanade multiple baseline stereo c with data distributions and column-summing tricks borrowed from Jon c Webb's Adapt implementation. c c c #define NUMPROCS 64 #define ROWS 240 #define COLS 256 #define ISIZE 37 #define JSIZE 37 #define TARDIS 12 #define ITERS 10 #define MAXDISP 16 #define WINDOW 13 program datapar integer ref(ROWS+WINDOW,COLS) integer m1(ROWS+WINDOW,COLS) integer m2(ROWS+WINDOW,COLS) real dim(ROWS+WINDOW,COLS) template t1(ROWS+WINDOW) align ref(i,j), m1(i,j), m2(i,j), dim(i,j) with t1(i) distribute t1(block(NUMPROCS)) real cbe(ROWS,COLS) integer cbd(ROWS,COLS) c c The alignment and distribution looks fishy, but works fast. It simply c aligns cbe and cbd such that minimal redistribution will be done when c they are used in generrimg()'s PDOs. In general, for the iWarp Fx c compiler, it is a good idea to make sure that the blocksizes for c distributions in main should be equal for arrays that the PDO will c redistribute with overlapping. For example, 256/64=4 rows per proc and c 256+(WINDOW=13)/64=5 rows per proc, for the obvious distributions of c (cbe, cbd), and (ref,m1,m2,dim). However, the PDO in generrimg will c redistribute dim overlapped, so making cbe and cbd have similar distribution c from the start is a big win. c c template t2(ROWS+WINDOW) align cbe(i,j), cbd(i,j) with t2(i) distribute t2(block(NUMPROCS)) integer i, j, k external timer_start external timer_stop integer timer_stop integer count, curdisp integer errortime(16), difftime(16), totaltime integer getdatatime integer alltime call timer_start() do count=1,ITERS call getdata(ref,m1,m2) cbe = 99999999.0 cbd = 0 do curdisp=0,MAXDISP-1 call gendiffimg(ref,m1,m2,dim,curdisp) call generrorimg(dim, cbe, cbd, curdisp) enddo c call testdata(cbd) enddo alltime=timer_stop() print *, "Iterations=",ITERS," on (", $ ROWS,",",COLS,") Image - ", $ alltime," clocks (",alltime*0.00005/ITERS," ms)" stop end subroutine gendiffimg(ref, m1, m2, dim, curdisp) integer ref(ROWS+WINDOW,COLS) integer m1(ROWS+WINDOW,COLS) integer m2(ROWS+WINDOW,COLS) real dim(ROWS+WINDOW,COLS) integer curdisp template t2(ROWS+WINDOW) align ref(i,j), m1(i,j), m2(i,j), dim(i,j) with t2(i) distribute t2(block(NUMPROCS)) nocheck ref, m1, m2, dim integer i, j pdo i=1,ROWS+WINDOW pin ref(i,:),m1(i,:),m2(i,:) pout dim(i,:) do j=1,COLS-(3*curdisp) dim(i,j) = (ref(i,j) - m1(i,j+curdisp))**2 + $ (ref(i,j) - m2(i,j+2*curdisp))**2 enddo endpdo return end subroutine generrorimg(dim, cbe, cbd, curdisp) real dim(ROWS+WINDOW,COLS) real cbe(ROWS,COLS) integer cbd(ROWS,COLS), curdisp template t2(ROWS+WINDOW) align dim(i,j) with t2(i) distribute t2(block(NUMPROCS)) template t1(ROWS+WINDOW) align cbe(i,j), cbd(i,j) with t1(i) distribute t1(block(NUMPROCS)) nocheck dim, cbe, cbd, curdisp real csum(COLS) real sum integer i, j, k logical flag flag=.true. pdo i=1,ROWS pin dim(i:i+WINDOW,:) pinout cbe(i,:),cbd(i,:) pbody if (flag) then do j=1,COLS csum(j) = 0.0 do k=0,WINDOW-1 csum(j) = csum(j) + dim(i+k,j) enddo enddo flag=.false. endif sum = 0.0 do j=1,7 sum = sum + csum(j) enddo if (sum .lt. cbe(i,1)) then cbe(i,1) = sum cbd(i,1) = curdisp endif do j=2,7 sum = sum + csum(j+6) if (sum .lt. cbe(i,j)) then cbe(i,j) = sum cbd(i,j) = curdisp endif enddo do j=8,COLS-6 sum = sum - csum(j-7) sum = sum + csum(j+6) if (sum .lt. cbe(i,j)) then cbe(i,j) = sum cbd(i,j) = curdisp endif enddo do j=COLS-5,COLS sum = sum - csum(j-7) if (sum .lt. cbe(i,j)) then cbe(i,j) = sum cbd(i,j) = curdisp endif enddo do j=1,COLS csum(j) = csum(j) - dim(i,j) + $ dim(i+WINDOW,j) enddo endpdo return end c c This routine currently just builds a standard image that testdata c can check. c c original by LeeAnn Tzeng c parallelized and updated by Peter A. Dinda c c subroutine getdata(ref,m1,m2) integer ref(ROWS+WINDOW,COLS) integer m1(ROWS+WINDOW,COLS) integer m2(ROWS+WINDOW,COLS) template t2(ROWS+WINDOW) align ref(i,j), m1(i,j), m2(i,j) with t2(i) distribute t2(block(NUMPROCS)) integer i, j, ipos,jpos pdo i=1,6 pout ref(i,:),m1(i,:),m2(i,:) pbody do j=1,COLS ref(i,j)=0 m1(i,j)=0 m2(i,j)=0 enddo endpdo pdo i=1,ROWS+WINDOW pout ref(i,:),m1(i,:),m2(i,:) pbody do j=1,COLS ref(i,j) =(i + j) m1(i,j) = (i + j) m2(i,j) = (i + j) enddo endpdo pdo i=ROWS+7,ROWS+WINDOW pout ref(i,:),m1(i,:),m2(i,:) pbody do j=1,COLS ref(i,j)=0 m1(i,j)=0 m2(i,j)=0 enddo endpdo ipos=(ROWS-ISIZE)/2 + 6 jpos=((COLS-JSIZE)/2) - TARDIS pdo i=ipos,ipos+ISIZE-1 pout ref(i,:),m1(i,:),m2(i,:) pbody do j=jpos,jpos+JSIZE-1 ref(i,j) = ((i-ipos+1) + (j-jpos+1)) m1(i,j+TARDIS) = ((i-ipos+1) + (j-jpos+1)) m2(i,j+2*TARDIS) = ((i-ipos+1) + (j-jpos+1)) enddo endpdo return end c c Routine to test output (matched with input routine above) c c original by LeeAnn Tzeng c cleaned up by Peter A. Dinda c subroutine testdata(disp) integer disp(ROWS,COLS) template t1(ROWS+WINDOW) align disp(i,j) with t1(i) distribute t1(block(NUMPROCS)) integer ipos, jpos, i, j ipos=(ROWS-ISIZE)/2 jpos=((COLS-JSIZE)/2) - TARDIS do i=ipos+10,ipos+ISIZE-11 do j=jpos+10,jpos+JSIZE-11 if (disp(i,j).ne.TARDIS) then write (*,100) i,j,TARDIS,disp(i,j) end if end do end do do i=1,ipos-10 do j=1,jpos+JSIZE+2*TARDIS+9 if (disp(i,j).ne.0) then write (*,100) i,j,0,disp(i,j) end if end do end do do i=1,ipos+ISIZE+9 do j=jpos+JSIZE+2*TARDIS+9,COLS if (disp(i,j).ne.0) then write (*,100) i,j,0,disp(i,j) end if end do end do do i=ipos+ISIZE+9,ROWS do j=jpos-10,COLS if (disp(i,j).ne.0) then write (*,100) i,j,0,disp(i,j) end if end do end do do i=ipos-10,ROWS do j=1,jpos-10 if (disp(i,j).ne.0) then write (*,100) i,j,0,disp(i,j) end if end do end do 100 format("Error at position (",I3,",",I3,") - expected disp ",I, $ " but got ",I) return end