c c A data-parallel implementation of Okutomi/Kanade multiple baseline stereo c with data distributions and column-summing tricks borrowed from Jon c Webb's Adapt implementation. c c 2 independant data parallel stereos c #define NUMPROCS 32 #define ROWS 480 #define COLS 300 #define ISIZE 37 #define JSIZE 37 #define TARDIS 12 #define ITERS 2 #define MAXDISP 512 #define WINDOW 13 program dp2 external timer_stop integer timer_stop integer count integer alltime integer x integer y call timer_start() enter tasking do count=1,ITERS call stereo(x) output(x) processor (4,8) origin(0,0) call stereo(y) output(y) processor (4,8) origin(4,0) enddo exit tasking alltime=timer_stop() call fx_sync() print *, "Iterations=",ITERS," on (",ROWS,",",COLS,") Image - ", $ alltime," clocks (", alltime*0.00005/ITERS," ms)" end subroutine stereo(x) processor (4,8) integer x integer ref(ROWS+WINDOW,COLS) integer m1(ROWS+WINDOW,COLS) integer m2(ROWS+WINDOW,COLS) integer m3(ROWS+WINDOW,COLS) real dim(ROWS+WINDOW,COLS) template t1(ROWS+WINDOW) align ref(i,j), m1(i,j), m2(i,j), m3(i,j), dim(i,j) with t1(i) distribute t1(block(NUMPROCS)) real cbe(ROWS,COLS) integer cbd(ROWS,COLS) c c The alignment and distribution looks fishy, but works fast. It simply c aligns cbe and cbd such that minimal redistribution will be done when c they are used in generrimg()'s PDOs. In general, for the iWarp Fx c compiler, it is a good idea to make sure that the blocksizes for c distributions in main should be equal for arrays that the PDO will c redistribute with overlapping. For example, 256/64=4 rows per proc and c 256+(WINDOW=13)/64=5 rows per proc, for the obvious distributions of c (cbe, cbd), and (ref,m1,m2,dim). However, the PDO in generrimg will c redistribute dim overlapped, so making cbe and cbd have similar distribution c from the start is a big win. c c template t2(ROWS+WINDOW) align cbe(i,j), cbd(i,j) with t2(i) distribute t2(block(NUMPROCS)) integer i, j, k external timer_start external timer_stop integer timer_stop integer count, curdisp integer errortime(16), difftime(16), totaltime integer getdatatime integer alltime call getdata(ref,m1,m2,m3) cbe = 99999999.0 cbd = 0 do curdisp=0,MAXDISP-1 call gendiffimg(ref,m1,m2,m3,dim,curdisp) call generrorimg(dim, cbe, cbd, curdisp) enddo c call testdata(cbd) end subroutine gendiffimg(ref, m1, m2, m3, dim, curdisp) processor (4,8) integer ref(ROWS+WINDOW,COLS) integer m1(ROWS+WINDOW,COLS) integer m2(ROWS+WINDOW,COLS) integer m3(ROWS+WINDOW,COLS) real dim(ROWS+WINDOW,COLS) integer curdisp template t2(ROWS+WINDOW) align ref(i,j), m1(i,j), m2(i,j), m3(i,j), dim(i,j) with t2(i) distribute t2(block(NUMPROCS)) nocheck ref, m1, m2, m3, dim integer i, j pdo i=1,ROWS+WINDOW pin ref(i,:),m1(i,:),m2(i,:),m3(i,:) pout dim(i,:) do j=1,COLS-(3*curdisp) dim(i,j) = (ref(i,j) - m1(i,j+curdisp))**2 + $ (ref(i,j) - m2(i,j+2*curdisp))**2 + $ (ref(i,j) - m3(i,j+3*curdisp))**2 enddo endpdo return end subroutine generrorimg(dim, cbe, cbd, curdisp) processor(4,8) real dim(ROWS+WINDOW,COLS) real cbe(ROWS,COLS) integer cbd(ROWS,COLS), curdisp template t2(ROWS+WINDOW) align dim(i,j) with t2(i) distribute t2(block(NUMPROCS)) template t1(ROWS+WINDOW) align cbe(i,j), cbd(i,j) with t1(i) distribute t1(block(NUMPROCS)) nocheck dim, cbe, cbd, curdisp real csum(COLS) real sum integer i, j, k logical flag flag=.true. pdo (i=1:ROWS) pin dim(i:i+WINDOW,:) pinout cbe(i,:),cbd(i,:) pbody if (flag) then do j=1,COLS csum(j) = 0.0 do k=0,WINDOW-1 csum(j) = csum(j) + dim(i+k,j) enddo enddo flag=.false. endif sum = 0.0 do j=1,7 sum = sum + csum(j) enddo if (sum .lt. cbe(i,1)) then cbe(i,1) = sum cbd(i,1) = curdisp endif do j=2,7 sum = sum + csum(j+6) if (sum .lt. cbe(i,j)) then cbe(i,j) = sum cbd(i,j) = curdisp endif enddo do j=8,COLS-6 sum = sum - csum(j-7) sum = sum + csum(j+6) if (sum .lt. cbe(i,j)) then cbe(i,j) = sum cbd(i,j) = curdisp endif enddo do j=COLS-5,COLS sum = sum - csum(j-7) if (sum .lt. cbe(i,j)) then cbe(i,j) = sum cbd(i,j) = curdisp endif enddo do j=1,COLS csum(j) = csum(j) - dim(i,j) + $ dim(i+WINDOW,j) enddo endpdo return end c c This routine currently just builds a standard image that testdata c can check. c c original by LeeAnn Tzeng c parallelized and updated by Peter A. Dinda c c subroutine getdata(ref,m1,m2,m3) processor (4,8) integer ref(ROWS+WINDOW,COLS) integer m1(ROWS+WINDOW,COLS) integer m2(ROWS+WINDOW,COLS) integer m3(ROWS+WINDOW,COLS) template t2(ROWS+WINDOW) align ref(i,j), m1(i,j), m2(i,j), m3(i,j) with t2(i) distribute t2(block(NUMPROCS)) integer i, j, ipos,jpos nocheck ref, m1, m2, m3 pdo i=1,6 pout ref(i,:),m1(i,:),m2(i,:),m3(i,:) pbody do j=1,COLS ref(i,j)=0 m1(i,j)=0 m2(i,j)=0 m3(i,j)=0 enddo endpdo pdo i=1,ROWS+WINDOW pout ref(i,:),m1(i,:),m2(i,:),m3(i,:) pbody do j=1,COLS ref(i,j) =(i + j) m1(i,j) = (i + j) m2(i,j) = (i + j) m3(i,j) = (i + j) enddo endpdo pdo i=ROWS+7,ROWS+WINDOW pout ref(i,:),m1(i,:),m2(i,:),m3(i,:) pbody do j=1,COLS ref(i,j)=0 m1(i,j)=0 m2(i,j)=0 m3(i,j)=0 enddo endpdo ipos=(ROWS-ISIZE)/2 + 6 jpos=((COLS-JSIZE)/2) - TARDIS pdo i=ipos,ipos+ISIZE-1 pout ref(i,:),m1(i,:),m2(i,:),m3(i,:) pbody do j=jpos,jpos+JSIZE-1 ref(i,j) = ((i-ipos+1) + (j-jpos+1)) m1(i,j+TARDIS) = ((i-ipos+1) + (j-jpos+1)) m2(i,j+2*TARDIS) = ((i-ipos+1) + (j-jpos+1)) m3(i,j+3*TARDIS) = ((i-ipos+1) + (j-jpos+1)) enddo endpdo return end c c Routine to test output (matched with input routine above) c c original by LeeAnn Tzeng c cleaned up by Peter A. Dinda c subroutine testdata(disp) processor (4,8) integer disp(ROWS,COLS) template t1(ROWS+WINDOW) align disp(i,j) with t1(i) distribute t1(block(NUMPROCS)) integer ipos, jpos, i, j nocheck disp print *, "testing data" ipos=(ROWS-ISIZE)/2 jpos=((COLS-JSIZE)/2) - TARDIS do i=ipos+10,ipos+ISIZE-11 do j=jpos+10,jpos+JSIZE-11 if (disp(i,j).ne.TARDIS) then write (*,100) i,j,TARDIS,disp(i,j) end if end do end do do i=1,ipos-10 do j=1,jpos+JSIZE+2*TARDIS+9 if (disp(i,j).ne.0) then write (*,100) i,j,0,disp(i,j) end if end do end do do i=1,ipos+ISIZE+9 do j=jpos+JSIZE+2*TARDIS+9,COLS if (disp(i,j).ne.0) then write (*,100) i,j,0,disp(i,j) end if end do end do do i=ipos+ISIZE+9,ROWS do j=jpos-10,COLS if (disp(i,j).ne.0) then write (*,100) i,j,0,disp(i,j) end if end do end do do i=ipos-10,ROWS do j=1,jpos-10 if (disp(i,j).ne.0) then write (*,100) i,j,0,disp(i,j) end if end do end do 100 format("Error at position (",I3,",",I3,") - expected disp ",I, $ " but got ",I) return end