#define NUMPROCS 64
#define ROWS 256
#define COLS 100
#define MAXDISP 16
#define TASKBLOCKS 4
#define WINDOW 13
#define ITERS 500
#define ISIZE 37
#define JSIZE 37
#define TARDIS 12
c
c Task-parallel multibaseline stereo benchmark.
c
c Each iteration runs six tasks:
c   getdata    - builds the reference image and two match images
c   driver x4  - each scans a quarter of the disparity range
c                (starting at 0, 4, 8 and 12) and writes its own
c                plane of the best-error / best-disparity arrays
c   combandout - merges the four planes into plane 1 of cbd
c
c The task regions use 12 + 4*10 + 12 = 64 cells in total.
c NOTE(review): the origins look like an 8 x 8 cell grid - confirm.
c
      program taskparstereo
      integer ref(ROWS+WINDOW,COLS)
      integer m1(ROWS+WINDOW,COLS)
      integer m2(ROWS+WINDOW,COLS)
      real cbe(ROWS, COLS, TASKBLOCKS)
      integer cbd(ROWS, COLS, TASKBLOCKS)
c
c
c Template is weird in order to improve performance
c
c
      template t1(ROWS+WINDOW)
      align cbe(i,j,k), cbd(i,j,k) with t1(i)
      distribute t1(block(10))
      template t2(ROWS+WINDOW)
      align ref(i,j), m1(i,j), m2(i,j) with t2(i)
      distribute t2(block(10))
      integer iteration
      integer time
      external timer_start, timer_stop
      integer timer_stop
c
c This is a workaround...
c The starting disparities are passed to driver through variables
c rather than literal constants.
c NOTE(review): presumably literals are not accepted as task
c arguments - confirm.
c
      integer zero, four, eight, twelve
      zero=0
      four=4
      eight=8
      twelve=12
      call timer_start
      enter tasking
      do iteration=1,ITERS
c        Producer task: generate the three input images.
         call getdata(ref,m1,m2)
         output(ref,m1,m2)
         processor(3,4)
         origin(0,0)
c        Four search tasks, one per block of disparities; task n
c        writes plane n of cbe and cbd.
         call driver(ref, m1, m2, cbe, cbd, zero)
         input(ref,m1,m2)
         output(cbe(:,:,1), cbd(:,:,1))
         processor (5,2)
         origin (3, 0)
         call driver(ref, m1, m2, cbe, cbd, four)
         input(ref,m1,m2)
         output(cbe(:,:,2), cbd(:,:,2))
         processor (5,2)
         origin (3, 2)
         call driver(ref, m1, m2, cbe, cbd, eight)
         input(ref,m1,m2)
         output(cbe(:,:,3), cbd(:,:,3))
         processor (5,2)
         origin (3, 4)
         call driver(ref, m1, m2, cbe, cbd, twelve)
         input(ref,m1,m2)
         output(cbe(:,:,4), cbd(:,:,4))
         processor (5,2)
         origin (3, 6)
c        Consumer task: reduce the four planes into cbd(:,:,1).
         call combandout(cbe,cbd)
         input(cbe(:,:,:),cbd(:,:,:))
         output(cbd(:,:,1))
         processor(3, 4)
         origin(0, 4)
      enddo
      exit tasking
      time=timer_stop()
      print *, "Completed ",ITERS," iterations on (",ROWS,",",
     $     COLS,") image in time=", time," clocks (",
     $     time*0.00005/ITERS," ms)"
      end
c
c
c
c
c
c driver: search one block of MAXDISP/TASKBLOCKS consecutive
c disparities starting at curdisp.
c
c   ref, m1, m2 (in)  - reference image and the two match images
c   cbe, cbd (in/out) - best error / best disparity; only plane
c                       whichbest = curdisp/(MAXDISP/TASKBLOCKS)+1
c                       is written here
c   curdisp (in)      - first disparity of this block
c
      subroutine driver(ref, m1, m2, cbe, cbd, curdisp)
      processor (5,2)
      integer ref(ROWS+WINDOW,COLS)
      integer m1(ROWS+WINDOW,COLS)
      integer m2(ROWS+WINDOW,COLS)
      real cbe(ROWS, COLS, TASKBLOCKS)
      integer cbd(ROWS, COLS, TASKBLOCKS), curdisp
      real dim(ROWS+WINDOW, COLS)
      template t1(ROWS+WINDOW)
      align cbe(i,j,k), cbd(i,j,k) with t1(i)
      distribute t1(block(10))
      template t2(ROWS+WINDOW)
      align ref(i,j), m1(i,j), m2(i,j), dim(i,j) with t2(i)
      distribute t2(block(10))
      nocheck ref, m1, m2, cbe, cbd
      integer whichbest, localdisp
      whichbest=curdisp/(MAXDISP/TASKBLOCKS)+1
c     Start from a very large error so that the first disparity
c     examined always replaces it.
      cbe(:,:,whichbest) = 9999999.0
      cbd(:,:,whichbest) = curdisp
      do localdisp=curdisp, curdisp+(MAXDISP/TASKBLOCKS)-1
         call gendiffimg(ref,m1,m2,dim,localdisp)
         call generrimg(dim,cbe,cbd,localdisp,whichbest)
      enddo
      end
c
c combandout: take the TASKBLOCKS best-error / best-disparity
c planes and store, for every pixel, the disparity of the plane
c with the smallest error into cbd(:,:,1).
c
c Ties go to the lowest-numbered plane (strict less-than).
c When plane 1 already has the smallest error cbd(:,:,1) is
c left untouched.
c
      subroutine combandout(cbe,cbd)
      processor (3,4)
      real cbe(ROWS,COLS,TASKBLOCKS)
      integer cbd(ROWS,COLS,TASKBLOCKS)
      template t(ROWS+WINDOW)
      align cbe(i,j,k), cbd(i,j,k) with t(i)
      distribute t(block(12))
      integer i,j,k,local
      nocheck cbe, cbd
      pdo i=1,ROWS
      pin cbe(i,:,:)
      pinout cbd(i,:,:)
      pbody
         do j=1, COLS
c           local is the index of the best plane seen so far.
            local = 1
            do k=2, TASKBLOCKS
               if (cbe(i,j,k) .lt. cbe(i,j,local)) then
                  local = k
               endif
            enddo
            if (local .ne. 1) then
               cbd(i,j,1) = cbd(i,j,local)
            endif
         enddo
      endpdo
c      print *, "testing data"
c      call testdata(cbd)
      return
      end
c
c gendiffimg: build the difference image for one disparity.
c
c   dim(i,j) = (ref(i,j) - m1(i,j+curdisp))**2
c            + (ref(i,j) - m2(i,j+2*curdisp))**2
c
c for j = 1 .. COLS-2*curdisp.
c
c NOTE(review): the last 2*curdisp columns of dim are not written
c here, yet generrimg sums every column of dim.  Those columns
c keep whatever an earlier call left in them - confirm that this
c is intended.
c
      subroutine gendiffimg(ref, m1, m2, dim, curdisp)
      processor (5,2)
      integer ref(ROWS+WINDOW,COLS)
      integer m1(ROWS+WINDOW,COLS)
      integer m2(ROWS+WINDOW,COLS)
      real dim(ROWS+WINDOW,COLS)
      integer curdisp
      template t2(ROWS+WINDOW)
      align ref(i,j), m1(i,j), m2(i,j), dim(i,j) with t2(i)
      distribute t2(block(10))
      nocheck ref, m1, m2, dim
      integer i, j
      pdo i=1,ROWS+WINDOW
      pin ref(i,:),m1(i,:),m2(i,:)
      pout dim(i,:)
      pbody
         do j=1,COLS-(2*curdisp)
            dim(i,j) = (ref(i,j) - m1(i,j+curdisp))**2 +
     $           (ref(i,j) - m2(i,j+2*curdisp))**2
         enddo
      endpdo
      return
      end
c
c generrimg: sum the difference image over a sliding window and
c record curdisp wherever that sum beats the stored best error.
c
c   dim (in)          - difference image from gendiffimg
c   cbe, cbd (in/out) - plane "which" is updated in place
c   curdisp (in)      - disparity that produced dim
c   which (in)        - plane of cbe and cbd to update
c
c csum(j) holds the sum of column j over 13 consecutive rows
c starting at row i.  It is built once (guarded by flag) and then
c slid down one row at the end of each pass.
c NOTE(review): this relies on each cell visiting its rows in
c increasing order with its own copy of csum and flag - confirm.
c
      subroutine generrimg(dim, cbe, cbd, curdisp, which)
      processor (5,2)
      real dim(ROWS+WINDOW,COLS)
      real cbe(ROWS,COLS,TASKBLOCKS)
      integer cbd(ROWS,COLS,TASKBLOCKS), curdisp, which
      template t2(ROWS+WINDOW)
      align dim(i,j) with t2(i)
      distribute t2(block(10))
      template t1(ROWS+WINDOW)
      align cbe(i,j,k), cbd(i,j,k) with t1(i)
      distribute t1(block(10))
      nocheck dim, cbe, cbd, curdisp,which
      real csum(COLS)
      real sum
      integer i, j, k
      logical flag
      flag=.true.
      pdo (i=1:ROWS)
      pin dim(i:i+WINDOW,:)
      pinout cbe(i,:,:),cbd(i,:,:)
      pbody
c        First pass only: build the column sums from scratch.
         if (flag) then
            do j=1,COLS
               csum(j) = 0.0
               do k=0,WINDOW-1
                  csum(j) = csum(j) + dim(i+k,j)
               enddo
            enddo
            flag=.false.
         endif
c        Column 1: the window is clipped on the left, so only
c        columns 1..7 contribute.
         sum = 0.0
         do j=1,7
            sum = sum + csum(j)
         enddo
         if (sum .lt. cbe(i,1,which)) then
            cbe(i,1,which) = sum
            cbd(i,1,which) = curdisp
         endif
c        Columns 2..7: the window grows on the right only.
         do j=2,7
            sum = sum + csum(j+6)
            if (sum .lt. cbe(i,j,which)) then
               cbe(i,j,which) = sum
               cbd(i,j,which) = curdisp
            endif
         enddo
c        Interior columns: drop the column leaving on the left and
c        add the one entering on the right.
         do j=8,COLS-6
            sum = sum - csum(j-7)
            sum = sum + csum(j+6)
            if (sum .lt. cbe(i,j,which)) then
               cbe(i,j,which) = sum
               cbd(i,j,which) = curdisp
            endif
         enddo
c        Last six columns: the window shrinks on the left only.
         do j=COLS-5,COLS
            sum = sum - csum(j-7)
            if (sum .lt. cbe(i,j,which)) then
               cbe(i,j,which) = sum
               cbd(i,j,which) = curdisp
            endif
         enddo
c        Slide the column sums down one row for the next pass.
         do j=1,COLS
            csum(j) = csum(j) - dim(i,j) +
     $           dim(i+WINDOW,j)
         enddo
      endpdo
      return
      end
c
c This routine currently just builds a standard image that testdata
c can check.
c
c Layout of the generated images (row index i):
c   rows 1 .. 6              zero padding
c   rows 7 .. ROWS+6         background ramp, value i + j
c   rows ROWS+7 .. ROWS+13   zero padding
c An ISIZE x JSIZE object is then drawn into ref, and into m1 and
c m2 shifted right by TARDIS and 2*TARDIS columns respectively.
c
c original by LeeAnn Tzeng
c parallelized and updated by Peter A. Dinda
c
c
      subroutine getdata(ref,m1,m2)
      processor (3,4)
      integer ref(ROWS+WINDOW,COLS)
      integer m1(ROWS+WINDOW,COLS)
      integer m2(ROWS+WINDOW,COLS)
      template t2(ROWS+WINDOW)
      align ref(i,j), m1(i,j), m2(i,j) with t2(i)
      distribute t2(block(12))
      nocheck ref, m1, m2
      integer i, j, ipos,jpos
c     Top padding rows.
      pdo i=1,6
      pout ref(i,:),m1(i,:),m2(i,:)
      pbody
         do j=1,COLS
            ref(i,j)=0
            m1(i,j)=0
            m2(i,j)=0
         enddo
      endpdo
c     Background ramp.  Restricted to the image rows so that it
c     does not overwrite the zeroed top padding.
      pdo i=7,ROWS+6
      pout ref(i,:),m1(i,:),m2(i,:)
      pbody
         do j=1,COLS
            ref(i,j) =(i + j)
            m1(i,j) = (i + j)
            m2(i,j) = (i + j)
         enddo
      endpdo
c     Bottom padding rows.
      pdo i=ROWS+7,ROWS+WINDOW
      pout ref(i,:),m1(i,:),m2(i,:)
      pbody
         do j=1,COLS
            ref(i,j)=0
            m1(i,j)=0
            m2(i,j)=0
         enddo
      endpdo
c     Object position; the + 6 skips the top padding rows.
      ipos=(ROWS-ISIZE)/2 + 6
      jpos=((COLS-JSIZE)/2) - TARDIS
      pdo i=ipos,ipos+ISIZE-1
      pout ref(i,:),m1(i,:),m2(i,:)
      pbody
         do j=jpos,jpos+JSIZE-1
            ref(i,j) = ((i-ipos+1) + (j-jpos+1))
            m1(i,j+TARDIS) = ((i-ipos+1) + (j-jpos+1))
            m2(i,j+2*TARDIS) = ((i-ipos+1) + (j-jpos+1))
         enddo
      endpdo
      return
      end
c
c Routine to test output (matched with input routine above)
c
c Checks plane 1 of disp: TARDIS is expected well inside the
c object and 0 in the regions away from it; every mismatch is
c reported with format 100.
c
c original by LeeAnn Tzeng
c cleaned up by Peter A. Dinda
c
      subroutine testdata(disp)
      processor (3,4)
      integer disp(ROWS,COLS,TASKBLOCKS)
      template t1(ROWS+WINDOW)
      align disp(i,j,k) with t1(i)
      distribute t1(block(12))
      nocheck disp
      integer ipos, jpos, i, j
      ipos=(ROWS-ISIZE)/2
      jpos=((COLS-JSIZE)/2) - TARDIS
c     Interior of the object, 10 pixels in from each edge.
      do i=ipos+10,ipos+ISIZE-11
         do j=jpos+10,jpos+JSIZE-11
            if (disp(i,j,1).ne.TARDIS) then
               write (*,100) i,j,TARDIS,disp(i,j,1)
            end if
         end do
      end do
c     Region above the object.
      do i=1,ipos-10
         do j=1,jpos+JSIZE+2*TARDIS+9
            if (disp(i,j,1).ne.0) then
               write (*,100) i,j,0,disp(i,j,1)
            end if
         end do
      end do
c     Region to the right of the object.
      do i=1,ipos+ISIZE+9
         do j=jpos+JSIZE+2*TARDIS+9,COLS
            if (disp(i,j,1).ne.0) then
               write (*,100) i,j,0,disp(i,j,1)
            end if
         end do
      end do
c     Region below the object.
      do i=ipos+ISIZE+9,ROWS
         do j=jpos-10,COLS
            if (disp(i,j,1).ne.0) then
               write (*,100) i,j,0,disp(i,j,1)
            end if
         end do
      end do
c     Region to the left of the object.
      do i=ipos-10,ROWS
         do j=1,jpos-10
            if (disp(i,j,1).ne.0) then
               write (*,100) i,j,0,disp(i,j,1)
            end if
         end do
      end do
c NOTE(review): the two I descriptors without a width rely on a
c compiler extension - confirm that the target compiler accepts
c them.
 100  format("Error at position (",I3,",",I3,") - expected disp ",I,
     $     " but got ",I)
      return
      end
c
c dumparray: print every element of one plane of an integer array.
c
c NOTE(review): fxcellid is declared here as a plain integer and is
c never assigned in this routine - presumably it is supplied by the
c Fx runtime; confirm.
c
      subroutine dumparray(array, plane)
      integer array(ROWS,COLS,TASKBLOCKS)
      integer fxcellid, i, j, plane
      template t(ROWS)
      align array(i,j,k) with t(i)
      distribute t(block(20))
      if ((fxcellid .eq. 24) .or. (fxcellid .eq. 28)) then
         print *,"Dumping array..."
         do i=1,ROWS
            do j=1,COLS
               print *,"img(",i,",",j,",",plane,")=", array(i,j,plane)
            enddo
         enddo
      endif
      end
c
c dumpfarray: print one plane of a real array, restricted to the
c same interior window that testdata checks for disparity TARDIS.
c
      subroutine dumpfarray(array, plane)
      real array(ROWS,COLS,TASKBLOCKS)
      integer i, j, plane, ipos, jpos
      template t(ROWS)
      align array(i,j,k) with t(i)
      distribute t(block(20))
      print *,"Dumping array..."
      ipos=(ROWS-ISIZE)/2
      jpos=((COLS-JSIZE)/2) - TARDIS
      print *,"ipos=",ipos," jpos=",jpos
      do i=ipos+10,ipos+ISIZE-11
         do j=jpos+10,jpos+JSIZE-11
            print *,"array(",i,",",j,",",plane,")=", array(i,j,plane)
         end do
      end do
c      do i=1,ROWS
c         do j=1,COLS
c            print *,"img(",i,",",j,",",plane,")=", array(i,j,plane)
c         enddo
c      enddo
      end