\parskip 0in
\textheight 9in

\begin{slide}{}
\begin{center}

The NAS Parallel Benchmarks on Virtual Parallel Machines
	
S. White, A. \AA lund, V. Sunderam

Emory University

(Peter A. Dinda)

\end{center}
\end{slide}

\begin{slide}{}
\large Overview \normalsize
\parskip 0in
\begin{itemize}
\item Description of NAS ``Kernels''
\item Implementation of MG and FT
\item Platforms
\item Enhanced PVM
\item Communication Results
\end{itemize}
\end{slide}

\begin{slide}{}
\large NAS ``Kernels'' \normalsize \vspace{.5in}
\parskip 0in
\begin{itemize}
   \item EP: Embarrassingly Parallel \\
       \small $2^{28}$ Gaussian Random Deviates \normalsize
\parskip 0in
   \item MG: Multigrid Solver \\
       \small $\nabla^2 u = v$ on a $256^3$ grid \normalsize
\parskip 0in
   \item CG: Conjugate Gradient \\
       \small Smallest eigenvalue of sparse matrix \normalsize
\parskip 0in
   \item FT: Fourier Transform \\
       \small 3D FFTs on $256 \times 256 \times 128$ 
       array to solve PDE \normalsize
\parskip 0in
   \item IS: Integer Sort \\
       \small Rank $2^{23}$ integers \normalsize
\end{itemize}
\vspace{1in}
\begin{tabular}{|l|l|} 
\hline
Benchmark & Communication \\
\hline 
EP & None \\
MG & Structured Neighbor \\
CG & Unstructured Long Distance \\
FT & Structured Long Distance \\
IS & Connected Frequent Low Volume \\
\hline
\end{tabular}
\end{slide}


\begin{slide}{}
\large Implementations \normalsize \vspace{.5in}
\parskip 0in
\begin{itemize}
\item Implementations based on code for Intel iPSC/860
\parskip 0in
\item MG
   \begin{itemize} \small
   \item Block distribute one dimension with overlap
   \item Exchange borders
   \end{itemize} \normalsize
\vspace{2in}
\parskip 0in
\item FT 
   \begin{itemize} \small
   \item Block distribute one dimension
   \item Transpose
   \end{itemize} \normalsize
\vspace{2in}
\end{itemize}
\end{slide}



\begin{slide}{}
\large Environments \normalsize \vspace{.5in}

\begin{itemize}
\item Workstation Clusters 
\item Cray Y-MP/1
\item iPSC/860 with 32, 64, and 128 nodes
\end{itemize}
\small
\begin{tabular}{|l|l|r|l|r|} 
\hline
\# & WS & MFLOP/s & Net & MB/s \\
\hline
16 & SS1+ & 28.8 & E-net & 1.25 \\
8 & RS/6000 & 146.0 & Shared FDDI & 12.5\\
9 & SGI R4000 & 88.0 & Switched FDDI & 50.0 \\
\hline
1 & Y-MP/1 & 270.0 & - & - \\
\hline
32 & i860 & 313.6 & - & - \\
64 & i860 & 627.2 & - & - \\
128 & i860 & 1254.4 & - & - \\
\hline
\end{tabular}
\normalsize
\end{slide}


\begin{slide}{}
\large Enhanced PVM \normalsize \vspace{.5in}

\begin{itemize}
\item Implemented additional library called fsend/frecv
\item Directly use TCP
\item Eliminated PVM separate buffer initialization and packing \\
      \small authors argue packing costs 12\,\% of throughput \normalsize
\item Support heterogeneous environments, homogeneous message contents
\item Node-to-node performance shows near-maximum throughput
\item Over three times the bandwidth of PVM for messages larger than 1\,KB
\end{itemize}
\end{slide}




