\documentclass{llncs}

%\usepackage{times}
\usepackage{epsf}
%\usepackage{fullpage}

\newcommand{\comment}[1]{}

%\renewcommand{\dbltopfraction}{.9}
%\renewcommand{\topfraction}{.9}
%\renewcommand{\textfraction}{.1}
%\renewcommand{\floatpagefraction}{.9}
%\renewcommand{\dbltopnumber}{1}
%\renewcommand{\dblfloatpagefraction}{.9}

\def\pl{{\em playload}}
\def\Pl{{\em Playload}}


\def\colfigsize{\epsfxsize=2.5in}
\def\pagefigsize{\epsfxsize=6in}
\def\squashedpagedfigsize{\epsfxsize=6in \epsfysize=1.5in}


\begin{document}

\title{Realistic CPU Workloads\\
Through Host Load Trace Playback
\thanks{This research was sponsored by the Defense Advanced Research Projects
Agency (DARPA) and the Air Force Research Laboratory (AFRL), the
Office of Naval Research (ONR) and the Space and Naval Warfare Systems
Center (SPAWARSYSCEN), the Air Force Materiel Command (AFMC), the
National Science Foundation, and through an Intel Corporation
Fellowship.  } }
\titlerunning{Host Load Trace Playback}

\institute{
School of Computer Science, Carnegie Mellon University, 
and \\
Department of Computer Science, Northwestern University, 
\email{pdinda@cs.nwu.edu}
\and 
School of Computer Science and Department of Electrical
and 
\\Computer Engineering, Carnegie Mellon University,
\email{droh@cs.cmu.edu}
}

\author{Peter A. Dinda\inst{1} \and David R. O'Hallaron\inst{2}}
\authorrunning{Dinda, O'Hallaron}
\tocauthor{P. A. Dinda, D. R. O'Hallaron}


\maketitle

\begin{abstract}
This paper introduces {\em host load trace playback}, a new technique
for generating a background workload from a trace of the Unix load
average that results in realistic and repeatable CPU contention
behavior.  Such workloads are invaluable for evaluating various forms
of distributed middleware, including resource prediction systems and
application-level schedulers.  We describe the technique and then
evaluate a tool, \pl, that implements it. \Pl\ faithfully reproduces
workloads from traces on the four platforms on which we have evaluated
it.  Both \pl\ and a large set of host load traces are publicly
available from the web at the following URL:
http://www.cs.cmu.edu/$\sim$pdinda/LoadTraces
\end{abstract}

\section{Introduction}

Being able to generate a realistic and repeatable background CPU
workload is a necessary prerequisite to evaluating systems such as
application-level schedulers~\cite{APPLES-HPDC96}, dynamic load
balancers~\cite{DOME-TECHREPORT,SIEGELL-DYNAMIC-LB-94}, distributed
soft real-time systems~\cite{DINDA-CASE-BERT-WPDRTS-99}, and resource
prediction
systems~\cite{DINDA-LOAD-PRED-HPDC-99,WOLSKI-PRED-CPU-AVAIL-HPDC-99}.
Systems such as these react to the CPU contention produced by other
running programs in an effort to improve the performance of the
applications they serve.  To evaluate them, a mechanism to produce
realistic and repeatable CPU contention is necessary.

This paper introduces {\em host load trace playback}, a new technique
for generating a background workload that results in realistic and
repeatable contention for the CPU.  The workload is generated
according to a trace of the Unix load average, which measures the
time-varying CPU contention that was caused by a real workload running
on a real machine.  The objective of the playback process is to
reproduce this contention as accurately as possible given the playback
host and the machine on which the trace was recorded.  The playback
process can be configured to operate in a time-based manner, where the
contention only persists for the period of time reflected in the
trace, or in a work-based manner, where any additional contention due
to the foreground workload elongates the contention period of the
trace.  

We have developed a tool, \pl, that implements our technique and
collected a large and interesting family of traces with which to drive
it.  Both are publicly available on the web at
http://www.cs.cmu.edu/$\sim$pdinda/LoadTraces/.  In this paper, we
describe the technique, and demonstrate how the tool performs on four
different operating systems.


\section{Why host load trace playback?}

\begin{figure}[t]
\centerline{
\colfigsize
\epsfbox{load2time.eps}
}
\caption{Relationship between load and running time.}
\label{fig:load2time}
\end{figure}


Consider the problem of predicting the running time of a compute-bound
real-time task on each host in a shared, unreserved distributed
computing environment.  The goal is to schedule the task on the host
where it is most likely to meet its
deadline~\cite{DINDA-CASE-BERT-WPDRTS-99,DINDA-THESIS}. The running
time on a particular host is almost entirely determined by the CPU
contention the task will encounter, which is well measured by host
load, the Unix load average.  Specifically, we focus on the Digital
Unix five second load average, sampled at 1 Hz.  This resource
signal~\cite{DINDA-THESIS} is interesting because the running time of
a compute-bound task is directly related to the average load that it
encounters during execution.  For example, Figure~\ref{fig:load2time}
plots the running time and average load encountered by 42,000
randomized tasks on a Digital Unix machine.  The coefficient of
correlation between these quantities is nearly 1.0.

An example of a trace of host load can be seen in
Figure~\ref{fig:dux}(a).  We have developed a system that predicts the
future behavior of this resource signal and then transforms the
prediction into a running time
estimate~\cite{DINDA-LOAD-PRED-HPDC-99,DINDA-THESIS}.  To evaluate our
system, we must be able to produce realistic contention behavior on a
host.  The unique requirements of workload generation to evaluate
workload prediction systems motivated the technique and software
described in this paper.

A common approach to generating workloads is to create a statistical
model based on the measurement of real systems.  If the resulting
model captures the relevant statistical properties of the workload,
then it can be used to generate new workloads that also exhibit these
properties.  The problem lies in determining which properties are
relevant and whether all of the possibly relevant properties have been
discovered.  Both the networking and load balancing communities have
histories of choosing the wrong properties, resulting in unfortunate
system designs that have only recently been
corrected~\cite{SELF-SIM-NETWORK-TRAFFIC-TAQQU-JOURNAL-95,EXPLOIT-PROC-LIFETIME-DIST-DYN-LB-SIGMETRICS}.

When we studied the statistical properties of host load signals, we
found complex properties such as a strong autocorrelation structure,
self-similarity and epochal
behavior~\cite{DINDA-STAT-PROP-HOST-LOAD-SCIPROG-99}.  Predictive
models use these properties to create forecasts, and thus it is vital
to get them right in the model that the workload generator uses.
However, fixing the model in this way effectively pre-ordains the
performance of different predictors on the generated workload.
Furthermore, there may be other, undiscovered properties that are
relevant to prediction.

To avoid this conundrum, we decided to directly use traces of the host
load signal to create workloads.  The traces undeniably show real
contention on real machines.  Using the technique described in this
paper, we can reconstruct this contention by using a family of
processes with carefully modulated busy loops.  The resulting host
load can be measured and compared with the original load trace to see
how accurate the playback process is.  

It is important to point out that this process has limits.  Only CPU
contention is reproduced.  The busy loops obviously do not reproduce
any contention for the memory system, disk, or the network that may
have existed during the time the trace was recorded.  Furthermore, we
assume that all the processes that significantly contributed to the
contention recorded in the trace ran at the same priority level.  Low
priority processes increase the load average out of proportion to
their actual contention effects on higher priority processes.

Trace-based workload generation is not a new idea, but this is the
first work of which we are aware that uses the Unix load average, a
measure of contention, to recreate CPU contention through purely
application-level means.  Previous work, such as
DWC~\cite{MEHRA-WORKLOAD-GEN-LB-IEEE-PADT-95} and that of Kyung and
Hollingsworth~\cite{LINGER-LONGER-KYUNG-HOLLINGSWORTH-SC98}, creates
contention based on utilization measures.  DWC uses in-kernel
mechanisms to replay extremely fine grain traces of CPU utilization.
The target utilizations in the traces are ``doctored'' using process
counts in order to model how this workload would contend with
additional processes.  Kyung and Hollingsworth's application-level
workload generator uses coarse grain traces of CPU utilization to
parameterize a stochastic model that decides the length of fine grain
CPU bursts.  No provision appears to be made to adjust the target
utilization to reflect the actual contention that new processes would
see.  Our work is also in the spirit of trace modulation, which is
used to create realistic traffic in mobile
networks~\cite{NOBLE-TRACE-MODULATION-SIGCOMM97}. 


\section{Load trace playback}

To describe host load trace playback, we shall focus on a periodically
sampled trace, $\langle z'_i \rangle$, captured with a period of
$\Delta/2$ seconds.  The technique and our implementation can also
operate with non-periodically sampled traces.

\subsection{The playback algorithm}

\begin{figure}[t!]
\centerline{
\colfigsize
\epsfbox{sampling.eps}
}
\caption{Sampling process.}
\label{fig:sampling}
\end{figure}

The Unix load average is an exponential average of the number of
processes in the operating system's ready-to-run queue.  Conceptually,
the length of the ready queue is periodically sampled, and these
samples, $\langle x_i \rangle$, flow through an exponential filter
\begin{equation}
  z_{i} = (e^{-\Delta/\tau_{record}})z_{i-1} +
(1-e^{-\Delta/\tau_{record}})x_i
\label{eqn:kern}
\end{equation}
where $\Delta$ is the sampling interval and the application-visible
load average is the $\langle z_i \rangle$ series.  A load trace,
$\langle z'_i \rangle$, is gathered at the application level by
periodically sampling the load average that the kernel makes
available and recording the time-stamped samples to a file.  The
sample rate should be at least twice as high as the sample rate of the
in-kernel process, so we rewrite the above equation as
\begin{equation}
   z'_{i} = (e^{-\Delta/2\tau_{record}})z'_{i-1} +
(1-e^{-\Delta/2\tau_{record}})x'_i .
\label{eqn:conv}
\end{equation}
For the Digital Unix traces we use in this paper, $\Delta/2=1$ second,
i.e., the traces are sampled at a rate of 1~Hz.
The sampling process used to generate a load trace is illustrated in
Figure~\ref{fig:sampling}.


The goal of host load trace playback is to force the measured load
average to track the load average samples in the trace file.  To do
this, we treat the $x'_i$ in the above equation as the {\em expected}
run-queue length during the last $\Delta/2$ seconds.  To determine the
$x'_i$, we can simply rewrite Equation~\ref{eqn:conv} as
\begin{equation}
   x'_i = \frac{z'_{i} - (e^{-\Delta/2\tau_{record}})z'_{i-1}}
               {1-e^{-\Delta/2\tau_{record}}} .
\label{eqn:deconv}
\end{equation}
This operation was first described by Arndt et~al.~\cite{WINNER-WS-98}
as a way of achieving a more dynamic load signal ($\langle x'_i
\rangle$) on typical Unix systems.  All that is necessary to perform
this step is knowledge of the smoothing constant, $\tau_{record}$, for
the machine on which the trace was taken.  For Digital Unix,
$\tau_{record}$ is 5 seconds.  On most other Unix systems,
$\tau_{record}$ is 60 seconds.  A larger $\tau_{record}$ value
indicates that the load average will behave more smoothly.

We treat the value $x'_i$ as the expected amount of contention the CPU
saw for the time interval---the expected run queue length during
the interval.  To reproduce this contention, we split the interval
into smaller subintervals, each of which is larger than the scheduler
quantum, and then stochastically assign subprocesses to either work or
sleep during the subintervals.  For example, suppose $x'_i=1.5$.  We
would assign subprocess 0 to work during a subinterval with
probability 1, and subprocess 1 to work during a subinterval with
probability 0.5.

\begin{figure}[t!]
\centerline{
\begin{tabular}{cc}
\colfigsize
\epsfbox{playback.eps}
&
\colfigsize
\epsfbox{loadgen.eps}
\\
(a) Playback & (b) Load generation \\
\end{tabular}
}
\caption{Playback process.}
\label{fig:playback}
\end{figure}

After each individual $x'_i$ value has been played, the load average
on the system is measured.  This measurement, $m_{i}$, can then be
compared with the target $z'_{i}$ value from the load trace if the
smoothing constant of the playback machine, $\tau_{playback}$, is the
same as that of the trace machine.  If
$\tau_{playback}\neq\tau_{record}$ then it is necessary to compute an
appropriate $z'_{i}$ from $x'_i$ using Equation~\ref{eqn:conv} and
$\tau_{playback}$.  The resulting $\langle z'_{i} \rangle$ is the
trace that would have resulted given the smoothing of the playback
machine.  Figure~\ref{fig:playback} illustrates the playback process
we have just described for a machine where
$\tau_{playback}=\tau_{record}$.


\subsection{Work-based versus time-based playback}

Most likely, there will be other processes on the system---the tasks
we will run during the evaluation, for example---and so it is
important to understand how the contention that the load playback tool
generates reacts to the contention produced by these processes.  Two
modes of operation are possible, time-based and work-based.

\begin{figure}
\centerline{
\begin{tabular}{cc}
\colfigsize
\epsfbox{timebased.eps} &
\colfigsize
\epsfbox{workbased.eps} \\
(a) Time-based & (b) Work-based 
\end{tabular}
}
\caption{Example of time-based and work-based playback.}
\label{fig:modes}
\end{figure}

In the time-based mode of operation, the contention probabilities
implied by the $x'_i$ value last only until the end of $x'_i$'s
interval in real time.  Essentially, this means that the other load on
the machine can only amplitude modulate the played back load.  For
example, suppose there is a uniform 1.0 load on the machine and the
load trace dictates a 1.0 load from 0 to 1 second and zero elsewhere.
Then, the measured load will (ideally) be 2.0 from 0 to 1 second and
1.0 elsewhere.  Figure~\ref{fig:modes}(a) shows an example where a
real load trace (target load) is being played back using the
time-based mode in competition with an external continuous 1.0 load,
resulting in an amplitude modulated measured load.

In the work-based mode of operation, the load is interpreted as work
that must always be done, but which may be slowed down by other load.
This means that the other load on the system can also frequency
modulate the played load.  Using the same example as the previous
paragraph, the measured load would be 2.0 from 0 to 2 seconds and 1.0
elsewhere.  Figure~\ref{fig:modes}(b) shows an example where a real
load trace (target load) is being played back using the work-based
mode in competition with an external continuous 1.0 load, resulting in
an amplitude and frequency modulated measured load.


\subsection{Sources of playback error}
\label{sec:execpred.imperfect}

The measured load during playback tracks the desired load only
approximately in some cases.  The most obvious reason is that there
are other processes on the playback machine.  The additional
contention they introduce can drastically affect the measured load.
However, even on a reasonably idle machine, errors do occur.  This is
mainly because the OS's scheduler functions both as the sampling
process and as part of the sampled process.

The sampling process that produces the $x_i$s in
Equation~\ref{eqn:kern} is the OS's scheduler, which sees only integer
numbers of processes on the ready queue.  We treat our sample of
$x_i$, $x'_i$, as the expected value of $x_i$, $E\lbrace x_i \rbrace$.
In playing back $x'_i$ we contrive to make the value that the
scheduler samples this expected value.  However, the second moment,
$E\lbrace x^2_i \rbrace$, is nonzero, so the actual value that is
sampled may be different even if the playback applies the correct work
at the correct times.

Consider the following contrived example.  Suppose $x'_i=0.5$, we have
a subprocess spend 50\% of its time sleeping and 50\% working, and we
alternate randomly between these extremes.  The expected run queue
length the scheduler would see is then $E\lbrace x_i\rbrace = (0.5)(1)
+ (0.5)(0) = 0.5$.  However, the scheduler will really sample either 1
or 0.  The effect on the load average in either case will be
drastically different than the expected value, resulting in error.
Furthermore, because the load is an exponential average, that error
will persist over some amount of time (it will decline to 33\% after
$\tau_{playback}$ seconds).  Another way of thinking about this is to
consider the second moment, $E\lbrace x_i^2\rbrace$.  For this
example, $E\lbrace x_i^2\rbrace = (0.5)(1)^2 + (0.5)(0)^2 = 0.5$, so
the standard deviation of the distribution of $E\lbrace x_i\rbrace$ is
$\sqrt{E\lbrace x_i^2 \rbrace-E\lbrace x_i \rbrace^2} =
\sqrt{0.5-0.25} = 0.5$ which explains the variability in what the
scheduler will actually sample.

Even if the sample rate of the trace is low compared to the sample
rate of the scheduler, resulting in the $x'_i$'s corresponding
measurement being the aggregate of several observations by the
scheduler, any extant error is still propagated by the exponential
filter.

Another source of error is that an actual process on the ready queue
may not consume its full quantum.  



\subsection{Negative feedback}

It is reasonable to view host load playback as a control system in
which the goal is to force the output host load measurements to
accurately track the input measurements in the load trace.  It is
natural to consider using negative feedback to achieve more accurate
operation.  Our implementation of host load playback, which will be
described shortly, supports a negative feedback mechanism.  Using this
mechanism, it can sometimes more accurately track the load average in
a trace.

\begin{figure}[t!]
\centerline{
\begin{tabular}{cc}
\colfigsize
\epsfbox{feedbackbad.eps} &
\colfigsize
\epsfbox{feedbackgood.eps} \\
(a) Implemented feedback mechanism & (b) Ideal feedback mechanism
\end{tabular}
}
\caption{Negative feedback mechanisms}
\label{fig:feedback}
\end{figure}


There are, however, two problems with relying on this mechanism.
First, the frequency of the control loop is the measurement frequency,
which is considerably lower than the frequency at which work is
applied.  This makes feedback unstable.  It would appear to be
possible to up-sample the trace and implement measurement at a finer
granularity to avoid this problem.  However, the second problem with
feedback is somewhat more daunting.  The measurements are of the {\em
total} load on the system, but the goal of feedback is to control
merely the load being applied by the playback tool, not the external
load produced by foreground processes.  To determine the real playback
errors, it would be necessary to separate the measured load into the
contributions of the host load playback tool and of other processes
running on the host.  In general, this is difficult because so much
information has been lost by the time measurements are taken.

Figure~\ref{fig:feedback} shows both our implemented feedback
mechanism (a) and what an ideal feedback mechanism would look like
(b).  Implementing the box denoted ``Signal Separation'' in
Figure~\ref{fig:feedback}(b) is necessary both for making feedback
useful when there is external load and for accurately determining the
effect of the predicted load on a new process when the host is heavily
utilized.  Because of its broad utility, we are currently exploring
ways to achieve this functionality.

Because of these difficulties, we do not use negative feedback in the
evaluation that follows.

\subsection{Real traces versus synthetic traces}

In almost all cases, when a real load trace that has been sampled at
a sufficiently high rate is played, the $x'_i$s are close to integers.
Intuitively, this makes sense---the scheduler can only observe
integral numbers of processes on the ready queue, and if our estimates
of its observations (the $x'_i$s) are accurate, they should also mostly
be integers.  

It's easy to construct synthetic traces that are actually not sensible
in terms of short term behavior.  One such trace is the continuous 0.5
example discussed in the previous section.  Deconvolving that trace
produces $x'_i$s slightly above 0.5.  Load playback reasonably
produces a situation where the expected ready queue length is 0.5 by
having a process spend half of its time working and half sleeping.
However, the observations that the kernel makes (the $x_i$s) will be
either 0 or 1.  Thus the measured load trace will vary widely.  The
average load average will match the 0.5 we desire, but the short
term behavior will not.  It's important to note that the load playback
tool is doing what the user probably wants (keeping the CPU 50\% busy,
which can be verified with vmstat).  It is simply the case that the
load average fails to be a good measure for how well the generated
load conforms.

Another way a synthetic trace can fail is to have very abrupt changes.
For example, a square wave will be reproduced with lots of overshoot
and error.  In order for the abrupt swings of a square wave to have
been the output of the exponential smoothing, the input $x_i$ must have
been much more abrupt, and the estimate $x'_i$ will also be quite
large.  This means load playback has to have many processes try to contend
for the CPU at once, which raises the variability considerably.

The best way to produce synthetic traces is to create a string of
integer-valued $x_i$s and smooth them with the appropriate
$\tau_{record}$.  Another possibility is to present these $x_i$s
directly to load playback with a very small $\tau_{record}$ trace
value.

\subsection{Implementation}

We implemented the host load trace playback technique in a tool called
\pl.  \Pl\ consists of approximately 2600 lines of ANSI C and uses
only standard Unix calls.  We have successfully ported it to a number
of platforms, including Digital Unix, Solaris, FreeBSD, and Linux.

Because it is vital that $\tau_{record}$ be known, we provide a tool
which estimates this value using the decay time after a CPU burst on
the otherwise unloaded machine.  Another implementation issue is how
many subintervals should be generated for each sample in the load
trace.  More subintervals tend to smooth out kernel sampling
fluctuations, but, on the other hand, fewer subintervals reduce the
overhead of the system.  We have resolved this tension experimentally,
finding approximately 120 subintervals per second to be appropriate
for Digital Unix, and about 30 subintervals per second to be
appropriate for the other platforms.



\section{Evaluation}

In this section, we evaluate the performance of \pl\ on Digital Unix,
Solaris, FreeBSD, and Linux.  The evaluation is based on playing
back a one-hour trace (3600 samples) recorded on a Digital Unix
workstation in the Pittsburgh Supercomputing Center's Alpha cluster on
August 23, 1997.  In each case, the playback is work-based, no
feedback is used, and the machine is otherwise quiescent.

\begin{figure}[t!]
\centerline{
\begin{tabular}{cc}
\colfigsize
\epsfbox{manch-7.good.compare.3600.out.targettrace.eps}
&
\colfigsize
\epsfbox{manch-7.good.compare.3600.out.errhist.eps}
\\
(a) Target trace  & (c) Error histogram
\\
\colfigsize
\epsfbox{manch-7.good.compare.3600.out.measuredtrace.eps}
&
\colfigsize
\epsfbox{manch-7.good.compare.3600.out.erracf.eps}
\\
(b) Measured trace  &  (d) Error autocorrelation
\\
\end{tabular}
}
\caption{Trace playback on a Digital Unix Machine.}
\label{fig:dux}
\end{figure}

Figure~\ref{fig:dux} illustrates the performance of \pl\ in playing
back the trace on another Digital Unix machine.  The format of the
figures for the other machines will be the same.
Figure~\ref{fig:dux}(a) shows the target signal on the machine,
$\langle z'_i \rangle$.  It is identical to the signal recorded in the
load trace because this machine has a smoothing constant that is
identical to that of the trace machine.  Figure~\ref{fig:dux}(b) shows
the measured result, $\langle m_i \rangle$.  As we can see, the
measured signal closely tracks the target signal, with the exception
of occasional outliers.  Most of these are associated with target
loads that are non-integral, resulting in the kind of sampling errors
described above.  Figure~\ref{fig:dux}(c) shows the distribution of
all of the playback errors, $\langle m_i - z'_i \rangle$.  The
distribution is approximately normal.  As we can see, most errors are
actually quite small.  Furthermore, the errors are essentially
uncorrelated over time, as can be seen by their autocorrelation
function, plotted in Figure~\ref{fig:dux}(d).

\begin{figure}[t!]
\centerline{
\begin{tabular}{l|c|c|c|c|c|c}
 & \multicolumn{2}{c|}{Target} & \multicolumn{2}{c|}{Measured} & \multicolumn{2}{c}{Error} \\
Environment & Mean & StdDev & Mean & StdDev & Mean & StdDev \\
\hline
Alpha/DUX 4.0 & 1.065 & 0.465 & 1.047 & 0.442 & -0.018 & 0.127  \\
Sparc/Solaris 2.5 & 1.047 & 0.376 & 1.122 & 0.356 & 0.076 & 0.061 \\
PII/FreeBSD 2.2 & 1.047 & 0.376 & 1.123 & 0.361 & 0.076 & 0.124  \\
PII/Linux 5.2 & 1.047 & 0.376 & 1.131 & 0.360 & 0.084 & 0.164 \\
\end{tabular}
}
\caption{Cumulative errors (3600 samples).}
\label{fig:cumerr}
\end{figure}

Figure~\ref{fig:cumerr} presents summary statistics for the target,
measured, and error signals on all the machines we tested.  For
playback under Digital Unix, note that the mean and standard deviation
of the target and measured signals are nearly identical, which shows
that \pl\ is reconstructing all of the trace's work and dynamics.  The
mean error is slightly less than zero, which indicates that
\pl\ is overall producing slightly less work than needed.  The error's
high standard deviation reflects the outliers.

\begin{figure}[t!]
\centerline{
\begin{tabular}{cc}
\colfigsize
\epsfbox{federation.good.compare.3600.out.targettrace.eps}
&
\colfigsize
\epsfbox{federation.good.compare.3600.out.errhist.eps}
\\
(a) Target trace  & (c) Error histogram
\\
\colfigsize
\epsfbox{federation.good.compare.3600.out.measuredtrace.eps}
&
\colfigsize
\epsfbox{federation.good.compare.3600.out.erracf.eps}
\\
(b) Measured trace  &  (d) Error autocorrelation
\\
\end{tabular}
}
\caption{Trace playback on a Solaris Machine.}
\label{fig:solaris}
\end{figure}

Using the same format as before, Figure~\ref{fig:solaris} shows how
\pl\ performs on a Solaris machine.  The target signal for Solaris is
considerably smoother than that for Digital Unix because Solaris uses
a 60 second smoothing constant, as do the remainder of the machines in
this evaluation.  We can see that playback works beautifully on this
machine, producing a nearly normal distribution of errors with few
outliers.  The errors are slightly more correlated over time than on
Digital Unix, however.  Also, as we can see from
Figure~\ref{fig:cumerr}, slightly more work than desired is being
generated.


\begin{figure}[t!]
\centerline{
\begin{tabular}{cc}
\colfigsize
\epsfbox{greenfield.good.compare.3600.out.targettrace.eps}
&
\colfigsize
\epsfbox{greenfield.good.compare.3600.out.errhist.eps}
\\
(a) Target trace  & (c) Error histogram
\\
\colfigsize
\epsfbox{greenfield.good.compare.3600.out.measuredtrace.eps}
&
\colfigsize
\epsfbox{greenfield.good.compare.3600.out.erracf.eps}
\\
(b) Measured trace  &  (d) Error autocorrelation
\\
\end{tabular}
}
\caption{Trace playback on a FreeBSD Machine.}
\label{fig:freebsd}
\end{figure}

Figure~\ref{fig:freebsd} shows playback on a FreeBSD machine.  This
machine exhibits considerably more error than the Solaris machine.
However, the error distribution is nearly normal and a bit less
correlated over time.  In terms of the cumulative error, we can see
from Figure~\ref{fig:cumerr} that playback produces slightly more work
than desired.  The error is biased slightly positive, showing that,
overall, about 7--8\% more contention is being observed than desired.
This is similar to the Solaris and Linux machines.  The Solaris
machine shows the lowest variability in error, while the FreeBSD
machine has about the same error variability as the Digital Unix
machine.

\begin{figure}[t!]
\centerline{
\begin{tabular}{cc}
\colfigsize
\epsfbox{infocom.good.compare.3600.out.targettrace.eps}
&
\colfigsize
\epsfbox{infocom.good.compare.3600.out.errhist.eps}
\\
(a) Target trace  & (c) Error histogram
\\
\colfigsize
\epsfbox{infocom.good.compare.3600.out.measuredtrace.eps}
&
\colfigsize
\epsfbox{infocom.good.compare.3600.out.erracf.eps}
\\
(b) Measured trace  &  (d) Error autocorrelation
\\
\end{tabular}
}
\caption{Trace playback on a Linux Machine.}
\label{fig:linux}
\end{figure}

Figure~\ref{fig:linux} shows how \pl\ performs on a Linux machine.
\Pl\ performs more poorly on Linux than on any of the other machines
we have tested.  However, the measured load signal does track the
general outline of the target load signal.  The error distribution is
not even approximately normal, and is obviously biased toward positive
errors.  This leads to the cumulative error being about 8\% higher
than desired, and to a much greater variability in that error, as can
be seen in Figure~\ref{fig:cumerr}.  However, it is important to point
out that, as with all the other machines, \pl\ on Linux produces a
measured load signal which exhibits the desired variability.  At this
point, we do not understand exactly why \pl\ behaves so differently on
Linux.


\section{Conclusions and future work}

We have introduced {\em host load trace playback}, a new technique for
generating a background workload from a trace of the Unix load average
that results in realistic and repeatable contention for the CPU.  We
described the technique and then evaluated the performance of an
implementation on several different platforms.  The implementation,
\pl, works quite well in reproducing the overall behavior of the trace in
terms of low, roughly normally distributed error that has little
correlation over time. \Pl\ works better on Digital Unix, Solaris, and
FreeBSD machines than on Linux machines, but at this point we do not
know why.  We have used \pl\ extensively in our evaluation of a
real-time scheduling advisor based on the on-line prediction of task
running time~\cite{DINDA-THESIS}.

There are several ways in which we are planning to extend the work
described in this paper.  First, we hope to understand and improve the
inferior performance we see on Linux.  Second, we are exploring how to
separate out the effects of load applied by \pl\ and external load
applied by other programs.  If this can be done, it would let us use
negative feedback to make playback even more accurate.  Furthermore,
this signal separation step would be extremely helpful in the context
of prediction-based systems, where it is often necessary to decompose
predictions into contributions by individual applications.  The third
way in which we intend to build on this work is to explore the finer
grain measurement facilities that are available on some operating
systems, perhaps developing some such facilities ourselves.  Using these
mechanisms would allow us to provide workloads that are more
appropriate for evaluating systems that use very short-lived tasks.
Finally, we plan to look at developing and integrating trace playback
tools for disk, memory, network, and other workloads.

\pagebreak

\Pl\ and a large collection of host load traces that we described in
an earlier paper~\cite{DINDA-STAT-PROP-HOST-LOAD-SCIPROG-99} are
available from the web at the following URL: \\
\centerline{\em http://www.cs.cmu.edu/$\sim$pdinda/LoadTraces}\\
We hope that the combination of \pl\ and these traces can serve as the
basis for standardized benchmarks to evaluate the tools of the
distributed computing community.

%\small
%\bibliographystyle{acm}
%\bibliography{texbib}
%\normalsize

\begin{thebibliography}{10}

\bibitem{DOME-TECHREPORT}
{\sc Arabe, J., Beguelin, A., Lowekamp, B., Seligman, E., Starkey, M., and
  Stephan, P.}
\newblock Dome: Parallel programming in a heterogeneous multi-user environment.
\newblock Tech. Rep. CMU-CS-95-137, Carnegie Mellon University, School of
  Computer Science, April 1995.

\bibitem{WINNER-WS-98}
{\sc Arndt, O., Freisleben, B., Kielmann, T., and Thilo, F.}
\newblock Dynamic load distribution with the {WINNER} system.
\newblock In {\em Proceedings of Workshop Anwendungsbezogene Lastverteilung
  (ALV'98)\/} (March 1998), pp.~77--88.
\newblock Also available as Technische Universität München Technical Report
  TUM-I9806.

\bibitem{APPLES-HPDC96}
{\sc Berman, F., and Wolski, R.}
\newblock Scheduling from the perspective of the application.
\newblock In {\em Proceedings of the Fifth {IEEE} Symposium on High Performance
  Distributed Computing {HPDC96}\/} (August 1996), pp.~100--111.

\bibitem{DINDA-CASE-BERT-WPDRTS-99}
{\sc Dinda, P., Lowekamp, B., Kallivokas, L., and O'Hallaron, D.}
\newblock The case for prediction-based best-effort real-time systems.
\newblock In {\em Proc. of the 7th International Workshop on Parallel and
  Distributed Real-Time Systems (WPDRTS 1999)}, vol.~1586 of {\em Lecture Notes
  in Computer Science}. Springer-Verlag, San Juan, PR, 1999, pp.~309--318.
\newblock Extended version as {CMU} Technical Report {CMU-CS-TR-98-174}.

\bibitem{DINDA-STAT-PROP-HOST-LOAD-SCIPROG-99}
{\sc Dinda, P.~A.}
\newblock The statistical properties of host load.
\newblock {\em Scientific Programming 7}, 3,4 (1999).
\newblock A version of this paper is also available as {CMU} Technical Report
  {CMU-CS-TR-98-175}. A much earlier version appears in {LCR '98} and as
  {CMU-CS-TR-98-143}.

\bibitem{DINDA-THESIS}
{\sc Dinda, P.~A.}
\newblock {\em Resource Signal Prediction and Its Application to Real-time
  Scheduling Advisors}.
\newblock PhD thesis, School of Computer Science, Carnegie Mellon University,
  May 2000.
\newblock Available as Carnegie Mellon University Computer Science Department
  Technical Report CMU-CS-00-131.

\bibitem{DINDA-LOAD-PRED-HPDC-99}
{\sc Dinda, P.~A., and O'Hallaron, D.~R.}
\newblock An evaluation of linear models for host load prediction.
\newblock In {\em Proceedings of the 8th {IEEE} International Symposium on High
  Performance Distributed Computing ({HPDC '99})\/} (August 1999), pp.~87--96.
\newblock Extended version available as {CMU} Technical Report
  {CMU-CS-TR-98-148}.

\bibitem{EXPLOIT-PROC-LIFETIME-DIST-DYN-LB-SIGMETRICS}
{\sc Harchol-Balter, M., and Downey, A.~B.}
\newblock Exploiting process lifetime distributions for dynamic load balancing.
\newblock In {\em Proceedings of {ACM} {SIGMETRICS} '96\/} (May 1996),
  pp.~13--24.

\bibitem{MEHRA-WORKLOAD-GEN-LB-IEEE-PADT-95}
{\sc Mehra, P., and Wah, B.}
\newblock Synthetic workload generation for load-balancing experiments.
\newblock {\em {IEEE} Parallel and Distributed Technology 3}, 2 (Summer 1995),
  4--19.

\bibitem{NOBLE-TRACE-MODULATION-SIGCOMM97}
{\sc Noble, B.~D., Satyanarayanan, M., Nguyen, G.~T., and Katz, R.~H.}
\newblock Trace-based mobile network emulation.
\newblock In {\em Proceedings of the ACM SIGCOMM Conference\/} (Cannes, France,
  September 1997).

\bibitem{LINGER-LONGER-KYUNG-HOLLINGSWORTH-SC98}
{\sc Ryu, K.~D., and Hollingsworth, J.~K.}
\newblock Fine-grain cycle stealing for networks of workstations.
\newblock In {\em Proceedings of {ACM/IEEE} {SC98} (Supercomputing '98)\/}
  (November 1998), pp.~801--821.

\bibitem{SIEGELL-DYNAMIC-LB-94}
{\sc Siegell, B., and Steenkiste, P.}
\newblock Automatic generation of parallel programs with dynamic load
  balancing.
\newblock In {\em Proceedings of the Third International Symposium on
  High-Performance Distributed Computing\/} (August 1994), pp.~166--175.

\bibitem{SELF-SIM-NETWORK-TRAFFIC-TAQQU-JOURNAL-95}
{\sc Willinger, W., Taqqu, M.~S., Leland, W.~E., and Wilson, D.~V.}
\newblock Self-similarity in high-speed packet traffic: Analysis and modeling
  of ethernet traffic measurements.
\newblock {\em Statistical Science 10}, 1 (January 1995), 67--85.

\bibitem{WOLSKI-PRED-CPU-AVAIL-HPDC-99}
{\sc Wolski, R., Spring, N., and Hayes, J.}
\newblock Predicting the {CPU} availability of time-shared {Unix} systems.
\newblock In {\em Proceedings of the Eighth {IEEE} Symposium on High
  Performance Distributed Computing {HPDC99}\/} (August 1999), {IEEE},
  pp.~105--112.
\newblock Earlier version available as UCSD Technical Report Number CS98-602.

\end{thebibliography}


\end{document}
