% THINGS TO ADD:

\documentstyle[12pt,twocolumn,psfig]{article}
%\documentstyle[12pt,psfig]{article}
%\documentstyle[twocolumn,psfig]{article}
%\documentstyle[psfig]{article}

%\psdraft

%\input{colt.bib}



% FORMAT SPECIFICATIONS

% My attempt at fitting 40 lines onto each page. (Use w/12pt) Yucks.
\columnsep=0.25in
\setlength{\topmargin}{0.4in}
\setlength{\headsep}{0pt}
\setlength{\headheight}{0pt}
\setlength{\textheight}{8.0in}
\setlength{\textwidth}{6.8in} 
\setlength{\oddsidemargin}{0in}
\addtolength{\leftmargin}{-0.6in}

\iffalse
% Original formatting commands (as of 12/12/95)
\columnsep=0.25in
\setlength{\topmargin}{-0.15in}
\setlength{\headsep}{0pt}
\setlength{\headheight}{0pt}
\setlength{\textheight}{9.0in}
\setlength{\textwidth}{6.8in} 
\setlength{\oddsidemargin}{0in}
\addtolength{\leftmargin}{-0.6in}
\fi

\iffalse
\columnsep=0.25in
\setlength{\topmargin}{-0.250in}
\setlength{\headsep}{0pt}
\setlength{\headheight}{0pt}
\setlength{\textheight}{8.8in}
\setlength{\textwidth}{6.75in} %\setlength{\textwidth}{6in}
\setlength{\oddsidemargin}{0in}
\addtolength{\leftmargin}{-0.5in}
% \renewcommand{\baselinestretch}{0.95}
\fi

\begin{document}
%\begin{onecolumn}

\newlength{\linelength}
\setlength{\linelength}{0.35\textwidth}

\pagenumbering{arabic}

\def\Proof {{\bf Proof: \enspace}}
\def\argmin {{\rm argmin}}
\def\hb {\hfil\break}
\def\entropy {{\cal H}}
\def\implies {\Rightarrow}
%\def\ehat {\hat\epsilon}
\def\ehat {\hat\varepsilon}
\def\nhat {\hat{n}}
\def\ehatbar{\overline{\hat\varepsilon}}
\def\iid {{\it i.i.d.}}
\def\Pr {{\rm Pr}}
\def\E {{\rm E}}
\def\nopt {{n_{\it opt}}}
\def\nopthat {\widehat{n_{\it opt}}}
\def\testeq {\stackrel{?}{=}}
\def\hstar {h^{\ast}}
\def\kopt {{k_{\it opt}}}
\def\kopthat {\widehat{\kopt}}


\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{conjecture}[theorem]{Conjecture}
\newtheorem{observation}{Interesting Observation}

\title{Preventing ``Overfitting'' of Test Data \\
          in Hypothesis Selection  
	\\ {\tenbf (DRAFT: Please do not distribute.)} 
		}

\author{
Andrew Y. Ng
\\
\makebox[\linelength]{Carnegie Mellon University}\\
\makebox[\linelength]{Pittsburgh, Pennsylvania} 
%\\
%\makebox[\linelength]{Andrew.Ng@cs.cmu.edu}
%\and
%Michael Kearns\\
%\makebox[\linelength]{AT\&T Bell Laboratories}\\
%\makebox[\linelength]{Murray Hill, New Jersey}
}

\date{\today}
\maketitle
\thispagestyle{empty}

%---------------------------------------------------------------------
% Main text starts here. 
% START

\begin{abstract}

When selecting a hypothesis from a set of hypotheses for a
learning task, it is common practice to evaluate them on 
a test set, and then to select the hypothesis that had the 
lowest test error. But when the test set is partially corrupted 
such as by noise, ``folklore'' also warns about overfitting 
the test data. In this paper, we explain how this phenomenon occurs, 
and show the surprising result that it can be overcome 
by selecting a hypothesis with a {\em higher} test-error, over others
with lower test error. 
We give two reasons for not picking the 
hypothesis with the lowest test error, and also propose a new 
algorithm, LOOCVCV, that uses a computationally efficient form 
of leave--one--out cross validation to select such a hypothesis. 
Finally, we present experimental results for one domain that show LOOCVCV 
consistently beating picking the hypothesis with the lowest test 
error, even when using reasonably large test sets.
\end{abstract}

\section{Introduction}
		\label{section-intro}	

When selecting a hypothesis out of a set of hypotheses 
for a learning task, it is common practice to test all of them 
on some set of previously unseen test data, and then to pick the 
hypothesis that had the smallest test error \cite{stone74,stone78}.
For example, we may generate a set of hypotheses by training
multiple Neural Networks 
with Backpropagation on a fixed set of training data, with each 
Neural Network having different initial weights, and then pick
the Neural Network that had the lowest error on a set of 
hold-out test data. But if the test data is partially corrupted 
by noise, then ``folklore'' also warns that if we used too small a 
set of test data to test too large a number of hypotheses, we may
end up picking a poor hypothesis that fit the corrupted test data well 
``just by chance.''

In this paper, we examine this problem of ``overfitting'' of test 
data. Our contributions are two-fold: First, we explain how 
this overfitting phenomenon occurs, and show the surprising 
result that we can do better than picking the hypothesis 
with the lowest test error. Second, we present a new algorithm, 
{\em Leave-one-out Cross Validated Cross Validation} (LOOCVCV), 
to choose such a hypothesis, and show experimentally that it 
consistently beats picking the minimum--test--error hypothesis in 
one domain. 

% HERE
%[These results also extend to using Cross-Validation for model selection.]

\section{Definitions}
		\label{section-definitions}
Henceforth, for simplicity, let us assume a boolean target function $f$.
Let $H=\{h_1, h_2, \ldots , h_n\}$ be the set of $n$ hypotheses that 
we are choosing from, and let us also assume that each hypothesis was 
drawn \iid\/ from some fixed distribution of hypotheses $D_H$. Also, let 
$S = \{(x_1,y_1), (x_2,y_2), \ldots, (x_m, y_m)\}$ be the set of $m$ samples
forming the test data, where each $x_i$ is an arbitrary-dimensioned input 
vector drawn \iid\/ from some distribution $D$, and $y_i \in \{0,1\}$,
the labels of the data, have independently been corrupted 
with some fixed {\em noise rate} $\eta$, so
that $y_i = f(x_i)$ with probability $1-\eta$, and $y_i = \neg f(x_i)$ with
probability $\eta$. We shall also write $p_i = (x_i, y_i)$, so that 
$S = \{p_1, p_2, \ldots p_m\}$, and let us also define 
$S_{-i} = S \setminus p_i$.
Furthermore, we shall say that hypothesis $h_i$ 
{\em apparently misclassifies} some element of the test data 
$(x_j,y_j)$ if $h_i(x_j) \neq y_j$, and that it 
{\em truly misclassifies} it if $h_i(x_j) \neq f(x_j)$. 
Let the test error $\ehat(h_i) \in [0,1]$ be the fraction of the test data 
that hypothesis $h_i$ apparently 
misclassifies, and let the generalization error (with respect to
uncorrupted data) of each hypothesis $h_i$ 
be $\varepsilon(h_i) = {\rm Pr}_{x\in D}[h_i(x) \neq f(x)]$
(following the PAC framework).
Finally, the hypothesis $h_i$ in a set $H$ of size $n$ 
that has the lowest test error $\ehat(h_i)$
will be called the ``best--of--$n$ hypothesis.''

Then, informally, the goal of hypothesis selection is to choose a 
hypothesis $h$ such that $\varepsilon(h)$ is expected to be ``small,''
and the conventional way of attempting to do so is choosing the 
best--of--$n$ hypothesis.

\section{Overfitting by a Simulated Learning Algorithm}
		\label{section-simulated-algorithm}

As mentioned in Section~\ref{section-intro}, when we choose the 
hypothesis $h$ that has the lowest $\ehat(h)$, we may be choosing a
poor hypothesis that had fit the test data well ``just by chance.''
For example, if $\ehat(h) = 0$ so that it apparently classifies 
the test data perfectly, but some of the test data's labels have been
corrupted, then the hypothesis must actually be {\em truly misclassifying} 
the corrupted samples in the test data, and must therefore
have some non-zero (and possibly large) generalization error. 

\subsection{A Simulated Learning Algorithm}

To examine the overfitting effect, let us first look at a 
model of a learning algorithm, that, while unrealistic in some
ways, will help isolate reasons for the overfitting.
Our simulated learning algorithm and learning task have the 
following characteristics:

\begin{itemize}
  \item We have a black box that draws successive hypotheses $h_i$ \iid\ 
		from $D_H$, which is such that 
		$\varepsilon(h_i) \sim {\rm Uniform}(0,1)$.
  \item We have $m=100$ test samples, and exactly 20 of these samples
		have had their labels corrupted by noise.
  \item Each hypothesis $h_i$ has an {\em independent}\/ $\varepsilon(h_i)$
		chance of misclassifying each of the $m$ test samples,
		where the independence is across both hypotheses and test
		samples.
\end{itemize}

So, to simulate generating and testing a hypothesis with a given
$\varepsilon(h_i)$, we would flip a 0--1 coin with bias 
$\varepsilon(h_i)$ 100 times, negate the result of (say) the 
last 20 flips, and finally calculate the fraction of resulting
`1's to get $\ehat(h_i)$. Note also that of the assumptions above, 
the one of uncorrelated errors across hypotheses is probably the most 
unrealistic, but we include it to have a simple and analyzable 
model of a learning algorithm.
%And before proceeding, we also wish to note that we also found 
%empirically that our results are only very weakly dependant on the 
%that $\varepsilon(h_i)$ is distributed ${\rm Uniform}(0,1)$. 
The goal then, is to
generate some number of hypotheses, test them on the $100$-sample 
test data, and then to somehow pick a hypothesis $h$ so that 
${\rm E}[\varepsilon(h)]$, the expected generalization 
error of $h$, is ``small.''

\subsection{Why Picking the ``Best'' is Bad}
	\label{subsection-why-best-bad}

In this section, we give two reasons why picking the
``apparent best'' hypothesis may not be optimal when the test
data's labels have been corrupted by noise:
\begin{itemize}
  \item Lower $\ehat(h) \protect\not\implies$ 
			Lower ${\rm E}[\varepsilon(h)|\ehat(h)]$,
		because of differing {\em variances} of
		$\ehat(h)|\varepsilon(h)$'s.
  \item If errors between the hypotheses are uncorrelated, then some
		``mean'' hypothesis, that somehow agrees with most of the 
		other hypotheses, can be expected to do well. And at least 
		on one metric (to be explained), the best--of--$n$ 
		hypothesis is maximally different from doing this.
\end{itemize}

\subsubsection{Lower Test Error \protect$\not\implies$ 
				Lower Generalization Error}
	\label{section-lower-not-imply}

To motivate the rest of this discussion, let us first consider a simpler
distribution of hypotheses that shows this effect clearly. Again, let 
us assume that exactly $20$ of the $100$ test data labels were corrupted, 
and now let us also assume $D_H$ is such that each hypothesis $h$ has 
either $\varepsilon(h) = 0.00$ or $\varepsilon(h) = 0.10$. Now, let 
us consider the particular case of choosing between two 
hypotheses $h_1$ and $h_2$, where $\ehat(h_1) = 0.17$, and 
$\ehat(h_2) = 0.20$. Suppose we chose $h_1$. Then, we know that 
we must necessarily have 
chosen a hypothesis with $\varepsilon(h) = 0.10$, because if 
$\varepsilon(h) = 0.00$, then $h$ would have classified all of 
the test sample correctly with respect to the true target function, 
and would therefore have had $\ehat(h) = 0.20$ because $20$ elements of 
the test sample were corrupted. 
On the other hand, if we chose $h_2$ with $\ehat(h_2) = 0.20$, then 
we have a non-zero chance of picking a hypothesis with 
$\varepsilon(h) = 0.00$. Hence, 
$E[\varepsilon(h)|\ehat(h)=0.17] > E[\varepsilon(h)|\ehat(h)=0.20]$ 
for this case. 

A slightly more general explanation for the hypothesis with 
lower $\ehat(h)$ not having lower $E[\varepsilon(h)|\ehat(h)]$ is in
the differing {\em variances} of $\ehat(h)$. 
$\ehat(h)$, as a random variable dependent on $\varepsilon(h)$, is 
such that  $\E[\ehat(h)|\,\varepsilon(h)]$ monotonically increases with 
$\varepsilon(h)$ as expected; but, its 
variance ${\rm Var}(\ehat(h) |\, \varepsilon(h))$
also increases as a function of $\varepsilon(h)$ (at least
for $\varepsilon(h) \in [0,0.5]$). For example, if $\varepsilon(h) = 0$, 
then $\ehat(h) = 0.20$ always, so that
${\rm Var}(\ehat(h)|\,\varepsilon(h)=0) = 0$; 
but ${\rm Var}(\ehat(h)|\,\varepsilon(h)=0.5) = 0.25 > 0$.
So, in spite of hypotheses with lower $\varepsilon(h)$'s having 
lower ${\rm E}[\ehat(h)|\varepsilon(h)]$'s, it is possible for them
to have a {\em smaller} probability of having ``extremely low'' 
$\ehat(h)$'s ``by chance,'' because their distributions of $\ehat(h)$ 
are less spread-out (lower variance). And for 
certain priors such as the simplified $D_H$ above, this leads to the
effect of 
${\rm low}\: \ehat(h) \not\implies {\rm low}\: \E[\varepsilon(h)|\,\ehat(h)]$,
which causes the overfitting phenomenon. 

Finally, let us return to the original model. Recall that 
exactly $20$ of the $100$ test data labels were corrupted. Using 
this and that $\varepsilon(h_i) \sim {\rm Uniform}(0,1)$,
we can calculate, for each value of $\ehatbar$, the 
posterior distribution of $\varepsilon(h) | \ehat(h) = \ehatbar$:
\begin{eqnarray}
f(\varepsilon(h) | \ehat(h)=\ehatbar) &=&
		\frac{f_{\ehat|\varepsilon}(\ehat(h)=\ehatbar|\varepsilon(h)) f_\varepsilon(\varepsilon(h))}
			{f_{\ehat}(\ehat(h)=\ehatbar)}  \nonumber \\
		&=& k f_{\ehat|\varepsilon}(\ehat(h)=\ehatbar|\varepsilon(h)) 
					\nonumber \\
		&=& k \sum_{i=0}^{100\ehatbar} (B(80,\varepsilon(h), i) 
					\nonumber \\
		& & \hbox to 0.05in{} B(20, 1-\varepsilon(h), 100\ehatbar-i) )
					\nonumber
\end{eqnarray}
\noindent
where $k$ is a normalizing constant that makes the integral of 
the probability distribution function equal 1, and
%				\left(\begin{array}{c}80\\i\end{array}\right) 
%			\varepsilon(h)^i (1-\varepsilon(h))^{80-i} \nonumber \\
%&&				\left(\begin{array}{c}20\\100\ehatbar-i\end{array}\right)
%				(1-\varepsilon(h))^{100\ehatbar-i} \nonumber\\
%&&				\varepsilon(h)^{20-(100\ehatbar-i)} \nonumber
%
% Original displaymath-style definition of B(.,.,.)
%\begin{displaymath}
%	B(n,p,x) = 
%		\left(\begin{array}{c}n\\x\end{array}\right) p^x (1-p)^{n-x} 
%		\nonumber	
%\end{displaymath}
%
$B(n,p,x) = {n \choose x}\, p^x (1-p)^{n-x}$
%\noindent
is the probability of a ${\rm Binomial}(n,p)$ distribution returning $x$
(so that $i$ in the summation is the number of apparent misclassifications
that are also true misclassifications). 
These posterior distributions 
of $\varepsilon(h)$ for $\ehatbar = 0.17$, $0.20$,\/ and $0.23$ were
evaluated numerically, and are shown in 
Figures~\ref{figure-posterior17}\/--\/\ref{figure-posterior23}.
From the figures, we see that if we picked a hypothesis 
with $\ehat(h)=0.17$ or $\ehat(h)=0.23$, we can
expect the hypothesis to have higher generalization error than one 
with $\ehat(h)=0.20$. It was, of course, expected that picking 
a hypothesis with test error $0.20$ beat picking one with $0.23$, 
but it was surprising that picking a hypothesis with a {\em higher} error 
on the test data ($\ehat(h)=0.20$ rather than $0.17$) could, on average,
result in a hypothesis with {\em lower} generalization error.

The reason that this was unexpected is that we expect hypotheses 
with higher generalization error to have higher test error.  
The point is that {\em the reverse is not necessarily true,}\/ and 
hypotheses with higher test error do not necessarily have higher
expected generalization error. 
That is, ${\rm low\ }\: \varepsilon(h) \implies 
	{\rm low\ }\: \E[\ehat(h)|\,\varepsilon(h)]$, 
but ${\rm low\ }\: \ehat(h) \not\implies 
	{\rm low\ }\: \E[\varepsilon(h)|\,\ehat(h)]$.

\subsubsection{Agreement between hypotheses}

Suppose that we were told that the noise rate $\eta$ in
the test data was 0.5, so that we are, from an information
standpoint, given only an unlabelled test sample
(i.e.\ only the $x_i$'s) as the $y_i$'s contain no information. Then, 
informally, it seems reasonable to pick some hypothesis that somehow
``agrees'' with the other hypotheses on the classifications it makes, 
with the intention that if errors of the hypotheses are uncorrelated,
then some ``mean'' hypothesis may have lower expected
generalization error than one chosen randomly from the pool.

However, in picking the best--of--$n$ hypothesis, we are doing exactly
the opposite of this: We are picking the hypothesis that has the lowest
test error, and which therefore {\em maximally disagrees with all the other 
hypotheses on how many labels in the test sample are incorrect}. 

In our experimental results (not reported),
%in the decision--tree domain
%we describe in Section~\ref{section-experiment}, 
we found that, 
even with moderately high noise rates in the test data, this can 
lead to picking hypotheses whose expected generalization errors are 
significantly higher than picking a hypothesis at random from the pool.
%which, with a sufficiently high $\eta$, in turn does worse than picking 
%the hypothesis with median test error $\ehat(h)$ (which may {\em not} 
%be a 50\%--error hypothesis).

This also suggests that another natural heuristic to use in hypothesis 
selection is to pick a hypothesis that agrees with the other hypotheses
maximally in some sense. However, we consider this issue to be 
somewhat orthogonal to our problem of hypothesis selection using 
test data, and shall not explore it further in this paper.


%\section{An Alternative to Choosing the ``Best''} 
\section{Not Choosing the ``Best''} 
	\label{section-alternative}

Because of this effect of ``overfitting'' of test data, 
``folklore'' suggests generating only a limited number 
of hypotheses using the black--box, and then picking the one 
that has the lowest test error $\ehat(h)$. The rationale is that
if we generate too few hypotheses, then we might not generate 
any good hypotheses at all; whereas if we generate too many, we 
increase the risk of generating poor hypotheses that fit the test 
data well ``by chance,'' and might overfit the test data.  So, 
the optimal number of hypotheses to generate should be between
these two extremes, and we shall call this number $\nopt$.

However, this algorithm seems sub-optimal at best. For example, 
it is not clear for most problems how $\nopt$ can be found. 
In addition, if we knew the value of $\nopt$, but have already 
generated more than $\nopt$ hypotheses, then it is unclear if 
we should throw-away all but $\nopt$ of them, or if there is 
some better way to use the extra hypotheses. Finally, we would 
also like an algorithm that asymptotically improves as it is 
given more hypotheses.

The problem with generating ``too many'' hypotheses was that those
with very low test errors were actually ``overfitting,'' and had high 
generalization error. Using the fact that, of any large set of 
hypotheses drawn from the fixed distribution of hypotheses,
the fraction of them that ``overfit'' should be approximately 
constant, we propose the following paradigm instead, which we
call {\em percentile-cv}:

%HERE
%[should probably either explain the name (cv), or call it something
%else.]

\begin{enumerate}
   \item Generate as many hypotheses as is computationally feasible
   \item Test them all on the test data, and sort them in descending 
		order of $\ehat(h_i)$
   \item For some $k$, pick the hypothesis in the $k$\/-th percentile 
		in the list of sorted 
		hypotheses. (i.e. if we have $n$ hypotheses, then 
		choose the one that was ranked 
		$\left\lceil kn/100 \right\rceil$ in the sorted list.) 
\end{enumerate}

Note that we have introduced a new variable, $k$, which we have not 
specified a way to choose yet, but also no longer need 
to guess the value of $\nopt$. 

The motivation behind this is that when we sort the hypotheses 
as in Step 2 above, we hope that the hypotheses are 
approximately sorted into the ``truly bad'' 
(high $\varepsilon$, high $\ehat$), followed by the ``good'' 
(low $\varepsilon$, low $\ehat$), followed by the ``bad, but 
overfit test data by chance'' (high $\varepsilon$, very low $\ehat$). 
Then, if we chose $k$ well, we would pick something between 
the ``truly bad'' and ``bad, but overfit test data by chance,'' 
and choose a good hypothesis. Note, however, that we will be 
{\em choosing a hypothesis with higher test error over some
others with lower test errors}.

Also, as the number of hypotheses generated increases and tends 
towards $\infty$, since the hypotheses are drawn from some fixed 
distribution, some fixed proportions of them will be ``truly bad,'' 
``good,'' and ``bad, but overfit,'' and so we will still be choosing 
a hypothesis that is ``good.''

So, for example, in the example of the simulated algorithm 
described in Section~\ref{section-simulated-algorithm}, we hope 
that $k$ will be chosen such that the $k$\/-th percentile hypothesis 
is one that has $\ehat(h)=0.20$.

We now present a result that further justifies using percentile-cv, 
which generates a large set of hypotheses and picks some $k$-th 
percentile hypothesis from it, instead of  
best--of--$n$, which generates some finite set of $n$ hypotheses 
and picks the one with the lowest test error from that set. 
The following theorem states 
that percentile-cv can always do at least as well as best--of--$n$, 
and can therefore be considered a ``superset'' of best--of--$n$:

\begin{theorem}  \label{theorem-percentile-win}
$\exists k$ such that, $\forall n$, choosing the $k$-th percentile 
hypothesis from a (sufficiently) large set of hypotheses always gives 
an expected generalization error no higher than picking the 
``apparent best'' of $n$ randomly generated hypotheses. 
\end{theorem}
{\bf Proof (Sketch):} Picking the best--of--$n$ is equivalent
to the following in terms of performance: (1) Generating a set $H$ of $n$ 
hypotheses, noting down the lowest test error 
$\ehat_0 = {\rm min}_{h \in H}\; \ehat(h)$, and (2) generating fresh 
hypotheses until one has the same test error of $\ehat_0$, and 
outputting that hypothesis, which we shall call $h_{\rm f}$. 
%Now, step (1) chooses $\ehat_0$, and step (2) draws and outputs a 
%hypothesis from the distribution $D_H | \ehat(h) = \ehat_0$. 
Now, step (2) is simply drawing $h_{\rm f}$ from the conditional 
distribution $h \in D_H | \ehat(h)=\ehat_0$; and of the possible 
$\ehat_0$'s that could be chosen, one is ``best'' in terms of giving
the conditional distribution that minimizes 
${\rm E}[\varepsilon(h_{\rm f})| \ehat_0]$ during this step.
%which draws $h$ from the distribution $D_H | \ehat(h)=\ehat_0$. 
Call this $\ehat_{\rm opt}$.
Then, $\exists k$ such that the $k$-th percentile hypothesis
will have $\ehat(h) = \ehat_{\rm opt}$ with probability 1. So, $\forall n$,
$\exists k$ such that $k$-th percentile (not strictly) beats apparent 
best of $n$, so some optimal $k$ must beat all $n$, which is 
the statement of the theorem.

\medskip

\noindent
\begin{corollary}  \label{corollary-percentile-strict-win}
If ${\rm E}[\varepsilon(h_{\rm f})| \ehat_0]$ is not constant over 
all possible $\ehat_0$, $\exists k$ such that $\forall n$, the
expected generalization error of the $k$-th percentile hypothesis 
is strictly less than that of best--of--$n$ hypothesis.
\end{corollary}

\section{Choosing a Percentile}
	\label{section-choose-percentile}

Using the conventional hypothesis selection method, we 
had to estimate $\nopt$, the number of hypotheses to generate
before picking the one of this set that had the lowest test error. 
In this section, we propose an algorithm, 
{\em Leave-one-out Cross Validated Cross Validation} (LOOCVCV), 
that first finds an estimate $\nhat$ of $\nopt$, chooses 
$k$ from that, and finally applies the percentile-cv algorithm described 
in Section~\ref{section-alternative} with that $k$. We will 
first show how to choose $k$ from $\nopt$ or an estimate $\nhat$ 
of it, and then how to choose $\nhat$.

% HERE
%[Here: Explain why it's called LOOCVCV. Better yet, give it a better
% name!] 

\subsection{Choosing $k$ from $\nopt$}
	\label{subsection-k-from-nopt}

Since the hypotheses $h_i$\/'s are drawn from some fixed 
distribution, their test errors, $\ehat(h_i)$ are also 
drawn from some fixed distribution. Hence, if we 
generate $\nopt$ hypotheses and pick the one with the lowest test
error, then to a good approximation, we'd expect to pick a hypothesis 
whose test error just falls into the top $1/(\nopt+1)$ fraction of all
hypotheses drawn from our distribution of $\ehat$\/'s. 
We therefore propose using the following value of $k$\/:

\begin{equation}
  k = 100 \cdot (1- \frac{1}{\nopt+1})   \label{eqn-k-from-nopt}
\end{equation}

% HERE
%[Might be clearer if we got rid of the constant 100 everywhere,
% and made it fraction-cv or something instead. (?)]

Using notation from the proof of Theorem~\ref{theorem-percentile-win}, 
we can also think of picking the best--of--$n$ hypothesis as a way 
of choosing $\ehat_0$ (and then picking a hypothesis $h$ with 
$\ehat(h) = \ehat_0$). Now, suppose we already have a good 
estimate $\nhat$ of $\nopt$\/; then a natural alternative to 
picking the apparent best of $\nhat$ is to pick the $k$-th percentile 
hypothesis, where $k = 100(1-(1/(\nhat+1)))$, with the advantage 
of doing this being reducing the {\em variance} of $\ehat_0$, 
and hopefully the ``variance'' of the hypothesis we pick as well. 
Later, we will also give experimental results that demonstrate that
this can do significantly better than picking the best--of--$\nhat$.

\subsection{Estimating $\nopt$}

We will only sketch the main ideas of the algorithm here,
and will provide the details in the Appendix.
We shall use Leave-one-out Cross Validation (LOOCV) as follows: 
For each value of $\nhat$, we want a measure of 
the generalization error of the best--of--$\nhat$ 
hypothesis. So, for each sample $p_i \in S$, we shall ``leave-out'' 
$p_i$ and use $S_{-i}$ only 
to choose a hypothesis $\hstar_{i,\nhat}$ from a set of $\nhat$ randomly
generated hypotheses, and then test that hypothesis on 
$p_i$ and see if $\hstar_{i,\nhat}(x_i) \testeq y_i$. Then, the number of
values of $i$ for which $\hstar_{i,\nhat}(x_i) \neq y_i$, 
which we call the {\em LOOCV-error}, is a measure of the 
generalization error of the best--of--$\nhat$ hypothesis.

And by running LOOCV using different values of $\nhat$, we can get 
a measure of the LOOCV-error for each value of $\nhat$, and 
use the value of $\nhat$ that minimizes the LOOCV-error.


% HERE
%[Incidentally, doing LOOCV directly on the ``desired'' error doesn't 
%work, for not-so-obvious reasons. The main problem of this is
%that the estimate of the error of the hypothesis that got, say, 19 wrong,
%will be completely determined by the number of hypothesis that got 19
%wrong, and the number of hypothesis that got 20 wrong. Unless we do
%something clever, it'll also always pick the hypothesis that had the 
%{\em highest} test error (yes!). Maybe we can do LOOCV on the percentile, 
%though (emphasis on the ``Maybe''), and do something similar to the 
%bootstrap done in LOOCVCV (described below). But, the math doesn't really 
%work out as nicely as it does for LOOCVCV, and I'm not sure that
%we can really to do the ``simulate bootstrap'' thing (also described 
%below).]

The problem with the current formulation of this algorithm 
is that choosing the best--of--$\nhat$ hypothesis is a noisy process, as it
involves generating randomly chosen hypotheses, so the entire process
has to be repeated many times for each value of $\nhat$ to get estimates
of the LOOCV-error. This will, unfortunately, generally require 
generating an intractable number of hypotheses. So, instead, we shall
only generate a large but finite set $H$ of hypotheses, and choose uniformly 
with replacement from this set whenever we need to generate a hypothesis. 
(In the Statistics literature, this process is called
{\em non-parametric bootstrap} \cite{efron79}.)
However, even doing this, we found that we would still require an 
intractable number of repetitions to get a smooth curve for the 
LOOCV-error as a function of $\nhat$. Fortunately, there is
a way of calculating explicitly, from $\nhat$, $H$, and $S$, the LOOCV-error 
value we would get if we really repeated the process an infinite 
number of times, and this is what LOOCVCV does. (Details in appendix.)

So, for every value of $\nhat$, we now have a way of estimating 
the generalization error if we were to generate $\nhat$ hypotheses and 
pick the one of the $\nhat$ that minimizes $\ehat(h)$. We can now
use the value of $\nhat$ that minimizes LOOCV-error as our estimate of
$\nopt$, and use Equation~\ref{eqn-k-from-nopt} to derive $k$, our
desired percentile, which we can then use to apply percentile-cv. 
This completes our formulation of the LOOCVCV algorithm.


\section{Results Using the Simulated Learning Algorithm}
		\label{section-results-simulated-algo}	

We ask the reader to refer to Figure~\ref{figure-simulated-results},
which shows results of different model selection methods applied to
the Simulated Learning Algorithm 
described in Section~\ref{section-simulated-algorithm}.
The value of $n$ varies along the $x$-axis, and the top-most curve
is the expected generalization error $\varepsilon$ if we draw $n$
hypotheses and pick the one with the lowest test error. The middle,
step-like line is the expected generalization error if we picked the 
$k$\/-th percentile hypothesis, using $k = 100 (1- (1/n))$ as in 
Equation \ref{eqn-k-from-nopt}, using an infinitely large set of
hypotheses. Finally, the bottom horizontal line is the generalization 
error that LOOCVCV achieves.

% HERE
%[top curve: Actually generated by bootstrap on 60000 hypotheses. Should
%  be pretty accurate. middle: data from 2,000,000 simulated hypotheses. 
%  Should be very accurate.]

For this particular problem, $\nopt \approx 103$. Notice, however, that 
even if we chose $\nopt$ hypotheses and picked the one of them that had
the lowest test error, we would expect to get a higher generalization
error of about 0.035 than if we picked the $k=100(1 - (1/(103+1))) \approx 99.0$\/-th 
percentile hypothesis from an ``infinitely'' large sample of hypotheses,
which gives an expected generalization error of about 0.025 --- lower
by almost 30\% of the original error. The reason for 
this is that with the former method, the test error
$\ehat(h)$ of the hypothesis we choose in the end has non-zero variance,
which is worse by our argument of Section~\ref{section-lower-not-imply}.
That is, $\E[\varepsilon(h) | \ehat(h) = \overline{\hat\varepsilon}]$ 
is minimized when $\ehatbar = 0.20$. However, in the process of
drawing $\nopt$ hypotheses and picking the one with lowest test error,
we may occasionally pick a hypothesis with other values of $\ehatbar$,
which is why its expected generalization error is higher. On the other
hand, in the infinitely large sample, the probability that the $99.0$\/-th
percentile hypothesis has $\ehat = 0.20$ is $1$, which is why picking
the $99.0$\/-th percentile hypothesis results in a lower expected
generalization error: There is less variance in the test error of 
the hypothesis we pick.

Similarly, note too that, for most values of $n$, picking the 
$k$\/-th percentile hypothesis (the middle curve) beats picking the 
apparent best of $n$ (the top curve). Again, we argue that this is 
due to the same reason: picking a fixed percentile results in lower 
variance in the test error of the chosen hypothesis.
earlier argument that, even if we did not want to apply LOOCVCV, but 
had an estimate $\nhat$ of $\nopt$, then we may be better 
picking the $100(1-(1/(\nhat+1)))$\/-th percentile hypothesis than 
the best--of--$\nhat$ hypothesis.

Finally, we note that when LOOCVCV is given an ``infinitely''
large set of hypotheses for this problem, it will always achieve an 
expected generalization error of about 0.025, which is provably
the best that any algorithm using percentile-cv can achieve on this problem.

\section{Experimental Results}
	\label{section-experiment}

To examine a slightly more realistic hypothesis selection problem, 
we considered a small learning task involving decision trees. The target
function was a boolean function over binary attributes $x_0$ to 
$x_4$, with $f(x_0,x_1,x_2,x_3,x_4) = (x_0 \wedge x_1 \wedge x_2) \vee 
(x_3 \wedge x_4)$. The input distribution $D$ was uniform over the 
instance space of size 32, 
and $n = 1000$ hypotheses were generated to form the set $H$.
Finally, each hypothesis was generated by drawing $20$ samples 
from $D$ with their correct labels, and running an ID3-like decision 
tree algorithm using a greedy information-gain heuristic to choose 
splits \cite{quinlan86}. (Of course, no practitioner would ever see 
such a distribution of hypotheses; but, we are interested only in 
the problem of selecting 
a hypothesis out of a set, and this was a convenient way of 
creating such a set of hypotheses.) 

All experimental results reported in this section are the averages of 
400 runs. First, using $m = 20$ test samples, the results for different 
noise rates $\eta$ in the test data are
in Figure~\ref{figure-xy20}. For noise rates
of about $\eta=0.1$ and higher, picking the best--of--1000 hypotheses
loses to LOOCVCV. Note too that picking the apparent
best could potentially do even worse than these results suggest,
if we had generated more than the 1000 hypotheses we had 
restricted ourselves to for computational reasons.

One might think that this effect would go away with a larger
test sample. However, it was still significant when the size
of the test sample was increased to 40 (and note that the size of the 
instance space is 32), keeping everything else the same as before
(Figure~\ref{figure-xy40}). Also, we tried 
varying the test-sample size across the larger range of 
5 to 80, using a fixed noise rate $\eta$ of 0.3, and the results 
(Figure~\ref{figure-xytest}) also show LOOCVCV doing better consistently.

Next, to allow us to perform certain calculations explicitly, such 
as finding the true $\nopt$, we carried out the next few 
experiments by first drawing 1000 hypotheses from $D$. Then, to generate 
each hypothesis afterward, we sampled with 
replacement from the pool of 1000 each time, instead of from $D$. 
(We also verified that these results are close to those we would have 
obtained if we had used $D$.)

Recall that in Section~\ref{subsection-k-from-nopt}, we claimed that
if we had an estimate of a good $\nhat$ to use in picking best--of--$\nhat$,
we might instead wish to use the $k$-th percentile hypothesis,
where $k = 100 (1- (1/\nhat))$. Using $m=20$ test samples again, the top 
line of Figure~\ref{figure-xyopt} 
is the expected error if we knew exactly what the best $n$, $\nopt$, 
was and chose the best--of--$\nopt$. The middle curve is the error
of the $k$-th percentile hypothesis, where $k = 100(1-(1/ \nopt))$, and
we see that it consistently beats best--of--$\nopt$. Finally, the 
lowermost curve is if we knew the optimal $k$, $k_{\it opt}$, and chose
the $k_{\it opt}$-th percentile hypothesis, which, as predicted by
Theorem~\ref{theorem-percentile-win}, beats best--of--$\nopt$ also.

\section{Discussion}

From the results we have presented so far, it may appear 
that we should always use LOOCVCV for hypothesis 
selection, even when we do not expect the hypotheses to 
``overfit'' the test data. However, this is not necessarily so.

Let $\kopt$ be the value of $k$ that minimizes the expected
generalization error of the $k$-th percentile hypothesis.
Then, we can think of LOOCVCV as trying to estimate $\kopt$. 
Also, choosing the hypothesis with the lowest test error from $H$
is like approximating $\kopt$ with
$\kopthat = 100$, which while almost definitely a biased estimator 
of $\kopt$, is constant and so has no variance.
On the other hand, LOOCVCV's estimate of $\kopt$ is a function of the 
noisy test data, and has non-zero variance. So, even though 
it may be less biased than $\kopthat = 100$, it might be a worse
estimate of $\kopt$, because of its higher variance. In particular,
with noise rate $\eta = 0$, LOOCVCV generally loses
to best--of--$n$ (Figures~\ref{figure-xy20} and~\ref{figure-xy40}). Also,
some preliminary results (not reported) involving 
artificial neural networks trained by backpropagation on a fixed set of
training data with randomly re-initialized weights each time also
showed LOOCVCV losing slightly to best--of--$n$.

Leaving considerations of a loss function of $k$ aside, 
we might therefore decide not
to use LOOCVCV unless we have reason to believe that the ``overfitting''
problem is significant in the distribution of hypotheses in our
hypothesis selection problem, or that LOOCVCV will find a good
$\kopthat$ (such as when the test sample is large).
However, if we do have an exceedingly large number of hypotheses to choose 
from, and have a reasonably large test-set, then LOOCVCV may give
much better performance than simply selecting the hypothesis with the 
lowest test error.

%HERE
%[mention MDL alternative?]

\section{Conclusions} 

This paper explained how overfitting of test data occurs, and 
presented percentile-cv and LOOCVCV to try to correct this 
problem. It also proved that percentile-cv with an appropriately 
chosen percentile will always do at least as well as best--of--$n$, 
and demonstrated experimentally that LOOCVCV consistently beats 
best--of--$n$ in one domain. However, there are also domains
where overfitting of test data is not a significant problem, in which 
LOOCVCV loses slightly to best--of--$n$; the characterization of
such domains is an open problem.

% START-OBSERVATIONS
%\section{Other Random Stuff} 

%\begin{observation} 
%If $\E[\varepsilon(h)|\,\ehat(h)]$ as a function of $\ehat(h)$ is convex,
%and $\ehatbar$ is the expected $\ehat(h)$ under a certain algorithm (such
%as ``best of $\nopt$ hypotheses''), then we can expect to always do at 
%least as well or better by picking a hypothesis with $\ehat(h) = \ehatbar$
%(if one exists).
%\end{observation}
%\Proof By Jensen's inequality. \hb
%However, there may not be any such hypothesis. (i.e. if 100 test samples, and
%$\ehatbar = 0.203$.) So, maybe we can pick the nearest one (0.20 in this 
%case)? But that won't guarantee always doing better. So, as an alternative to
%LOOCVCV, how about somehow choosing randomly from the two nearest ones (0.20 and 0.21) 
%instead? I think this can be made to always do at least as well/better. 
%Empirically, however, percentile-cv almost always chooses something very 
%similar to this. (That is, given n, if we look at 
%$E[\ehat({\mbox{\rm best of n hypotheses}})]$
%and the $\ehat$ that percentile-cv chooses using $k$ as in Equation~\ref{eqn-k-from-nopt},
%they will almost always be very close.) So, does this serve as another justification for 
%percentile-cv? (Incidentally, percentile-cv does {\em not} always beat 
%``best of $n$'', using $k$ and $n$ related as in 
%Equation~\ref{eqn-k-from-nopt}.)
%\hb
%
%
%\begin{observation} 
%From Figure~\ref{figure-simulated-results}, our loss if we overestimate $\nopt$ is
%much smaller than if we underestimated it by the same amount. 
%\end{observation}
%Maybe if we can prove that it'll always be so, using only simple assumptions.
%Anyway, might be worth mentioning somewhere, since if we aren't sure what $\nopt$ is, it's
%better to ``guess high'' than to ``guess low.'' Also, should add something
%about ``guessing high'' into LOOCVCV?
%
%\begin{observation} 
%Yet another advantage of LOOCVCV: It allows things like ``$\nopt = 3.5$,'' where
%it'll simply figure $k$ out from that, and pick a hypothesis.
%\end{observation}
%(Incidentally, the mathematical formulas used in LOOCVCV have a natural way of 
%allowed $n$ to be treated as a continuous real variable, rather than as an integer.
%So, this is how we might get non-integral $\nopt$ from it.)

%END-OBSERVATIONS

\section*{Acknowledgments}

(blah)
%I give warm thanks to Michael Kearns for many interesting and helpful 
%discussions, and Avrim Blum for his input on writing this paper.

\bibliographystyle{plain}
\bibliography{cv-bib}

\section*{Appendix: The LOOCVCV Algorithm}

%\noindent
%\subsection*{The LOOCVCV Algorithm}
We ask the reader to refer to Section~\ref{section-choose-percentile}
for the details of the LOOCVCV algorithm. Here, we describe how it 
finds an estimate $\hat{n}$ of $\nopt$ using $H$, from which we will 
derive $k$, for picking the $k$-th percentile hypothesis.
%(In practice, 
%we pick the $k$-th percentile hypothesis within $H$, rather than
%drawing a fresh set.) 
\hb
{\bf Input:} $\hat{n}$, $H[1..n]$, $S$, $i$ 	\hb
{\bf Output:} leave-one-out error with $p_i$ ``left-out,''
		for best of $\hat{n}$ hypotheses drawn from $H$	\hb
{\bf Notes:} The idea of this function is to sort $H$ in order of leave-one-out
	$\ehat$'s, and to calculate, for each $h \in H$, the {\em probability}
	that it will be the lowest--test--error hypothesis in a set of 
	$\nhat$ drawn from $H$ (step 11 below).  \hb
%Also, $I$ is the indicator
%	function, so $I(A) = 1$ if $A=$ True, 0 otherwise.\hb
\noindent
\def\err {{\rm err}}
\def\ebar {\overline{\rm e}}
\def\empt {\hbox to 0.0in{}}
1.  function loocv-error($\hat{n}$, $H[1..n]$, $S$, $i$) 	\hb
2.  \ \ \ \ \ create $H'[1..n].\{\ehat_{-1},e\}$		\hb
3.  \ \ \ \ \ for ($j = 1 {\rm \ to\ } n$) do begin 		\hb
4.  \ \ \ \ \ \ \ \ \ $H'[j].\ehat_{-1} = 			\hb
\empt \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ | \{H[j](x_k) \neq y_k | 
					(x_k, y_k) \in S_{-i}\} |$ \hb
%5.  \ \ \ \ \ \ \ \ \ $H'[j].e = I\{H[j](x_i) \neq y_i\}$ 	\hb
5.  \ \ \ \ \ \ \ \ \ $H'[j].e = 1$ if $H[j](x_i) \neq y_i$, $0$ o.w. 	\hb
6.  \ \ \ \ \ end 						\hb
7.  \ \ \ \ \ Sort $H'[]$ in ascending $H'[].\ehat_{-1}$	\hb
8.  \ \ \ \ \ for each value $x$ of $H'[j].\ehat_{-1}$, 		\hb
\empt \ \ \ \ \ \ \ \ \ \ \ let $\ebar(x)$ be the mean of the $H'[].e$'s \hb
\empt \ \ \ \ \ \ \ \ \ \ \ that have $H'[].\ehat_{-1} = x$,		\hb
9.  \ \ \ \ \ for ($j = 1 {\rm \ to\ } n$) do			\hb
10. \ \ \ \ \ \ \ \ \ $H'[j].e \leftarrow \ebar(H'[j].\ehat_{-1})$ \hb
11. return $\sum_{j=1}^{n} ((n-j+1)^{\hat{n}} - (n-j)^{\hat{n}}) H'[j].e$\hb
\hb
{\bf Input:} $H$, $S$						\hb
{\bf Output:} Recommended $\hat{n}$				\hb
12. function loocvcv($H$,$S$) \hb
13. return 							\hb
\empt \ \ \ \ \ \ ${\rm argmin}_{\nhat \in [1,\infty)} 
		\{ \sum_{i=1}^m \hbox{{\rm loocv-error}}(\nhat, H, S, i) \}$ \hb

%\psfig{figure=posterior.15.ps,width=3.25in,angle=270}

\begin{figure}[h]
  \psfig{figure=posterior.new.17.ps,width=3.25in,angle=270}
  \caption{posterior distribution for $\varepsilon(h) | \ehat(h) = 0.17$.
	    $\E[\varepsilon(h) | \ehat(h) = 0.17] \approx 0.0648$.
           See Section~\protect\ref{section-simulated-algorithm} for details.}
  \label{figure-posterior17}
\end{figure}

\begin{figure}[h]
  \psfig{figure=posterior.new.20.ps,width=3.25in,angle=270}
  \caption{posterior distribution for $\varepsilon(h) | \ehat(h) = 0.20$.
	    $\E[\varepsilon(h) | \ehat(h) = 0.20] \approx 0.0253$.
           See Section~\protect\ref{section-simulated-algorithm} for details.}
  \label{figure-posterior20}
\end{figure}

\begin{figure}[h]
  \psfig{figure=posterior.new.23.ps,width=3.25in,angle=270}
  \caption{posterior distribution for $\varepsilon(h) | \ehat(h) = 0.23$.
	    $\E[\varepsilon(h) | \ehat(h) = 0.23] \approx 0.0727$.
           See Section~\protect\ref{section-simulated-algorithm} for details.}
  \label{figure-posterior23}
\end{figure}

\begin{figure}[h]
  \psfig{figure=num60000.com.ps,width=3.25in,angle=270}
  \caption{$n$ varies along the $x$-axis, and the top-most curve is
            the expected generalization error if we draw $n$ hypotheses 
            and pick the one with the lowest test error. The middle,
            step-like line is the expected error if we picked the 
            $k$\/-th percentile hypothesis, using $k = 100 (1- (1/n))$.
	    The bottom line is the generalization error that LOOCVCV achieves.}
  \label{figure-simulated-results}
\end{figure}

\begin{figure}[h]
  \psfig{figure=/afs/andrew/usr/an2i/learn-5/postscripts/xy20.ps0,width=3.25in,angle=270}
  \caption{Noise in test data on $x$-axis, generalization error on $y$-axis.
	m=20. Vertical dashes show 95\% confidence intervals for means. }
  \label{figure-xy20}
\end{figure}

\begin{figure}[h]
  \psfig{figure=/afs/andrew/usr/an2i/learn-5/postscripts/xy40.ps0,width=3.25in,angle=270}
  \caption{Noise in test data on $x$-axis, generalization error on $y$-axis.
	m=40. Vertical dashes show 95\% confidence intervals for means. }
  \label{figure-xy40}
\end{figure}

\begin{figure}[h]
  \psfig{figure=/afs/andrew/usr/an2i/learn-5/postscripts/xytest.ps0,width=3.25in,angle=270}
  \caption{m (number of test-data samples, log scale) on $x$-axis,
	generalization error on $y$-axis.
	Uppermost curve is ``apparent best'' of 1000, lower curve is LOOCVCV.}
  \label{figure-xytest}
\end{figure}

\begin{figure}[h]
  \psfig{figure=/afs/andrew/usr/an2i/learn-5/postscripts/xyoptint.ps0,width=3.25in,angle=270}
  \caption{Noise in test data on $x$-axis, generalization error on $y$-axis.
	Uppermost curve is best--of--$\nopt$, middle curve is the
	$k$-th percentile hypothesis, where $k$ was derived from $\nopt$ as 
	in (\protect\ref{eqn-k-from-nopt}), bottom curve is $k_{\it opt}$-th percentile 
	hypothesis where $k_{\it opt}$ is the optimal percentile.}
  \label{figure-xyopt}
\end{figure}

\newpage

%\bibliographystyle{plain}
%\bibliography{cv-bib}

\end{document}
