%% LyX 1.3 created this file.  For more info, see http://www.lyx.org/.
%% Do not edit unless you really know what you are doing.
\documentclass[english]{amsart}
\usepackage[T1]{fontenc}
\usepackage{graphicx}

\makeatletter

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% LyX specific LaTeX commands.
%% Because html converters don't know tabularnewline
\providecommand{\tabularnewline}{\\}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Textclass specific LaTeX commands.
 \theoremstyle{plain}    
 \newtheorem{thm}{Theorem}[section]
 \numberwithin{equation}{section} %% Comment out for sequentially-numbered
 \numberwithin{figure}{section} %% Comment out for sequentially-numbered
 \theoremstyle{plain}
 \theoremstyle{definition}
  \newtheorem{example}[thm]{Example}
 \theoremstyle{definition}
 \newtheorem{defn}[thm]{Definition}
 \theoremstyle{definition}
  \newtheorem*{example*}{Example}
 \theoremstyle{plain}    
 \newtheorem{cor}[thm]{Corollary} %%Delete [thm] to re-start numbering
 \theoremstyle{plain}    
 \newtheorem{lem}[thm]{Lemma} %%Delete [thm] to re-start numbering

\usepackage{babel}
\makeatother
\begin{document}

\title{Tutorial on Practical Sample Complexity for Classification}


\author{John Langford, IBM Research}

\begin{abstract}
We discuss basic sample complexity theory and it's impact on classification
success evaluation, implications for learning algorithm design, and
uses in learning algorithm execution. This tutorial is meant to be
a comprehensive compilation of all results which are theoretically
rigorous and practically useful. 

There are a two important implications of the results presented here: 

(1) Common practices for reporting results in classification should
change to use the test set bound.

(2) Train set bounds can sometimes be used to directly motivate learning
algorithms. 
\end{abstract}
\maketitle

\section{Introduction}

Classifiers are functions which partition a set into two elements
(the set of rainy days and the set of sunny days). Classifiers are
the most simple nontrivial decision making element so studying the
theory of learning classifiers is very fundamental to studying the
theory of learning, in general. Classifiers are sufficiently complex
that many phenomena observed in machine learning (theoretically or
experimentally) can be observed in the classification setting. Yet,
classifiers are simple enough to make their analysis easy to understand.
This combination of sufficient complexity to capture phenomena and
maximal simplicity makes the study of classifiers especially fruitful. 

The goal of this paper is an introduction to the theory of sample
complexity for classification. Many of these results have been presented
elsewhere, although the style and manner of presentation may be new.
This is a tutorial, so we limit our presentation to those results
which are both theoretically sound and practically useful.

There are several important aspects of learning which the theory here
casts light on. Perhaps the most important of these is the problem
of performance reporting for classifiers. Many people use some form
of empirical variance to estimate upper and lower bounds. This is
an error-prone practice, and the test set bound in section \ref{sec-test}
implies a better method by nearly any metric. Hopefully, this will
become common practice. 

After discussing the test set bound we cover the Occam's Razor bound,
the simplest train set bound, which explains (and quantifies) the
common phenomena of overfitting in simplest form. We also prove that
the Occam's Razor bound can not be improved by any constant factor
and apply the bound to decision trees.

Next, we discuss two train set bounds, the PAC-Bayes bound and the
Sample Compression bound, which have proved to give practical results
for more general classifiers, such as Support Vector Machines and
Neural Networks.

All of the results here should be easily approachable and understandable.
The proofs are simple, and examples are given. Pointers to related
work are also given. 

It is important to note that all of the results presented here fall
in the realm of classical statistics. In particular, all randomizations
are over draws of the data, and our results have the form of confidence
intervals. 

The layout of this document is as follows:

\begin{itemize}
\item Section \ref{sec-formal} presents the formal model 
\item Section \ref{sec-test} presents the test set bound
\item Section \ref{sec-train} presents the Occam's Razor bound
\item Section \ref{sec:PAC-Bayes-Bound} presents the PAC-Bayes bound
\item Section \ref{sec:Sparsity-Bound} presents the sparsity bound
\end{itemize}
The formal model and test set bound must be understood in order to
appreciate all later results. There is no particular dependency between
the various train set bounds we present.


\section{Formal Model}

\label{sec-formal}

There are many somewhat arbitrary choices of learning model. The one
we use can (at best) be motivated by it's simplicity. Other models
such as the online learning model \cite{Online}, PAC learning \cite{Valiant},
and the uniform convergence model \cite{Vapnik} differ in formulation,
generality, and in the scope of addressable questions. The strongest
motivation for studying the sample complexity model here is simplicity
and corresponding generality of results. Appendix section \ref{sec-model}
discusses the connections between various models.


\subsection{Basic quantities}

We are concerned with a learning model in which examples of (input,
output) pairs come independently from some unknown distribution. The
goal is to find a function capable of predicting the output given
the input. There are several mathematical objects we work with.

\vspace{0.3cm}
\begin{center}\begin{tabular}{|c|c|}
\hline 
Object&
Description\tabularnewline
\hline
\hline 
$X$&
The (arbitrary) space of the input to a classifier\tabularnewline
\hline 
$Y=\{0,1\}$&
The output of a classification.\tabularnewline
\hline 
$D$&
An (unknown) distribution over $X\times Y$\tabularnewline
\hline 
$S$&
A set of examples drawn independently from $D$.\tabularnewline
\hline 
$m$&
$=|S|$ the number of examples\tabularnewline
\hline 
$c$&
A function mapping $X$ to $Y$\tabularnewline
\hline
\end{tabular}\end{center}
\vspace{0.3cm}

There are several aberrations of this model from other (perhaps more
familiar) models. There is no mention of a classifier space, because
the results do not depend upon a classifier space. Also, the notion
of a distribution on $X\times Y$ is strictly more general than the
{}``target concept'' model which assumes that there exists some
function $f:X\rightarrow Y$ used to generate the label \cite{Valiant}.
In particular we can model probabilistic learning problems which do
not have a particular $Y$ value for each $X$ value. This generalization
is essentially {}``free'' in the sense that it does not add to the
complexity of presenting the results.

It is worth noting that the \emph{only} assumption we make is the
ability to draw independent examples from $D$. The strength of all
the results which follow rests upon the correctness of this assumption. 

Sometimes, we decorate these objects with labels like $S_{\textrm{train}}$
(a train set) or $S_{\textrm{test}}$ (a test set). These decorations
should always be clear. 

\begin{example}
Weather prediction: Will it rain today or not? In this case $X=$
barometric pressure, observations of cloud cover or other sensory
input and $Y=0$ if the prediction is {}``no rain'' and $1$ otherwise.
The distribution $D$ is over sensory inputs and outcomes. The sample
set $S$, might consist of $m=100$ (observation, outcome) pairs such
as (pressure low, cloudy, rain), (pressure high, cloudy, not rain),
etc. A classifier, $c$, is any function which predicts {}``rain''
or {}``not rain'' based upon the observation.

Note that the independence assumption here is not perfectly satisfied
although it seems to be a reasonable approximation. In any application
of this theory, it must be carefully judged whether the independence
assumption holds or not. 
\end{example}

\subsection{Derived quantities}

There are several derived quantities which the results are stated
in terms of. 

\begin{defn}
(True Error) The true error $c_{D}$ of a classifier $c$ is defined
as the probability that the classifier errs:\[
c_{D}\equiv\Pr_{(x,y)\sim D}(c(x)\neq y)\]

\end{defn}
The true error is sometimes called the {}``generalization error''.
Unfortunately, the true error is not an observable quantity in our
model because the distribution, $D$, is unknown. However, there is
a related quantity which is observable. 

\begin{defn}
(Empirical Error) Given a sample set $S$, the \emph{empirical error},
$\hat{c}_{S}$ is the observed rate of errors:\[
\hat{c}_{S}\equiv\Pr_{(x,y)\sim U(S)}(c(x)\neq y)=\frac{1}{m}\sum_{i=1}^{m}I(c(x_{i})\neq y_{i})\]
 where $I()$ is a function which maps {}``true'' to $1$ and {}``false''
to $0$. Also, $\Pr_{U(S)}(...)$ is a probability taken with respect
to the uniform distribution over the set of examples, $S$.
\end{defn}
The empirical error is sometimes called the {}``training error'',
{}``test error'', or {}``observed error'' depending on whether
it is the error rate on a trainig set, test set, or a more general
set.

\begin{example*}
(continued) The classifier $c$ which always predicts {}``not rain''
might have an empirical error of $\frac{38}{100}$ and an unknown
true error rate (which might in fact be $0.5$). 
\end{example*}

\subsection{Addressable questions}

Given the true error, $c_{D}$ of a classifier $c$ we can precisely
describe the distribution of success and failure on future examples
drawn according to $D$. This quantity is derived from the unknown
distribution $D$, so our effort is directed toward upper and lower
bounding the value of $c_{D}$ for a classifier $c$. 

The variations in all of the bounds that we present are related to
the method of choosing a classifier, $c$. We cover two types of bounds:

\begin{enumerate}
\item Test: Use examples in a test set which were not used in picking $c$. 
\item Train: Use examples for both choosing $c$ and evaluating $c$. 
\end{enumerate}
These methods are addressed in the next two sections.

It is worth noting that one question that \emph{cannot} be addressed
in this model is {}``Can learning occur for my problem?''. Extra
assumptions (as in \cite{Valiant} \cite{Vapnik}) are inherently
necessary. 


\section{The Test Set Method}

\label{sec-test}

The simplest bound arises for the classical technique of using $m$
fresh examples to evaluate a classifier. This section is organized
into two subsections:

\begin{itemize}
\item Subsection \ref{subsec-test-bound} Presents the basic upper bound
on the true error rate, handy approximations, and a lower bound
\item Subsection \ref{subsec-test-implication} Discusses the implications
of the test set bound on error reporting practice. A better method
for error reporting is applied to several datasets and the results
are shown.
\end{itemize}

\subsection{The Bound}

\label{subsec-test-bound}

Before stating the bound, we note a few basic observations which make
the results less surprising.%
\begin{figure}
\begin{center}\includegraphics[%
  angle=270]{single_binomial.ps}\end{center}


\caption{\label{fig-binom} A depiction of the Binomial distribution. The
cumulative of the Binomial is the area under the curve up to some
point on the horizontal axis.}
\end{figure}
The principle observable quantity is the empirical error ($\hat{c}_{S}$)
of a classifier. What is the distribution of the empirical error for
a fixed classifier? For each example, our independence assumption
implies the probability that the classifier errs is given by the true
error, $c_{D}$. This can be modeled by a biased coin flip: heads
if you are right and tails if you are wrong. 

Let us call the bias of the coin $p\equiv c_{D}$. What is the probability
of observing $k$ heads out of $m$ coin flips? This is a very familiar
distribution in statistics called the Binomial and so it should be
unsurprising that the bounds presented here are fundamentally dependent
upon the cumulative distribution of a Binomial. 

\begin{defn}
(Binomial Tail Distribution) \[
\textrm{Bin}\left(\frac{k}{m},p\right)\equiv\Pr_{X_{1},...X_{m}\sim p^{m}}\left(\sum_{i=1}^{m}X_{i}\leq k|p\right)=\sum_{j=0}^{k}\binom{m}{j}p^{j}(1-p)^{m-j}\]
= the probability that $m$ coins with bias $p$ produce $k$ or fewer
heads.

A depiction of the Binomial distribution is given in figure \ref{fig-binom}.
\end{defn}
For the learning problem, we always choose $p\equiv c_{D}$ and $X_{i}=$error
on the $i$th example. With these definitions, we can interpret the
Binomial tail as the probability of an empirical error greater than
or equal to $\frac{k}{m}$. 

Since we are interested in calculating a bound on the true error given
a confidence, $\delta$ and an empirical error $\hat{a}_{S}(c)$ it
is handy to define the inversion of a Binomial tail.

\begin{defn}
(Binomial Tail Inversion) \[
\overline{\textrm{Bin}}\left(\frac{k}{m},\delta\right)\equiv\max_{p}\left\{ p:\,\,\textrm{Bin}\left(\frac{k}{m},p\right)\geq\delta\right\} \]
= the largest true error such that the probability of observing $\frac{k}{m}$
or more {}``heads'' is at least $\delta$.
\end{defn}
With these definitions finished, the results are all very simple statements. 

\begin{thm}
\label{th-hb}(Test Set Sample Complexity) For all classifiers, $c$,
for all $\delta\in(0,1]$\[
\Pr_{S\sim D^{m}}\left(c_{D}\leq\overline{\textrm{Bin}}\left(\hat{c}_{S},\delta\right)\right)\geq1-\delta\]

\end{thm}
\begin{proof}
(pictorially in \ref{fig-proof}) The proof is just a simple identification
with the Binomial. For any distribution over $(x,y)$ pairs and any
classifier, $c$, there exists some probability, $c_{D}$, that the
classifier predicts correctly. We can regard this event as a coin
flip with bias $c_{D}$. Since each example is picked independently,
the distribution of the empirical error is a Binomial distribution.
Assuming (correctly with probability $1-\delta$) that the empirical
error is not in the Binomial tail, we can constrain (and therefore
bound) the value of the true error. 
\end{proof}
%
\begin{figure}
\includegraphics[%
  width=0.35\columnwidth,
  angle=270]{binomials.ps}\includegraphics[%
  width=0.35\columnwidth,
  angle=270]{cut_binomials.ps}

\includegraphics[%
  width=0.35\columnwidth,
  angle=270]{consistent_cut_binomials.ps}\includegraphics[%
  width=0.35\columnwidth,
  angle=270]{worst_cut.ps}


\caption{\label{fig-proof} A graphical depiction of the test set bound. The
first graph depicts several possible Binomials given their true error
rates. The second depicts several Binomials, each with a tail cut.
The third figure shows the Binomials consistent with the tail cut
and observed test error. The worst case over all true error rates
is the consistent Binomial with the largest bias. }
\end{figure}
 

The test set sample complexity bound is, essentially, perfectly tight.
For any classifier with a sufficiently large true error, the bound
is violated exactly a $\delta$ portion of the time. 


\subsubsection{Corollaries}

\label{sub:Corollaries}

There are several immediate corollaries of the holdout theorem (\ref{th-hb})
which are more convenient when a computer is not handy. The first
corollary applies to the limited {}``realizable'' setting where
you happen to observe $0$ test errors.

\begin{cor}
(Realizable Holdout Sample Complexity) For all classifiers, $c$,
for all $\delta\in(0,1]$\[
\Pr_{S\sim D^{m}}\left(\hat{c}_{S}=0\Rightarrow c_{D}\leq\frac{\ln\frac{1}{\delta}}{m}\right)\geq1-\delta\]

\end{cor}
\begin{proof}
Specializing theorem \ref{th-hb} to the zero empirical error case,
we get:\[
\textrm{Bin}\left(\frac{0}{m},\epsilon\right)=(1-\epsilon)^{m}\leq e^{-\epsilon m}\]
Setting this equal to $\delta$ and solving for $\epsilon$ gives
us the result.
\end{proof}
A second corollary applies to all results, not just those where we
observe perfect true error.

\begin{cor}
(Agnostic Holdout Sample Complexity) For all classifiers, $c$, for
all $\delta\in(0,1]$\[
\Pr_{S\sim D^{m}}\left(c_{D}\leq\hat{c}_{S}+\sqrt{\frac{\ln\frac{1}{\delta}}{2m}}\right)\geq1-\delta\]

\end{cor}
\begin{proof}
Loosening theorem \ref{th-hb} with the Hoeffding approximation for
$\frac{k}{m}<c_{D}$ we get:\[
\textrm{Bin}\left(\frac{k}{m},c_{D}\right)\leq e^{-2m(c_{D}-\frac{k}{m})^{2}}\]
 setting this equal to $\delta$, and solving for $\epsilon$ gives
the result.
\end{proof}
There has been some confusion elsewhere caused by differences between
results in the realizable and the agnostic case. The differences are
fundamental, and are related to the decrease in the variance of a
Binomial as the bias (i.e. true error) approaches $1$. Note that
this implies using the exact Binomial tail calculation can result
in \emph{functional} (rather than merely constant) improvements on
the above corollary. We can state a corollary incorporating both regimes
using the Chernoff approximation.

\begin{cor}
(Agnostic Holdout Sample Complexity II) For all classifiers, $c$,
for all $\delta\in(0,1]$\[
\Pr_{S\sim D^{m}}\left(\textrm{KL}\left(\hat{c}_{S}||c_{D}\right)\leq\frac{\ln\frac{1}{\delta}}{m}\right)\geq1-\delta\]
where $\textrm{KL}(q||p)=q\ln\frac{q}{p}+(1-q)\ln\frac{1-q}{1-p}$
is the Kullback-Leibler divergence between two coins of bias $q,p$
with $q<p$.
\end{cor}
\begin{proof}
Loosening theorem \ref{th-hb} with the Chernoff approximation for
$\frac{k}{m}<c_{D}$ we get:\[
\textrm{Bin}\left(\frac{k}{m},c_{D}\right)\leq e^{-m\textrm{KL}\left(\frac{k}{m}||c_{D}\right)}\]
 setting this equal to $\delta$, and solving for $\epsilon$ gives
the result.
\end{proof}

\subsubsection{A Test Set Lower Bound}

The true error can be lower bounded using a symmetric application
of the same techniques.

\begin{thm}
\label{th-lb}(Test Set Lower Bound) For all classifiers, $c$, for
all $\delta\in(0,1]$\[
\Pr_{S\sim D^{m}}\left(c_{D}\geq\min_{p}\left\{ p:\,\,1-\textrm{Bin}\left(\hat{c}_{S},p\right)\geq\delta\right\} \right)\geq1-\delta\]

\end{thm}
The proof is completely symmetric. Note that both bounds hold with
probability $1-2\delta$ since $\Pr(A\textrm{ or }B)\leq\Pr(A)+\Pr(B)$.
This is particularly convenient if the Hoeffding approximation is
used in both directions to get: \[
\forall c\,\,\Pr_{S\sim D^{m}}\left(|c_{D}-\hat{c}_{S}|\leq\sqrt{\frac{\ln\frac{2}{\delta}}{2m}}\right)\geq1-\delta\]


\begin{example*}
(continued) let $\delta=0.1$. Using the the Hoeffding bound with
$\hat{e}_{s}(c)=\frac{38}{100}$, we get the high confidence interval
$c_{D}\in[0.26,0.50]$. Using an exact calculation for the Binomial
tail, we get: $c_{D}\in[0.30,0.47]$. In general, as the observed
error moves toward $0$, the exact calculation becomes much tighter
than the Hoeffding approximation. 
\end{example*}

\subsubsection{The state of the art}

Although the test set bound is very well understood, the same can
not be said of other testing methods. Only weak general results in
this model are known for some variants of cross validation (see \cite{Progressive}).
For specific learning algorithms (such as nearest neighbor), stronger
results are known (see \cite{Devroye}). There are a wide range of
essentially unanalyzed methods and a successful analysis seems particularly
tricky although very worthwhile if completed. 


\subsection{Test Set Bound Implications}

\label{subsec-test-implication}

There are some common practices in machine learning which can be improved
by application of the test set bound. When attempting to calculate
a confidence interval on the true error rate given the holdout set,
many people follow a standard statistical prescription:

\begin{enumerate}
\item Calculate the empirical mean $\hat{\mu}=\hat{c}_{S_{\textrm{test}}}=\frac{1}{m}\sum_{i=1}^{m}I(h(x_{i})\neq y_{i})$.
\item Calculate the empirical variance $\hat{\sigma}^{2}=\frac{1}{m-1}\sum_{i=1}^{m}(I(c(x_{i})=y_{i})-\hat{\mu})^{2}$.
\item Pretend that the distribution is Gaussian with the above variance
and construct a confidence interval by cutting the tails of the Gaussian
cumulative distribution at the $2\hat{\sigma}$ (or some other) point.
\end{enumerate}
This approach is motivated by the fact that for any \emph{fixed} true
error rate, the distribution of the observed accuracy behaves like
a gaussian \emph{asymptotically.} Here, asymptotically means {}``in
the limit as the number of test examples goes to infinity''.

%
\begin{figure}
\begin{center}\includegraphics[%
  width=0.90\columnwidth,
  angle=270]{test_vs_two_sigma.ps}\end{center}


\caption{\label{fig-two-sigma-test}This is a graph of the confidence intervals
implied by the holdout bound (\ref{th-hb}) on the left, and the approximate
confidence intervals implied using the common two sigma rule motivated
by asymptotic normality on the right. The upper bounds of the holdout
bound have $\delta=0.025$ failure rate, so as to be comparable with
the $2$-sigma approach. The holdout bound is better behaved as the
confidence interval is confined to the interval $[0,1]$ and is never
over-optimistic. }
\end{figure}
The problem with this approach is that it leads to fundamentally misleading
results as shown in figure \ref{fig-two-sigma-test}. This {}``misleading''
is both pessimistic and (much worse) optimistic. The pessimism can
be seen by intervals with boundaries less than $0$ or greater than
$1$ and the optimism by observing what happens when the observed
error rate is $0$. When we observe perfect classification, our confidence
interval should \emph{not} have size $0$ for any finite $m$. 

The basic problem with this approach is that the Binomial distribution
is not similar to a gaussian when the error rate is near $0$. Since
our goal is finding a classifier with a small true error, it is essential
that the means we use to evaluate classifiers work in this regime.
The test set bound can satisfy this requirement (and, in fact, operates
well for all true error regimes). 

\begin{enumerate}
\item The test set bound approach is \emph{never} optimistic. 
\item The test set bound based confidence interval always returns an upper
and lower bound in $[0,1]$.
\end{enumerate}
The $2\hat{\sigma}$ method is a relic of times when computational
effort was expensive. It is now simple and easy to calculate a bound
based upon the cumulative distribution of the Binomial (see \cite{bound}
for a program which does this). 

The test set bound can be thought of as a game where a {}``Learner''
attempts to convince a reasonable {}``Verifier'' of the amount of
learning which has occurred. Pictorially we can represent this as
in figure \ref{fig-holdout-protocol}.%
\begin{figure}
\begin{center}\includegraphics[%
  width=0.90\columnwidth]{test_set.eps}\end{center}


\caption{\label{fig-holdout-protocol} For this diagram {}``increasing time''
is pointing downwards. The only requirement for applying this bound
is that the learner must commit to a classifier without knowledge
of the test examples. A similar diagram for train set bounds is presented
later (and is somewhat more complicated). We can think of the bound
as a technique by which the {}``Learner'' can convince the {}``Verifier''
that learning has occurred (and the degree to which it has occurred).
Each of the proofs can be thought of as a communication protocol for
an interactive proof of learning by the Learner.}
\end{figure}
 

\section{The Occam's Razor Bound}

\label{sec-train}

Given that the bounds for the simple holdout technique work well,
why do we need to engage in further work? There is one serious drawback
to the holdout technique---application of the holdout technique requires
$m_{\textrm{test}}$ otherwise unused examples. This can strongly
degrade the value of the learned hypothesis because an extra $m_{\textrm{test}}$
examples for the training set increases the true error of the learned
hypothesis from $0$ to $0.5$ for some natural learning algorithm/learning
problem pairs. 

There is another reason why training set based bounds are important.
Many learning algorithms implicitly assume that the train set accuracy
{}``behaves like'' the true error in choosing the hypothesis. With
an inadequate number of training examples, there may be very little
relationship between the behavior of the train set accuracy and the
true error. Training set based bounds can be used \emph{in} the training
algorithm and can provide insight into the learning problem itself. 

This section is organized into three subsections.

\begin{enumerate}
\item Subsection \ref{subsec-train-set-upper} states and proves the Occam's
Razor bound.
\item Subsection \ref{sec-lower_upper} proves that the Occam's Razor bound
can not be improved in general. 
\item Subsection \ref{subsec-train-set-practice} discusses implications
of the Occam's Razor bound and shows results for it's application. 
\end{enumerate}

\subsection{The Occam's Razor bound}

\label{subsec-train-set-upper}

This bound in more approximate forms has appeared elsewhere \cite{BEHW}\cite{PB}.
We use {}``prior'' (with quotes) here because it is an arbitrary
probability distribution over classifiers and not necessarily a Bayesian
prior. The distinction is important, because the theory holds regardless
of whether or not a Bayesian prior is used. 

\begin{thm}
\label{th-ORB} (Occam's Razor Bound) For all {}``priors'' $P(c)$
over the classifiers, $c$, for all $\delta\in(0,1]$:\[
\Pr_{S\sim D^{m}}\left(\exists c:\,\, c_{D}\leq\overline{\textrm{Bin}}\left(\hat{c}_{S},\delta P(c)\right)\right)\geq1-\delta\]

\end{thm}
The application of the Occam's Razor bound is somewhat more complicated
than the application of the test set bound. Pictorially, the protocol
for bound application is given in figure \ref{fig-training-set}.
It is very important to notice that the {}``prior'' $P(c)$ must
be selected \emph{before} seeing the training examples.

%
\begin{figure}
\begin{center}\includegraphics[%
  width=0.90\columnwidth]{training_set.eps}\end{center}


\caption{\label{fig-training-set} In order to apply the train set bound it
is necessary that the choice of {}``prior'' be made before seeing
any training examples. Then, the bound is calculated based upon the
chosen classifier. Note that it \emph{is} {}``legal'' to chose the
classifier based upon the prior $P(c)$ as well as the empirical error
$\hat{c}_{S}$. }
\end{figure}


\begin{proof}
(pictorially in figure \ref{fig-train-proof}) The proof starts with
the test set bound:\[
\forall c\,\,\,\Pr_{S\sim D^{m}}\left(c_{D}\leq\overline{\textrm{Bin}}\left(\hat{c}_{S},\delta P(c)\right)\right)\geq1-\delta p(c)\]
Negating this statement, we get:\[
\forall c\,\,\,\Pr_{S\sim D^{m}}\left(c_{D}>\overline{\textrm{Bin}}\left(\hat{c}_{S},\delta P(c)\right)\right)<\delta p(c)\]
then, we apply the union bound in a nonuniform manner. The union bound
says that $\Pr(A\textrm{ or }B)\leq\Pr(A)+\Pr(B)$. Applying the union
bound to every classifier with a positive measure gives a total probability
of failure of\[
\sum_{c}\delta P(c)=\delta\sum_{c}P(c)=\delta\]
which implies\[
\Pr_{S\sim D^{m}}\left(\exists c:\,\,\, c_{D}>\overline{\textrm{Bin}}\left(\hat{c}_{S},\delta P(c)\right)\right)<\delta\]
Negating this again completes the proof. 
\end{proof}
%
\begin{figure}
\includegraphics[%
  width=0.30\columnwidth,
  angle=270]{nonuniform_binomials.ps}\includegraphics[%
  width=0.30\columnwidth,
  angle=270]{deep_cut_nonuniform_binomials.ps}

\includegraphics[%
  width=0.30\columnwidth,
  angle=270]{deep_cut_consistent_binomials.ps} \includegraphics[%
  width=0.30\columnwidth,
  angle=270]{deep_cut_bound.ps}


\caption{\label{fig-train-proof} The sequence of pictures is the pictorial
representation of the proof. The first figure shows a set of classifiers,
each with a tail cut of some varying depth. The second picture shows
an observed training error and the possible Binomial distributions
for a chosen classifier. The third picture shows the true errors which
are consistent with the observation and the tail cuts. The fourth
picture shows the true error bound. }
\end{figure}


\subsubsection{Occam's Razor Corollaries}

Just as with the test set bound, we can relax the Occam's Razor bound
(theorem \ref{th-ORB}) with the Hoeffding approximation to get a
somewhat more tractable expression. 

\begin{cor}
\label{th-horb} (Hoeffding Occam's Razor Bound) For all {}``priors''
$P(c)$ over classifiers, for all $\delta\in(0,1]$:\[
\Pr_{S\sim D^{m}}\left(\exists c:\,\, c_{D}\leq\hat{c}_{S}-\sqrt{\frac{\ln\frac{1}{P(c)}+\ln\frac{1}{\delta}}{2m}}\right)\geq1-\delta\]

\end{cor}
\begin{proof}
approximate the Binomial tail with the Hoeffding inequality. 
\end{proof}
Many people are more familiar with a degenerate form of this bound
where $P(c)=\frac{1}{|H|}$ and $H$ is some set of classifiers. In
that case, simply replace $\ln\frac{1}{P(c)}$ with $\ln|H|$. The
form presented here is chosen is both more general and necessary if
the train set bound is to be used in practice. 

Other corollaries as in section \ref{sub:Corollaries} exist for the
Occam's Razor bound. In general, just substitute $\delta\rightarrow\delta P(c)$.


\subsubsection{Occam's Razor Lower bound}

Just as for the test set bound, a lower bound of the same form applies.

\begin{thm}
\label{th-lorb} (Occam's Razor Lower Bound) For all {}``priors''
$P(c)$ over the classifiers, $c$, for all $\delta\in(0,1]$:\[
\Pr_{S\sim D^{m}}\left(\forall c:\,\, c_{D}\geq\min_{p}\{ p:\,\,1-\textrm{Bin}\left(\hat{c}_{S},p\right)\geq\delta P(c)\}\right)\geq1-\delta\]

\end{thm}
\begin{example*}
(continued) Suppose that instead of having $100$ test examples, we
had $100$ train examples. Also suppose that before seeing the train
examples, we committed to $P(c)=0.1$ for $c$ the constant classifier
which predicts {}``no rain''. Then, the Hoeffding approximations
of the upper and lower bound give the interval, $c_{D}\in[0.22,0.54]$.
With an exact calculation, we get $c_{D}\in[0.26,0.51]$.
\end{example*}

\subsubsection{The state of the art}

A very large amount of work has been done on train set bounds. In
addition to those included here, there is: 

\begin{enumerate}
\item Reinterpretations into the sample complexity model of uniform convergence
\cite{Vapnik} results for continuously parameterized classifiers.
\item Reinterpretations of PAC convergence \cite{Valiant} results.
\item Shell bounds \cite{Shell} which take advantage of the distribution
of true error rates on classifiers. 
\item Train and Test bounds \cite{TnT} which combine train set and test
set bounds. 
\end{enumerate}
Of this large amount of work only a small fraction has been shown
to be useful on real-world learning algorithm/learning problem pairs.
The looseness of train set based bounds often precludes analytical
use. 


\subsection{The Occam's Razor Bound is sometimes Tight}

\label{sec-lower_upper}

The question of tightness for train set bounds is important to address,
as many of them have been extremely loose. The simplest method to
address this tightness is constructive: exhibit a learning problem/algorithm
pair for which the bound is almost achieved. For the test set bound,
this is trivial as any classifier with a large enough true error will
achieve the bound. For the train set bound, this is not so trivial.

How tight is the Occam's Razor bound (\ref{th-ORB})? The answer is
\emph{sometimes} tight. In particular, we can exhibit a set of learning
problems where the Occam's Razor bound can not be made significantly
tighter as a function of the observables, $m$, $\delta$, $p(c)$,
and $\hat{e}(c)$. After fixing the value of these quantities we construct
a learning problem exhibiting this near equivalence to the Occam's
Razor bound.

\begin{thm}
\label{th-dhlub}(Occam's Razor tightness) For all $P(c)$, $\frac{k}{m}$,
$\delta$ there exists a learning problem and algorithm such that:
\[
\Pr_{S\sim D^{m}}\left(\exists c:\,\, c_{D}\geq\overline{\textrm{Bin}}\left(\hat{c}_{S},\delta P(c)\right)\right)\geq\delta-\delta^{2}\]
 Furthermore, if $c^{*}$ is the classifier with maximal training
accuracy, then: \[
\Pr_{S\sim D^{m}}\left(c_{D}^{*}\geq\overline{\textrm{Bin}}\left(\hat{c}_{S}^{*},\delta P(c)\right)\right)\geq\delta-\delta^{2}\]

\end{thm}
Intuitively, this theorem implies that we can not improve significantly
on the the Occam's Razor bound (theorem \ref{th-ORB}) without using
extra information about our learning problem. 

\begin{proof}
The proof is constructive: we create a learning problem on which large
deviations are likely. We start with a prior $P(c)$, probability
of error $\delta$, and $m$, and a targeted empirical error rate,
$\frac{k}{m}$. For succinctness we assume that $P(c)$ has support
on a finite set of size $n$. 

To define the learning problem, let: $X=\{0,1\}^{n}$ and $Y=\{0,1\}$. 

The distribution $D$ can be drawn by first selecting $Y$ with a
single unbiased coin flip, and then choosing the $i$th component
of the vector $X$ independently, $\Pr((X_{1},...,X_{n})|Y)=\Pi_{i=1}^{n}\Pr(X_{i}|Y)$
. The individual components are chosen so $\Pr(X_{i}=Y|Y)=\overline{\textrm{Bin}}\left(\frac{k}{m},\delta P(c)\right)$. 

The classifiers we consider just use one feature to make their classification:
$c_{i}(x)=x_{i}$. The true error of these classifiers is given by:
$e_{D}(c_{i})=\overline{\textrm{Bin}}\left(\frac{k}{m},\delta P(c)\right)$. 

This particular choice of true errors implies that if any classifier
has a too-small train error rate, then the classifier with minimal
train error must have a too-small train error. 

Using this learning problem, we know that:\[
\forall c,\forall\delta\in(0,1]:\,\,\Pr_{S\sim D^{m}}\left(c_{D}\geq\overline{\textrm{Bin}}\left(\hat{c}_{S},\delta P(c)\right)\right)=\delta P(c)\]
(negation)\[
\Rightarrow\forall c,\forall\delta\in(0,1]:\,\,\Pr_{S\sim D^{m}}\left(c_{D}<\overline{\textrm{Bin}}\left(\hat{c}_{S},\delta P(c)\right)\right)=1-\delta P(c)\]
(independence)\[
\Rightarrow\forall\delta\in(0,1]:\,\,\Pr_{S\sim D^{m}}\left(\forall c\,\, c_{D}<\overline{\textrm{Bin}}\left(\hat{c}_{S},\delta P(c)\right)\right)<\prod_{c}\left(1-\delta P(c)\right)\]
(negation)\[
\Rightarrow\forall\delta\in(0,1]:\,\,\Pr_{S\sim D^{m}}\left(\exists c\,\, c_{D}\geq\overline{\textrm{Bin}}\left(\hat{c}_{S},\delta P(c)\right)\right)\]
\[
=\sum_{i=1}^{n}\delta P(c_{i})\left(1-\Pr_{S\sim D^{m}}\left(\exists c\in\{ c_{1},...,c_{i-1}\}\,\,\, c_{D}\geq\overline{\textrm{Bin}}\left(\hat{c}_{S},\delta P(c)\right)\right)\right)\]
\[
\geq\sum_{i=1}^{n}\delta P(c_{i})(1-\delta)\]
\[
=\delta-\delta^{2}\]

\end{proof}
The lower bound theorem implies that we can not improve an Occam's
Razor like statement. However, it is important to note that large
improvements are possible if we use other sources of information.
To see this, just note the case where every single classifier happens
to be the same. In this case the {}``right'' bound would the be
the \emph{test} set bound, rather than the train set bound. The PAC-Bayes
bound and the Sample Compression bound presented in the next sections
use other sources of information.


\subsection{Train set bound implications }

\label{subsec-train-set-practice}


\subsubsection{results}

%
\begin{figure}
\begin{center}\includegraphics[%
  width=0.60\columnwidth,
  angle=270]{test_vs_micro.ps}\end{center}


\caption{\label{fig-microchoice-holdout}This is a plot comparing confidence
intervals built based upon the holdout bound (\ref{th-hb}) with an
80\%/20\% train/test split on the left and the Occam's Razor bound
(\ref{th-ORB}) with all data in the training set on the right. The
Occam's razor bound is sometimes superior on the smaller data sets
and always nonvacuous (in contrast to many other train set bounds). }
\end{figure}


The Occam's Razor bound is strongly related to compression. In particular,
for any self-terminating description language, $d(c)$, we can associate
a {}``prior'' $P(c)=2^{-|d(c)|}$ with the property that $\sum_{c}P(c)\leq1$.
Consequently, short description length hypotheses tends to have a
tighter convergence and the penalty term, $\ln\frac{1}{P(c)}$ is
the number of {}``nats'' (bits base e). For any language fixed before
seeing the train set, classifiers with shorter description lengths
have tighter bounds on the true error rate. One particularly useful
description language to consider is the execution trace of a learning
algorithm. If we carefully note the sequences of data-dependent choices
which a learning algorithm makes, then the output classifier can be
specified by a sequence such as {}``2nd choice, third choice, first
choice, etc...'' This is the idea behind microchoice bounds \cite{MC_journal}.
Results for this approach are reported in Figure \ref{fig-microchoice-holdout}
and are strong enough to act as an empirical existence proof that
Occam's Razor bounds can be made tight enough for useful application. 


\subsubsection{Learning algorithm design}

\emph{Every} train set bound implies a learning algorithm: choose
the classifier which minimizes the true error bound. This sounds like
a rich source of learning algorithms, but there are some severe caveats
to that statement. 

\begin{enumerate}
\item It is important to note that the form of a train set bound does \emph{not}
imply that this minimization is a good idea. Choosing between two
classifiers based upon their true error bound implies a better worst-case
bound on the true error. It does not imply an improved true error.
In many sitations, there is some other metric of comparison (such
as train error rate) which in fact creates better behavior in the
typical case. 
\item Another strong caveat is that, historically, train set bounds have
simply not been tight enough on real datasets for a nonvacuous application.
This is changing with new results, but more progress is necessary.
\item Often the optimization problem is simply not very tractable. In addition
to sample complexity, learning algorithms must be concerned with run
time and space usage. 
\end{enumerate}

\subsubsection{Philosophy}

Train set bounds teach us about ways in which verifiable learning
is possible, a subject which borders on philosophy. The train set
bound presented here essentially shows that a reasonable person will
be convinced of learning success when a short-description classifier
does well on train set data. The results here do \emph{not} imply
that this is the only way to convincingly learn. In fact, the (sometimes
large) looseness of the Occam's Razor bound suggests that other methods
for convincing learning processes exist. This observation is partially
shown by other train set bound results which are presented next.


\section{PAC-Bayes Bound}

\label{sec:PAC-Bayes-Bound}

The PAC-Bayes bound\cite{PB} is particularly exciting because it
can provide quantitatively useful results for classifiers with \emph{real
valued} parameters. This includes such commonly used classifiers as
Support Vector Machines and Neural Networks.%
\footnote{There is a caveat here---the bound only applies to stochastic versions
of the classifiers. However, the probability that the stochastic classifier
differs from the classifier can be made very small.%
} This section is divided into three parts:

\begin{enumerate}
\item Subsection \ref{sub:The-PAC-Bayes-Bound} States and proves the PAC-Bayes
Bound.
\item Subsection \ref{sub:How-Tight-is} shows that the PAC-Bayes Bound
is nearly as tight as possible given the observations.
\item Subsection \ref{sub:Application-of-the} discusses results from the
application of the PAC-Bayes bound to support vector machines.
\end{enumerate}

\subsection{The PAC-Bayes Bound}

\label{sub:The-PAC-Bayes-Bound}

The PAC-Bayes bound has been improved by tightening \cite{averaging}
and then with a much simpler proof \cite{Matthias} since it was originally
stated. The statement and proof presented here incorporate these improvements
and improve on them slightly.

The PAC-Bayes bound is dependent upon two derived quantities, an average
true error:\[
Q_{D}\equiv E_{c\sim Q}c_{D}\]
and an average train error:\[
\hat{Q}_{S}\equiv E_{c\sim Q}\hat{c}_{S}\]
These quantities can be interpreted as the train error and true error
of the metaclassifier which chooses a classifier according to $Q$
every time a classification is made. If we refer to this metaclassifier
as $Q$, the notation for error rates is consistent with our earlier
notation.

\begin{thm}
(PAC-Bayes Bound) For all {}``priors'' $P(c)$ over the classifiers,
$c$, for all $\delta\in(0,1]$:\[
\Pr_{S\sim D^{m}}\left(\forall Q(c):\,\,\textrm{KL}\left(\hat{Q}_{S}||Q_{D}\right)\leq\frac{\textrm{KL}(Q||P)+\ln\frac{m+1}{\delta}}{m}\right)\geq1-\delta\]
Here $\textrm{KL}(q||p)=q\ln\frac{q}{p}+(1-q)\ln\frac{1-q}{1-p}$
for $q<p$ and $\textrm{KL}(Q||P)=E_{c\sim Q}\ln\frac{Q(c)}{P(c)}$
is the Kullback-Leibler divergence between two distributions.
\end{thm}
Note that the PAC-Bayes bound applies to any \emph{distribution} over
classifiers. When $Q$ is concentrated on one classifier, we have
$\textrm{KL}(Q||P)=\ln\frac{1}{P(c)}$, just as in the Occam's razor
bound, with the only distinction being the additive $\frac{\ln(m+1)}{m}$
term. It is somewhat surprising that the bound holds for \emph{every}
distribution $Q$ with only the slight worsening by $\frac{\ln(m+1)}{m}$.

Since the KL-divergence applies to distributions over continuous valued
parameters, the PAC-Bayes bound can be nontrivially tight in this
setting as well. This fact is used in the application section.

We first state a simple lemma that is handy in the proof. 

\begin{lem}
For all $P(c)$, For all $\delta\in(0,1]$:\[
\Pr_{S\sim D^{m}}\left(E_{c\sim P}\frac{1}{\Pr_{S\sim D^{m}}\left(\hat{c}_{S}\right)}\leq\frac{m+1}{\delta}\right)\geq1-\delta\]

\end{lem}
\begin{proof}
Note that:\[
\forall c\,\,\, E_{S\sim D^{m}}\frac{1}{\Pr_{S\sim D^{m}}\left(\hat{c}_{S}\right)}=\sum_{\frac{k}{m}}\Pr_{S\sim D^{m}}\left(\hat{c}_{S}=\frac{k}{m}\right)\frac{1}{\Pr_{S\sim D^{m}}\left(\hat{c}_{S}=\frac{k}{m}\right)}\]
\[
=m+1\]
Taking the expectation over classifiers according to $P$ and switching
the order of expectation, we get:

\[
E_{S\sim D^{m}}E_{c\sim P}\frac{1}{\Pr_{S\sim D^{m}}\left(\hat{c}_{S}\right)}=m+1\]
and using the Markov inequality, we get:

\[
\forall P\,\,\Pr_{S\sim D^{m}}\left(E_{c\sim P}\frac{1}{\Pr_{S\sim D^{m}}\left(\hat{c}_{S}\right)}\leq\frac{m+1}{\delta}\right)\leq\delta\]

\end{proof}
With this handy lemma out of the way, we can easily prove the main
theorem.

\begin{proof}
(Of the PAC-Bayes theorem) Let $P_{G}(c)=\frac{1}{\Pr_{S\sim D^{m}}\left(\hat{c}_{S}\right)E_{c\sim P}\frac{1}{\Pr_{S\sim D^{m}}\left(\hat{c}_{S}\right)}}P(c)$
then:\[
0\leq\textrm{KL}(Q||P_{G})=E_{c\sim Q}\ln\frac{Q(c)}{P(c)}\Pr_{S\sim D^{m}}\left(\hat{c}_{S}\right)E_{c\sim P}\frac{1}{\Pr_{S\sim D^{m}}\left(\hat{c}_{S}\right)}\]
\[
=\textrm{KL}(Q||P)-E_{c\sim Q}\ln\frac{1}{\Pr_{S\sim D^{m}}\left(\hat{c}_{S}\right)}+\ln E_{c\sim P}\frac{1}{\Pr_{S\sim D^{m}}\left(\hat{c}_{S}\right)}\]
which implies:\[
E_{c\sim Q}\ln\frac{1}{\left(\begin{array}{c}
m\\
m\hat{c}_{S}\end{array}\right)\left(c_{D}\right)^{m\hat{c}_{S}}\left(1-c_{D}\right)^{m(1-\hat{c}_{S})}}\leq\textrm{KL}(Q||P)+\ln E_{c\sim P}\frac{1}{\Pr_{S\sim D^{m}}\left(\hat{c}_{S}\right)}\]
The lemma bounds the last term, so with probability $1-\delta$, we
have:

\[
E_{c\sim Q}\left[\hat{c}_{S}\ln\frac{1}{c_{D}}+\left(1-\hat{c}_{S}\right)\ln\frac{1}{1-c_{D}}\right]-\frac{E_{c\sim Q}\ln\left(\begin{array}{c}
m\\
m\hat{e}_{c}\end{array}\right)}{m}\leq\frac{\textrm{KL}(Q||P)+\frac{m+1}{\delta}}{m}\]
Using the concavity of $f(x)=\ln\frac{1}{x}$ and Jensen's inequality,
we get:\[
\hat{Q}_{S}\ln\frac{1}{Q_{D}}+\left(1-\hat{Q}_{S}\right)\ln\frac{1}{1-Q_{D}}\leq E_{c\sim Q}\left[\hat{c}_{S}\ln\frac{1}{c_{D}}+\left(1-\hat{c}_{S}\right)\ln\frac{1}{1-c_{D}}\right]\]
\[
\frac{E_{c\sim Q}\ln\left(\begin{array}{c}
m\\
m\hat{c}_{S}\end{array}\right)}{m}\leq\frac{E_{c\sim Q}\ln e^{mH\left(\hat{c}_{S}\right)}}{m}=\frac{E_{c\sim Q}mH\left(\hat{c}_{S}\right)}{m}\leq H\left(\hat{Q}_{S}\right)\]
Using all of these inequalities together, we get with probability
$1-\delta$:\[
\hat{Q}_{S}\ln\frac{1}{Q_{D}}+\left(1-\hat{Q}_{S}\right)\ln\frac{1}{1-Q_{D}}-H\left(\hat{Q}_{S}\right)\leq\frac{\textrm{KL}(Q||P)+\frac{m+1}{\delta}}{m}\]
which proves the theorem.
\end{proof}

\subsection{The PAC-Bayes Bound is sometimes Tight}

\label{sub:How-Tight-is}

Since the PAC-Bayes bound is (almost) a generalization of the Occam's
Razor bound, the tightness result for Occam's Razor also applies to
PAC-Bayes bounds. This answer is not satisfying, however, because
we want a tightness result which holds for arbitrary values of $\textrm{KL}(Q||P)$.
This is what the next theorem gives us.

\begin{thm}
\label{th-PB-tight}(PAC-Bayes Tightness) For all $P(c)$, $\textrm{KL}(Q||P)$,
$\frac{k}{m}$, $\delta$ there exists a learning problem and algorithm
such that: \[
\Pr_{S\sim D^{m}}\left(\exists Q(c):\,\,\textrm{KL}\left(\hat{Q}_{S}||Q_{D}\right)>\frac{\textrm{KL}(Q||P)+\ln\frac{1}{m\delta}}{m}\right)\geq\delta-\delta^{2}\]

\end{thm}
\begin{proof}
The proof is exactly the same as for the Occam's Razor tightness result
(theorem \ref{th-dhlub}), except with a few twists. 
\end{proof}

\subsection{Application of the PAC-Bayes Bound}

\label{sub:Application-of-the}

Actually applying the PAC-Bayes bound requires some specialization
\cite{PB-margin}. Here, we specialize to classifiers of the form:
\[
c(x)=\textrm{sign}\left(\vec{w}\cdot\vec{x}\right)\]
Note that via the kernel trick, Support Vector Machines also have
this form. 

The specialization is naturally expressed in terms of a few derived
quantities:

\begin{enumerate}
\item The cumulative distribution of a Gaussian. Let $\bar{F}(x)=\int_{x}^{\infty}\frac{1}{\sqrt{2\pi}}e^{-x^{2}/2}$.
Here we use $\bar{F}$ rather than $F$ to denote the fact that we
integrate from $\infty$ to $x$ rather than $-\infty$ to $x$. 
\item A {}``posterior'' distribution $Q(\vec{w},\mu)$ which is $N(\mu,1)$
in the direction of $\vec{w}$ and $N(0,1)$ in all perpendicular
directions.
\item The normalized margin of the examples\[
\gamma(\vec{x},y)=\frac{y\vec{w}\cdot\vec{x}}{||\vec{w}||||\vec{x}||}\]

\item A stochastic error rate, $\hat{Q}(\vec{w},\mu)_{S}=E_{\vec{x},y\sim U(S)}\bar{F}\left(\mu\gamma(\vec{x},y)\right)$
\end{enumerate}
This last quantity in particular is very important to understand.
Consider the case as $\mu$ approaches $\infty$. When the margin
is negative (indicating an incorrect classification), $\bar{F}\left(\mu\gamma(\vec{x},y)\right)$
approaches $1$. When the margin is positive $\bar{F}\left(\mu\gamma(\vec{x},y)\right)$
approaches $0$. Thus, $\epsilon_{S,\mu}$ is a softened form of the
empirical error $\hat{c}_{S}$ which takes into account the margin. 

\begin{cor}
(PAC-Bayes Margin Bound) For all distributions $D$, for all $\delta\in(0,1]$,
we have: \[
\Pr_{S\sim D^{m}}\left(\forall\vec{w},\mu:\,\,\textrm{KL}\left(\hat{Q}(\vec{w},\mu)_{S}||Q(\vec{w},\mu)_{D}\right)\leq\frac{\frac{\mu^{2}}{2}+\ln\frac{m+1}{\delta}}{m}\right)\geq1-\delta\]

\end{cor}
\begin{proof}
The proof is very simple. We just choose the prior $P=N(0,1)^{n}$
and work out the implications. 

Since the Gaussian distribution is the same in every direction, we
can reorient the coordinate system of the prior to have one dimension
parallel to $w$. Since the draws in the parallel and perpendicular
directions are independent, we have:\[
\textrm{KL}(Q||P)=\textrm{KL}(Q_{\perp}||P_{\perp})+\textrm{KL}(N(\mu,1)||N(0,1))\]
\[
=\frac{\mu^{2}}{2}\]
as required.

All that remains is calculating the stochastic error rate $\hat{Q}(\vec{w},\mu)_{S}$.
Fix a particular example, $(\vec{x},y)$. This example has a natural
decomposition $\vec{x}=\vec{x}_{||}+\vec{x}_{\perp}$ into a component,
$\vec{x}_{||}$ parallel to the weight vector $\vec{w}$ and a component
$\vec{x}_{\perp}$ perpendicular to the weight vector. 

To classify, we draw weight vector $\vec{w}^{'}$ from $\hat{Q}(\vec{w},\mu)$.
This $\vec{w}^{'}$ consists of $3$ components, $\vec{w}^{'}=\vec{w}_{||}^{'}+\vec{w}_{\perp}^{'}+\vec{w}_{\perp\perp}^{'}$.
Here $\vec{w}_{||}^{'}\sim N(\mu,1)$ is parallel to the original
weight vector, $\vec{w}_{\perp}^{'}\sim N(0,1)$ which is parallel
to $\vec{x}_{\perp}$ and $\vec{w}_{\perp\perp}^{'}$ is perpendicular
to both $\vec{w}$ and $\vec{x}$. We have:\[
E_{\vec{x},y\sim U(S),\vec{w}^{'}\sim Q(\vec{w},\mu)}I\left(y\neq\textrm{sign}\left(\vec{w}^{'}\cdot\vec{x}\right)\right)\]
\[
=E_{\vec{x},y\sim U(S),\vec{w}^{'}\sim Q(\vec{w},\mu)}I\left(y\vec{w}\cdot\vec{x}\leq0\right)\]
 If we let $w_{||}^{'}=||w_{||}^{'}||$, $w_{\perp}^{'}=||w_{\perp}^{'}||$,
$x_{||}=||\vec{x}_{||}||$, and $x_{\perp}=||\vec{x}_{\perp}||$,
and assume (without loss of generality) that $y=1$ we get: \[
=E_{\vec{x},y\sim U(S),w_{||}^{'}\sim N(\mu,1),w_{\perp}^{'}\sim N(0,1)}I\left(y(w_{||}^{'}x_{||}+w_{\perp}^{'}x_{\perp})\leq0\right)\]
\[
=E_{\vec{x},y\sim U(S)}E_{w_{||}^{'}\sim N(\mu,1)}E_{w_{\perp}^{'}\sim N(0,1)}I\left(y(w_{||}^{'}x_{||}+w_{\perp}^{'}x_{\perp})\leq0\right)\]
\[
=E_{\vec{x},y\sim U(S)}E_{w_{||}^{'}\sim N(0,1)}E_{w_{\perp}^{'}\sim N(0,1)}I\left(y\mu\leq-yw_{||}^{'}-yw_{\perp}^{'}\frac{x_{\perp}}{x_{||}}\right)\]
Using the symmetricity of the Gaussian, this is:\[
=E_{\vec{x},y\sim U(S)}E_{w_{||}^{'}\sim N(0,1)}E_{w_{\perp}^{'}\sim N(0,1)}I\left(y\mu\leq yw_{||}^{'}+yw_{\perp}^{'}\frac{x_{\perp}}{x_{||}}\right)\]
Using the fact that the sum of two Guassians is a Gaussian:\[
=E_{\vec{x},y\sim U(S)}E_{v\sim N\left(0,1+\frac{x_{\perp}^{2}}{x_{||}^{2}}\right)(0,1)}I\left(y\mu\leq yv\right)\]
\[
=E_{\vec{x},y\sim U(S)}E_{v\sim N\left(0,\frac{1}{\gamma(\vec{x},y)^{2}}\right)(0,1)}I\left(y\mu\leq yv\right)\]
\[
=E_{\vec{x},y\sim U(S)}\bar{F}\left(\mu\gamma(\vec{x},y)\right)\]
finishing the proof.
\end{proof}
Using the corollary, the true error bound $\bar{Q}(\vec{w},\mu)_{D}$
satisfies the equation: \[
\textrm{KL}\left(\hat{Q}(\vec{w},\mu)_{S}||\bar{Q}(\vec{w},\mu)_{D}\right)=\frac{\frac{\mu^{2}}{2}+\ln\frac{m+1}{\delta}}{m}\]
This is an implicit equation for $\bar{Q}$ which can be easily solved
numerically. 

The bound is stated in terms of dot products here, so naturally it
is possible to kernelize the result using methods from \cite{kernelize}.
In kernelized form, the bound applies to classifiers of the form:
\begin{equation}
c(x)=\textrm{sign}\left(\sum_{i=1}^{m}\alpha_{i}k(x_{i},x)\right)\label{eq:class-kernel}\end{equation}
Since, by assumption, $k$ is a kernel, we know that $k(x_{i},x)=\vec{\Phi}(x_{i})\cdot\vec{\Phi}(x)$
where $\vec{\Phi}(x)$ is some projection into another space. In kernelized
form, we get $\vec{w}\cdot\vec{x}=\sum_{i=1}^{m}\alpha_{i}k(x_{i},x)$,
$\vec{x}\cdot\vec{x}=k(x,x)$, $\vec{w}\cdot\vec{w}=\sum_{i,j}\alpha_{i}\alpha_{j}k(x_{i},x_{j})$,
defining all of the necessary quantities to calculate the normalized
margin, \[
\gamma(x,y)=\frac{\sum_{i=1}^{m}\alpha_{i}k(x_{i},x)}{\sqrt{k(x,x)\sum_{i,j=1,1}^{m,m}\alpha_{i}\alpha_{j}k(x_{i},x_{j})}}\]
 

One element remains, which is the value of $\mu$. Unfortunately the
bound can be nonmonotonic in the value of $\mu$, but it turns out
that for classifiers learned by support vector machines on reasonable
datasets, there is only one value of $\mu$ which is (locally, and
thus globally) minimal. A binary search over some reasonable range
of $\mu$ (say from $1$ to $100$) can find the minima quickly, given
the precomputation of the margins. It is worth noting again here that
we are not {}``cheating''---the bound holds for all values of $\mu$
simultaneously. 

The computational time of the bound calculation is dominated by the
calculation of the margins which is $O\left(m^{2}\right)$ where $m$
is the number of support vectors with a nonzero associated $\alpha$.
This computational time is typically dominated by the time of the
SVM learning algorithm. 


\subsubsection{Results}

Application of this bound to support vector machines is of significant
importance because SVMs are reasonably effective and adaptable classifiers
in common and widespread use. A SVM learns a kernelized classifier
as per equation \ref{eq:class-kernel}%
\footnote{Some SVM learning algorithms actually learn a classifier of the form:
$c(x)=\textrm{sign}\left(b+\sum_{i=1}^{m}\alpha_{i}k(x_{i},x)\right)$.
We don't handle this form here.%
}. 

We apply the support vector machine to 8 UCI database problems chosen
to fit the criteria {}``two classes'' and {}``real valued input
features''. The problems vary in size over an order of magnitude
from $145$ to $1428$ examples. In figure \ref{fig:test} we use
a 70/30 train/test split of the data. 

In all experiments, we use SVMlight with a gaussian kernel and the
default bandwidth. Results for other choices of the {}``C'', bandwidth,
and/or kernel appear to be qualitatively similar (although of course
differing quantitatively). 

%
\begin{figure}
\includegraphics[%
  width=0.60\columnwidth,
  angle=270]{test_and_margin.ps}


\caption{\label{fig:test}}

This figure shows the results of applying SVMlight to 8 datasets with
a Gaussian kernel and a 70/30 train/test split. The observed test
error rate is graphed as an X. On the test set, we calculate a Binomial
confidence interval (probability of bound failure = $0.01$) which
upper bounds the true error rate. On the training set we calculate
the PAC-Bayes margin bound for an optimized choice of $\mu$. 
\end{figure}
It is important to note that the PAC-Bayes margin bound is \emph{not}
precisely a bound (or confidence interval) on the true error rate
of the learned classifier. Instead, it is a true error rate bound
on an associated stochastic classifier chosen so as to have a similar
test error rate. These bounds can be regarded as bounds for the original
classifier only under an additional assumption: that picking a classifier
according to the majority vote of this stochastic distribution does
not worsen the true error rate. This is not true in general, but may
be true in practice. 

It is of course unfair to compare the train set bound with the test
set bound on a 70/30 train/test split because a very tight train set
bound would imply that it is unnecessary to even have a test set.
In figure \ref{fig:full-train} we compare the true error bounds on
all of the data to the true error bounds generated from the 70/30
train/test split.

%
\begin{figure}
\includegraphics[%
  width=0.60\columnwidth,
  angle=270]{test_vs_margin.ps}


\caption{\label{fig:full-train}}

In addition to comparing with everything in figure \ref{fig:test},
we graph the margin bound when \emph{all} of the data is used for
the train set. Note that it improves somewhat on the margin bound
calculated using the 70\% train set (7/10 margin bound), but not enough
to compete with the test set bound. 
\end{figure}
The results show that the PAC-Bayes margin bound is tight enough to
give useful information, but still not competitive with the test set
bounds. This is in strong contrast with a tradition of quantitatively
impractical margin bounds. There are several uses available for bounds
which provide some information but which are not fully tight. 

\begin{enumerate}
\item They might be combined with a train/test bound \cite{TnT}. 
\item The train set bound might easily become tighter for smaller sample
sizes. This was observed in \cite{TnT}.
\end{enumerate}
Missing: how well does the bound work for parameter selection?


\section{Sample Compression Bound}

\label{sec:Sparsity-Bound}

The sample compression bound \cite{LW}, \cite{FW} is like the PAC-Bayes
bound in that it applies to arbitrary precision continuous valued
classifiers. Unlike the PAC-Bayes bound, it applies meaningfully to
nonstochastic classifiers. Mainstream learning algorithms do not optimize
the sample compression metric, so the bound application is somewhat
rarer. Nonetheless, there do exist some reasonably competitive learning
algorithms for which the sample compression bound produces significant
results.

The section is organized as follows:

\begin{enumerate}
\item Subsection \ref{sub:The-Sample-Compression-Bound} States and proves
the sample compression bound.
\item Subsection \ref{sub:SC-tightness} shows that the sample compression
bound is nearly as tight as possible given the observations.
\item Subsection \ref{sub:SC-application} discusses results from the application
of the PAC-Bayes bound to support vector machines.
\end{enumerate}

\subsection{The Sample Compression Bound}

\label{sub:The-Sample-Compression-Bound}

The Sample Compression bound stated here is a little bit differs somewhat
from other results by generalization and simplification but the bound
behavior is qualitatively identical.

Suppose we have a learning algorithm, $A(S)$ whose training is {}``sparse''%
\footnote{This is satisfied, for example, by the Support Vector Machine algorithm
which only depends upon the set of support vectors.%
} in the sense that the output classifier is dependent upon only a
subset of the data, $A(S)=A(S')$ for $S'\subseteq S$. The sample
compression bound is dependent on the error rate, $\hat{c}_{S-S'}$
on the subset $S-S'$. The motivation here is that the examples which
the learning algorithm does \emph{not} depend upon are {}``almost''
independent and so we can {}``almost'' get a test set bound.

\begin{thm}
(Sample Compression Bound) For all $\delta\in(0,1]$, $D$, $A$:\[
\Pr_{S\sim D^{m}}\left(\forall S'\subseteq S\,\,\,\textrm{with }c=A(S'):\,\,\, c_{D}\leq\overline{\textrm{Bin}}\left(\hat{c}_{S-S'},\frac{\delta}{m{m \choose |S-S'|}}\right)\right)\geq1-\delta\]

\end{thm}
\begin{proof}
Suppose we knew in advance that the learning algorithm will not depend
upon some subset of the examples. Then, the {}``undependent'' subset
acts like a test set and gives us a test set bound.\[
\forall S'\subseteq S\,\,\,\textrm{with }c=A(S'):\Pr_{S\sim D^{m}}\left(c_{D}\leq\overline{\textrm{Bin}}\left(\hat{c}_{S-S'},\frac{\delta}{m{{m \choose |S-S'|}}}\right)\right)\geq1-\frac{\delta}{m{{m \choose |S-S'|}}}\]
(Note that, technically, it is possible to refer to $S'$ unambiguously
before randomizing over $S$ by specifying the indicies of $S$ contained
in $S'$.) Negating this, we get:

\[
\forall S'\subseteq S\,\,\,\textrm{with }c=A(S'):\Pr_{S\sim D^{m}}\left(c_{D}>\overline{\textrm{Bin}}\left(\hat{c}_{S-S'},\frac{\delta}{m{{m \choose |S-S'|}}}\right)\right)<\frac{\delta}{m{{m \choose |S-S'|}}}\]
and using the union bound ($\Pr(A\textrm{ or }B)\leq\Pr(A)+\Pr(B)$
over each possible subset, $S'$, we get:\[
\Pr_{S\sim D^{m}}\left(\exists S'\subseteq S\,\,\,\textrm{with }c=A(S'):\,\,\, c_{D}>\overline{\textrm{Bin}}\left(\hat{c}_{S-S'},\frac{\delta}{m{{m \choose |S-S'|}}}\right)\right)<\delta\]
Negating this again gives us the proof.
\end{proof}

\subsection{The Sample Compression Bound is Sometimes Tight}

\label{sub:SC-tightness}

We can construct a learning algorithm/learning problem pair such that
the Sample compression bound is provably near optimal, as a function
of it's observables. 

\begin{thm}
(Sample Compression Tightness) For all $\delta\in(0,1]$, $\frac{k}{m}$,
there exists a distribution $D$ and learning algorithm $A$ s.t.\[
\Pr_{S\sim D^{m}}\left(\exists S'\subseteq S\,\,\,\textrm{with }c=A(S'):\,\,\, c_{D}>\overline{\textrm{Bin}}\left(\hat{c}_{S-S'},\frac{\delta}{m{m \choose |S-S'|}}\right)\right)>\delta-\delta^{2}\]
furthermore, if $S^{*}$ minimizes $\overline{\textrm{Bin}}\left(\hat{c}_{S-S'},\frac{\delta}{m{{m \choose |S-S'|}}}\right)$,
then \[
\Pr_{S\sim D^{m}}\left(c^{*}=A(S^{*}):\,\,\, c_{D}^{*}>\overline{\textrm{Bin}}\left(\hat{c}_{S-S^{*}}^{*},\frac{\delta}{m{m \choose |S-S^{*}|}}\right)\right)>\delta-\delta^{2}\]

\end{thm}
\begin{proof}
The proof is constructive and similar to the Occam's Razor tightness
result. In particular, we show how to construct a learning algorithm
which outputs classifiers that err independently depending on the
subset $S'$ used. 

Consider an input space $X=\{0,1\}^{2^{m}}$. Each variable in the
input space $x_{S'}$ can be thought of as indexing a unique subset
$S'\subseteq S$ of the examples. In the rest of the proof, we index
variables by the subset they correspond to.

Draws from the distribution $D$ can be made by first flipping an
unbiased coin to get $y=1$ with probability $0.5$ and $y=-1$ with
probability $0.5$. The distribution on $X$ consists of a set of
independent values after conditioning on $y$. Choose \[
\Pr(x_{S'}\neq y)=\overline{\textrm{Bin}}\left(\frac{k}{m},\frac{\delta}{m{{m \choose |S-S'|}}}\right)\]
Now, the learning algorithm $A(S')$ is very simple---it just outputs
the classifier $c(x)=x_{S'}$. On the set $S-S'$, we have:\[
\forall S'\,\,\,\Pr_{S\sim D^{m}}\left(\hat{c}_{S-S'}\geq\frac{k}{m}\right)=1-\frac{\delta}{m{{m \choose |S-S'|}}}\]
Using independence, we get:\[
\Pr_{S\sim D^{m}}\left(\forall S'\,\,\,\hat{c}_{S-S'}\geq\frac{k}{m}\right)=\prod_{S'}\left(1-\frac{\delta}{m{{m \choose |S-S'|}}}\right)\]
Negating, we get:\[
\Pr_{S\sim D^{m}}\left(\forall S'\,\,\,\hat{c}_{S-S'}<\frac{k}{m}\right)=1-\prod_{S'}\left(1-\frac{\delta}{m{{m \choose |S-S'|}}}\right)\]
and doing some algebra, we get the result.
\end{proof}

\subsection{Application of the Sample Compression Bound}

\label{sub:SC-application}

One obvious application of the Sample Compression bound is to support
vector machines, since the learned classifier is only dependent on
the set of support vectors. If $S'$is the set of support vectors
then $S-S'$ is the set of nonsupport vectors. Unfortunately, it turns
out that this does not work so well, as observed in figure \ref{fig:sc-svm}.
%
\begin{figure}
\includegraphics[%
  width=0.60\columnwidth,
  angle=270]{test_and_compress.ps}


\caption{\label{fig:sc-svm}The sample compression bound applied to the output
of a support vector machine with a Gaussian kernel. Here we use $\delta=0.01$}
\end{figure}


There are other less common learning algorithms for which the sample
compression bound works well. The Set Covering machine \cite{SCM}
has an associated bound which is a variant of the Sample Compression
Bound. 


\section{Conclusion}

This introduction to sample complexity in learning theory covered
two styles of bound: the test set bound and the train set bound. There
are two important lessons here:

\begin{enumerate}
\item Test set bounds provide a better way to report error rates and confidence
intervals on future error rates than some current methods. 
\item Train set bounds can be used practically to drive a learning algorithm. 
\end{enumerate}
It is important to note that the train set bound and test set bound
techniques are not mutually exclusive. It is possible to use both
simultaneously \cite{TnT}, and doing so is often desirable. Test
set bounds are improved by the {}``free'' information about the
training error and train set bounds benefit by widely increased applicability. 


\section*{Appendix}

\label{sec-model}

For those interested in comparing models, the uniform convergence
model \cite{Vapnik} requires the additional assumption of the axiom
of choice (to achieve empirical risk minimization) and a bound on
the hypothesis space complexity. Typical theorems are of the form
{}``after $m$ examples, all training errors are near to true errors''.

The PAC learning model\cite{Valiant} requires a polynomial time complexity
learning algorithm and the assumption that the learning problem comes
from some class. Theorems are of the form {}``after $m$ examples
learning will be achieved''.

Both of these models can support stronger statements than the basic
sample complexity model presented here. Results from both of these
models can apply to the sample complexity model presented here after
appropriate massaging of results. 

The online learning model \cite{Online} makes \emph{no} assumptions.
Typical theorems have the form {}``This learning algorithm's performance
will be nearly as good as anyone of a set of classifiers.'' The online
learning model has very general results and no ability to answer questions
about future performance as we address here.

The sample complexity model can most simply be understood as a slight
refinement of the information theory model. 

\begin{thebibliography}{10}
\bibitem{BEHW}A. Blumer, A. Ehrenfeucht, D. Haussler, M. Warmuth. {}``Occam's Razor.''
Information Processing Letters 24: 377-380, 1987.
\bibitem{Progressive}Avrim Blum, Adam Kalai, and John Langford 1999. Beating the Holdout:
Bounds for K-Fold and Progressive Cross-Validation. COLT99. http://www.cs.cmu.edu/\textasciitilde{}jcl/papers/progressive\_validation/coltfinal.ps
\bibitem{Devroye}Luc Devroye, Laszlo Gyorfi, Gabor Lugosi, {}``A Probabilistic Theory
of Pattern Recognition'' Springer-Verlag New York, 1996.
\bibitem{FW}Sally Floyd and Manfred Warmuth, {}``Sample Compression, Learnability,
and the Vapnik-Chervonenkis Dimension'', Machine Learning, Vol.21
(3), pp. 269--304, December 1995 http://www.cse.ucsc.edu/\textasciitilde{}manfred/pubs/sallycompr.forgalleys.ps
\bibitem{kernelize}R. Herbrich and T. Graepel. {}``Large scale Bayes point machines''
In T. K. Leen, T. G. Dietterich, and V. Tresp, editors, Advances in
Neural Information System Processing 13, pages 528-534, Cambridge,
MA, 2001.
\bibitem{Sanity}Michael Kearns and Dana Ron, {}``Algorithmic Stability and Sanity-Check
Bounds for Leave-One-Out Cross-Validation.'' Neural Computation 11(6),
pages 1427-1453, 1999. Also in Proceedings of the Tenth Annual Conference
on Computational Learning Theory, ACM Press, 1997, pages 152--162.
http://www.cis.upenn.edu/\textasciitilde{}mkearns/papers/multi.ps
\bibitem{Online}J. Kivinen and M. Warmuth, \char`\"{}Additive Versus Exponentiated
Gradient Updates for Linear Prediction,\char`\"{} in Journal of Information
and Computation, vol. 132, no. 1, pp. 1-64, January 1997. http://www.cse.ucsc.edu/\textasciitilde{}manfred/pubs/lin.ps
\bibitem{bound}John Langford, Program {}``bound'', http://www-2.cs.cmu.edu/\textasciitilde{}jcl/programs/bound/bound.html
\bibitem{MC_journal}John Langford and Avrim Blum 1999. Microchoice Bounds and Self Bounding
learning algorithms. Machine Learning Journal. http://www.cs.cmu.edu/\textasciitilde{}jcl/papers/microchoice/journal/journal\_final.ps
\bibitem{Shell}John Langford and David McAllester, {}``Computable Shell Decomposition
Bounds'' COLT 2000. http://www-2.cs.cmu.edu/\textasciitilde{}jcl/papers/computable\_shell/colt\_final.ps
\bibitem{averaging}John Langford and Matthias Seeger, {}``Bounds for Averaging Classifiers''
Technical report, Carnegie Mellon 2001 http://www-2.cs.cmu.edu/\textasciitilde{}jcl/papers/averaging/averaging\_tech.ps
\bibitem{PB-margin}John Langford and John Shawe-Taylor, {}``PAC-Bayes \& Margins'',
NIPS 2002. http://www-2.cs.cmu.edu/\textasciitilde{}jcl/papers/PB-margin/PB-margin.ps
\bibitem{TnT}John Langford, {}``Combining Train Set and Test Set Bounds'', ICML2002.
http://www-2.cs.cmu.edu/\textasciitilde{}jcl/papers/tnt\_icml/train\_n\_test\_final.ps
\bibitem{LW}Nick Littlestone and Manfred Warmuth, {}``Relating Data Compression
and Learnability'' Unpublished Manuscript http://www.cse.ucsc.edu/\textasciitilde{}manfred/pubs/lrnk-olivier.ps
\bibitem{PB}David McAllester, ``PAC-Bayesian Model Averaging'' COLT 1999. http://www.autoreason.com/posterior01.ps
\bibitem{SCM}Mario Marchand and John Shawe-Taylor, {}``The Set Covering Machine'',
ICML 2001. http://www.ai.mit.edu/projects/jmlr/papers/volume3/marchand02a/marchand02a.pdf
\bibitem{Matthias}Matthias Seeger, {}``PAC-Bayesian Generalization Error Bounds for
Gaussian Process Classification'', Journal of Machine Learning Research
3 (2002), 233--269. http://www.dai.ed.ac.uk/homes/seeger/papers/seeger02a.ps.gz
\bibitem{Valiant}L.G. Valiant. {}``A Theory of the Leuarnable.'' Communications of
the ACM 27(11):1134-1142, November 1984
\bibitem{Vapnik}V. N. Vapnik and A. Y. Chervonenkis. {}``On the uniform convergence
of relative frequencies of events to their probabilities.'' Theory
of Probab. and its Applications, 16(2):264-280, 1971. \end{thebibliography}

\end{document}