% Sample LaTeX file for creating a paper in the Morgan Kaufmann two
% column, 8 1/2 by 11 inch proceedings format.
%\documentstyle[ml99,named]{article}
%\documentstyle[ml2k,rotate,psfig,epsfig,times,mlapa]{article}
\documentstyle[ml2k,rotate,psfig,epsfig,times,mlapa]{article}
%\addtolength{\topmargin}{-0.7in}
% Bold-vector shorthands.
\def\vx{\mathbf{x}}
\def\vd{\mathbf{d}}
\def\vw{\mathbf{w}}
% argmin/argmax typeset as operators, with limits placed underneath in
% display style (like \max).
\newcommand{\argmin}{\operatornamewithlimits{argmin}}
\newcommand{\argmax}{\operatornamewithlimits{argmax}}
% Name of the proposed framework, defined once so it can be changed
% globally (earlier drafts of this paper used "Unabridged").
\newcommand{\robust}{Robust}
%\renewcommand\floatpagefraction{1}\renewcommand\topfraction{1}\renewcommand\bottomfraction{1}\renewcommand\textfraction{0}
\input{code-macros.tex}
% Counter used by the compact list environment below.
\newcounter{ecount}
% 'ecompact': an enumerate-like list with reduced vertical spacing
% (tight \parsep/\partopsep/\topsep/\itemsep) for space-constrained
% item lists in the two-column body.
\newenvironment{ecompact}{
\begin{list}%
{\arabic{ecount}}{\usecounter{ecount}
\parsep 0pt plus 1pt
\partopsep 0pt plus 1pt
\topsep 2pt plus 2pt minus 1pt
\itemsep 0pt plus 1pt}}%
{\end{list}}
\newcommand{\takeout}[1]{} %% use 'takeout' to comment regions
\newcommand{\percent}{{\% }} %% I can't stand the way xemacs font locks..
% Theorem-like environments, each with its own numbering sequence.
\newtheorem{theorem}{Theorem}
\newtheorem{prop}{Proposition}
% End-of-proof marker: a small filled rectangle flushed to the right margin.
\newcommand{\qed}{\hspace*{\fill}\rule{6pt}{8pt}\bigskip}
% --- Disabled region: \takeout discards its argument, so nothing from
% --- here to the matching close brace is ever defined or executed.
\takeout{
% Tightened list spacing intended for a compact enumerate variant.
\newcommand{\steveenumeratec}{
\setlength{\topsep}{0pt}
\setlength{\partopsep}{0pt}
\setlength{\itemsep}{0pt}
\setlength{\parsep}{0pt}
\setlength{\leftmargin}{10pt}
\setlength{\rightmargin}{0pt}
\setlength{\labelwidth}{5pt}
\setlength{\labelsep}{3pt}
\setlength{\itemindent}{0pt}
\setlength{\listparindent}{0pt}}
% PostScript bop-hook: stamps two lines of gray, 45-degree rotated text
% (e.g. a date and "Review Copy") near the top of every page via dvips.
\newcommand{\reviewtimetoday}[2]{\special{!userdict begin
/bop-hook{gsave 20 710 translate 45 rotate 0.8 setgray
/Times-Roman findfont 13 scalefont setfont 0 0 moveto
(#1) stringwidth (#1) show
/Times-Roman findfont 12 scalefont setfont 0 -14 moveto
(#2) stringwidth (#2) show
%3 2 roll exch sub 2 div 3 1 roll sub 2 div exch rmoveto (#2) show
grestore}def end}}
%\reviewtimetoday{\today}{Review Copy}
% Enumerate variant wired to the compact spacing above.
% NOTE(review): uses @-commands without \makeatletter; this would need
% to be added if the region were ever re-enabled.
\def\senumeratec{%
\ifnum \@enumdepth >\thr@@\@toodeep\else
\advance\@enumdepth\@ne
\edef\@enumctr{enum\romannumeral\the\@enumdepth}%
\list
{\csname label\@enumctr\endcsname}%
{\steveenumeratec \usecounter\@enumctr\def\makelabel##1{\hss\llap{##1}}}% %\steveenumerate \steveenumeraterestore
\fi}
\let\endsenumeratec =\endlist
}
\begin{document}
%\title{Robust Learning with FeatureBoost}
\twocolumn[
%\icmltitle{FeatureBoost: A Meta Algorithm for Unabridged \\
%Learning that Improves Model Robustness}
\icmltitle{FeatureBoost: A Meta-Learning Algorithm \\
that Improves Model Robustness}
\icmlauthor{Joseph O'Sullivan}{josullvn@cs.cmu.edu}
\icmlauthor{John Langford}{jcl@cs.cmu.edu}
\icmlauthor{Rich Caruana}{caruana@cs.cmu.edu}
\icmlauthor{Avrim Blum}{avrim@cs.cmu.edu}
\icmladdress{School of Computer Science, Carnegie Mellon University, 5000 Forbes Ave., Pittsburgh, PA 15213}
\vskip 0.3in
]
\renewcommand{\baselinestretch}{0.958} \normalsize
\begin{abstract}
Most machine learning algorithms are lazy: they extract from the
training set the minimum information needed to predict its labels.
Unfortunately, this often leads to models that are not robust when
features are removed or obscured in future test data. For example, a
backprop net trained to steer a car typically learns to recognize the
edges of the road, but does not learn to recognize other features such
as the stripes painted on the road which could be useful when road
edges disappear in tunnels or are obscured by passing trucks. The net
learns the minimum necessary to steer on the training set. In
contrast, human driving is remarkably robust as features become
obscured. Motivated by this, we propose a framework for {\em robust}
learning that biases induction to learn many different models from the
same inputs. We present a meta-learning algorithm for robust learning called
FeatureBoost, and demonstrate it on several problems using backprop
nets, k-nearest neighbor, and decision trees.
\end{abstract}
\takeout{
\begin{abstract}
Machine learning algorithms are {\em lazy}. They extract the minimum
amount of information from the training set needed to accurately
predict the labels of the training set. Once an adequate model is
learned, they do not learn other models that would make them more
robust to features that are removed or obscured in future test cases.
For example, a backprop net trained to steer a car typically learns to
recognize the edges of the road, but does not learn to recognize other
road features that could be useful for steering such as the stripes
painted on the road, signs and markers, other vehicles, etc. The net
learns the minimum necessary to steer on the training set, and fails
when road edges are obscured by passing trucks or disappear on bridges
and in tunnels. In contrast, human driving is remarkably robust as
some features become obscured. Motivated by this, we propose a
framework for {\em unabridged} learning that biases induction to learn
many different models from the same inputs. We present a meta
algorithm for unabridged learning called FeatureBoost, and demonstrate
it on several problems using backprop nets, k-nearest neighbor, and
decision trees.
\end{abstract}
}
\takeout{
\begin{abstract}
An artificial neural net trained to steer a car typically learns to
recognize the left and right edges of the road. It does not learn to
recognize: stripes painted on the road; signs and markers; other
vehicles; how the road curves in the distance; trees; people; etc.
The net learns little about roads, just the minimum necessary to steer
accurately on the training set. It steers well when road edges are
visible, but fails when they are obscured by passing trucks, or
disappear as on bridges and in tunnels. In contrast, human driving is
remarkably robust when some features get obscured. In this paper, we
argue that machine learning algorithms are {\em lazy}. They extract
the minimum amount of information from the training set needed to
predict the labels. Once an adequate model is learned, they do not
learn other models that would make them more robust to features that
are removed or obscured in future test cases. Motivated by this, we
propose a framework called {\em unabridged} learning that biases
induction to learn many different models from the same inputs. We
present a meta algorithm for unabridged learning, FeatureBoost, and
demonstrate it on several problems using backprop nets, k-nearest
neighbor, and decision trees.
\end{abstract}
}
\takeout{
\begin{abstract}
PAC learning typically assumes that the training and test sets are
drawn from the same distributions. This assumption often is violated
in practice. How can machine learning algorithms be biased to output
hypotheses that are robust to alterations in the test distribution?
We create a framework for learning in environments where the test and
training distributions differ because features in the test sets are
likely to be missing or corrupted. Motivated by this framework, we
present a new meta-learning algorithm, FeatureBoost. We demonstrate
FeatureBoost on three learning problems using backprop nets, k-nearest
neighbor, and decision trees.
\end{abstract}
}
\section{Motivation}
Consider a backprop net learning to steer a car. In the ALVINN system
\cite{alvinn} the principal internal features learned by ALVINN nets
detect the left and right edges of the road. Typically, ALVINN
nets do not learn internal features that detect other road phenomena
that could be useful for steering such as road centerlines, roadway
signs, trees, other traffic, people, etc. This creates a problem when
the left or right edges of the road are obstructed by passing
vehicles, or are missing as on bridges and in tunnels. Yet human
steering is remarkably robust to the loss of these features. Human
drivers can fall back on a number of alternate features as different
subsets of road features come in and out of view. Backprop nets can
learn to steer better if they learn to recognize other road features
such as centerlines \cite{centerlines}. How can we {\em force} backprop
nets to learn to use a variety of road features when learning to
steer?
A related problem arises in health care \cite{pneumonia}. Basic
inputs such as age, gender, and blood pressure are available for most
patients before they enter the hospital. Other measurements such as
RBC counts, oxygenation, and Albumin become available after patients
are hospitalized. As you would expect, models trained to predict
patient risk from both the pre- and in-hospital features usually
outperform models trained to predict risk from only the pre-hospital
inputs. But these models perform poorly on patients not yet admitted
to the hospital when one marginalizes over the missing in-hospital
features. Models that use only the pre-hospital inputs are more
accurate for patients not yet admitted to the hospital than
marginalized models trained on all the features. How can we force the
learning algorithm to learn models that make better predictions when
some input features (such as the in-hospital attributes) are missing
for some test cases?
\takeout{ If models can be trained to make good predictions using only
the pre-hospital inputs, why don't models trained from all the
features work that well when the in-hospital features are missing?}
If the edges of the road, or the in-hospital features are always
available, models learned the usual way perform well. In the ALVINN
and health care problems above, the difficulty arises when features
are missing or obscured in the test cases. Boosting algorithms such
as AdaBoost are one way to make learned models more robust to feature
obscuration. If the main features such as the edges of the road are
obscured or missing from a few training cases, boosting places more
emphasis on these cases because they are predicted poorly. This
emphasis forces the learning algorithm to use other features such as
road centerlines for these cases. Unfortunately, boosting learns
about centerlines by strongly emphasizing the cases that are missing
road edges, even though centerlines may be visible in all images.
Boosting could learn about other features better if it used all of the
training data containing those features to learn about them. How can
we make boosting take full advantage of {\em all} the redundant
information in the training set?
\takeout{
In the ALVINN and health care problems above, the difficulty arises when
the training and test set distributions differ because features are
missing or obscured in the test set. In the real world, it is common
for training sets to come from different distributions. The distributions
differ for a variety of reasons:
\begin{ecompact}
\item Temporal drift in the underlying distributions with the test set
drawn later.
\item The training set is 'cleaned' more than future test sets.
\item The training sample is too small to capture all the phenomena
that will occur in future test sets.
\item Unanticipated uses of the models (e.g. use on partial
examples)
\end{ecompact}
}
\takeout{
This paper introduces a general framework for induction called {\em
robust learning}, which is motivated by our desire to model situations
where features in the test set may be corrupted or missing in ways not
adequately represented in the training set. Our goal is to use the
framework to devise learning algorithms that are robust to different
forms of feature corruption. Guided by the framework, we devise a
meta-learning algorithm called FeatureBoost, that trains models to use
different subsets of features. Because the final prediction from
FeatureBoost combines the predictions of models that depend on
different (often overlapping) subsets of features, it is more robust
to missing or obscured features.
}
This paper introduces a general framework for induction called {\em
robust learning}, which is motivated by our desire to model situations
where features may be corrupted or missing in ways not adequately
represented in the training set. Guided by the framework, we devise a
meta-learning algorithm called FeatureBoost, that trains models to use
different subsets of features. Because the final prediction from
FeatureBoost combines the predictions of models that depend on
different (often overlapping) subsets of features, it is more robust
to missing or obscured features.
We develop the paper as follows:
\vspace*{-0.05in}
\begin{ecompact}
\item Present a general framework for robust learning.
\item Examine a specialization of this framework that
suggests one way to improve robustness.
\item Develop a meta-learning algorithm (FeatureBoost) inspired by
this model.
\item Test FeatureBoost on a variety of learning problems and machine
learning algorithms.
\end{ecompact}
\section{A Framework for \robust\ Learning}
\label{sec:unabridged}
Our basic goal is to force an ordinary ``base'' learning algorithm to
extract all the information it can from the training data, in order to
learn prediction rules that are robust to the possibility of missing
or corrupted features in test cases. To make this more precise, we
begin with a theoretical model that, while not perfect, is a useful
way of thinking about the problem, and motivates the algorithm given
in Section~\ref{sec:featureboost}.
As in the usual PAC model, we assume that our training examples have
$n$ features and are given to us from some fixed distribution $D$ over
the input space. We assume there is some target concept $c$ we wish
to learn, and to do this we have access to a ``base'' learning
algorithm that chooses hypotheses from some hypothesis class $H$. For
simplicity, we will fix some arbitrary error cutoff $\epsilon$ and say
a hypothesis $h \in H$ is ``good'' if its error is $\leq \epsilon$,
and is ``bad'' otherwise.
We begin by formalizing the notion that the training data contains
useful redundant information. Specifically, we say that a set of
hypotheses $H'$ is {\em $k$-robust} if, for any subset of $k$ of the
$n$ features, there is some $h \in H'$ that remains good even when
those $k$ features are corrupted. For the purposes of this model, we
do not need to pin down precisely what ``corrupted'' means
\takeout{
(e.g., are the features set to a default value, or given random
values, or something else)
}
so long as ``error'' is well-defined, and
for any hypothesis $h$, its error is non-decreasing as additional
features become corrupted. (I.e., error tends to increase as more
features become corrupted.) If $h$ is a good (low error) hypothesis
when no features are corrupted but becomes bad (high error) when a
subset $S$ of features are corrupted, we say that $S$ {\em destroys}
$h$.
Given access to examples from $D$ labeled according to $c$, the goal
of our algorithm will be to produce a $k$-robust subset $H' \subseteq
H$ if one exists. That is, instead of producing a single hypothesis,
we want our algorithm to produce a {\em set} of hypotheses such that
no matter what $k$ features are corrupted, at least one of the
hypotheses is still good.\footnote{Of course, ideally we would like a
single $k$-robust rule, perhaps a weighted vote among hypotheses in
$H'$ --- and in fact, this is our goal in the experimental section of
the paper. However, this goal appears to be trickier to model
theoretically, so we consider here the weaker goal of producing a
$k$-robust set.} To achieve our goal, we assume that our base
learning algorithm $A$ has the property that if we feed it labeled
examples from $D$ with some subset of features corrupted, it will then
produce a good $h \in H$ (with respect to the subset of features
corrupted) if one exists.
We can say several things about this setup. First, there is a natural
brute-force method that achieves our goal by making $n \choose k$
calls to $A$: for each set $S$ of $k$ features, feed data into $A$ in
which those $k$ features are corrupted, and add the hypothesis
produced by $A$ into $H'$. If $A$ ever fails to output a good
hypothesis, we know that $H$ was not $k$-robust and therefore no
$k$-robust $H' \subseteq H$ exists.
The brute-force algorithm works but is impractical because the
powerset of features is exponential in the number of features; ideally
we want an algorithm whose running time is polynomial in $k$.
Unfortunately, if $H$ is ``perverse'', this may not be possible.
Consider, for instance, the case that $H$ contains $n
\choose k$ hypotheses, each of which depends on a different subset of
$n-k$ features and each changes from good to bad if even just one of
those is corrupted. In this case, the only $k$-robust subset of $H$
is $H$ itself.
On the other hand, there is a natural strategy for ``non-perverse''
$H$ which can be proved to make only polynomially many calls to $A$ in
certain special cases. The strategy is as follows:
\begin{ecompact}
\item Initialize $H' = \{\}$, $S = \{\}$.
\item While not done, do:
\begin{ecompact}
\item Find the smallest set $S$ of features that destroy all $h \in
H'$.\footnote{Algorithmically, the way we find $S$ would depend on the
kind of hypotheses we are considering, but in the worst case we could
use brute force to try all $S$ of size 1, then of size 2 and so on.}
\item Run $A$ on examples from $D$ in which features in $S$ are
corrupted, and place the hypothesis found into $H'$. If $A$ fails,
then halt with failure.
\end{ecompact}
\end{ecompact}
We can make several statements about this algorithm.
\begin{theorem}
Suppose every good hypothesis $h\in H$ has an associated feature set
$S_h$ such that $h$ is good if and only if at least one feature in
$S_h$ remains uncorrupted. Then, this algorithm makes at most $k$
calls to $A$ and runs in linear time per iteration.
\end{theorem}
{\bf Proof:} We start with $S = \{\}$ and each time a new $h$ is added
to $H'$, we let $S \leftarrow S \cup S_h$. By assumption, $S$ will
always be the smallest set of features that destroys all of $H'$. Each
iteration increases the size of $S$ by at least 1, so the number of
iterations is at most $k$.\qed
\begin{theorem}
Suppose every good hypothesis $h\in H$ has an associated feature set
$S_h$ such that $h$ is good if and only if at least {\em two} features
in $S_h$ remain uncorrupted. Then, the above algorithm makes at most
$O(k^2)$ calls to $A$.
\end{theorem}
{\bf Proof:} Consider a graph $G$ with one node for each of the $n$
features, in which there are initially no edges. At each iteration of
the algorithm, put an edge into $G$ between every pair of uncorrupted
features in $S_h$ (unless the edge is there already), where $h$ is the
new hypothesis produced. Notice that the set $S$ produced in step
2.1 must, by assumption, be a vertex cover (a set of vertices
covering all the edges), and every iteration adds at least one new
edge into the graph. Therefore, the algorithm is done when there is
no longer a vertex cover of size $\leq k$. The theorem follows from
the twin facts that (a) no node will reach degree greater than $k+1$
(at that point it must be in the set $S$), and (b) the size of the
minimum vertex cover is at most the size of the maximum matching in
the graph.
\qed
The algorithm we develop in Section~\ref{sec:featureboost} can be
thought of as a more practical version of this strategy, for real data
and for our real goal of producing a single robust hypothesis.
\section{A Special Case of Robust Learning}
\label{sec:model}
We now consider a specialization of the robust learning framework
presented in the preceding section. In particular, we wish to
motivate the idea of using a majority vote to boost robustness.
As in the previous section, assume there exists some unknown
underlying distribution, \( D \), from which \( m \) examples with \(
n \) features, \( x^m=(x_{1},...,x_{n})^m \) are drawn. These
examples are labeled according to an unknown target function, \(
y=c(x) \).
Further assume that $q$ disjoint subsets of the features can predict the label:
$\exists c_{1},\ldots,c_{q}$ such that $y$
$=$
$c_{1}(x_{1},\dots,x_{i_1})$
$=$
$c_{2}(x_{i_1+1},\ldots,x_{i_2})$
$=$
$\dots$
$=$
$c_{q}(x_{i_{q-1}+1},\ldots,x_{n})$.
Co-training makes the same assumption with only 2 feature sets \cite{cotrain}.
\takeout{For simplicity, assume that each of the \( m \) features can predict
the label, \( y=c_{1}(x_{1})=c_{2}(x_{2})=...=c_{m}(x_{m}) \).
Equivalently, we could assume that disjoint subsets of the features
can predict the target, but that complicates the discussion.
Co-training~\cite{cotrain} makes a similar assumption.}
The learning algorithm is presented with \( m \) labeled examples, \(
(x,y)^{m} \), and asked to produce a hypothesis, \( h:X\rightarrow Y
\), used for future predictions. The goal is to minimize the
error \( e(h)=P_{D}(h(x)\neq c(x)) \).
\takeout{disagreement between \( h(x) \) and \( c(x) \) according to some loss
function such as squared error or Hamming loss. We focus on the
Hamming loss function,}
\takeout{Assume that an unbiased estimate of the error under the unknown
distribution \( D \) can be found. } We wish to understand how the
error changes with alteration to the test distribution. Specifically,
consider a test distribution in which some percentage of the disjoint
feature sets are corrupted uniform randomly. Corruption of a feature
set in an example, \( x \), is accomplished by picking a second
example, \( z \), from \( D \) and substituting the values of the
feature set from \( z \) into \( x \). This approach leaves the
marginal distribution of feature set values unchanged by corruption.
There are two extremes to consider. The first is an Occam's razor
motivated learner such as a decision tree, which attempts to find a
small set of features that can predict the label. In the extreme, if
we assume a decision tree focuses on a particular \( 1 \) of the \( q
\) feature sets, then when \( k\% \) of the feature sets are
corrupted, the decision tree is affected \( k\% \) of the time. If
there are just two labels and the decision tree outputs a random value
when it relies on a corrupted feature, this results in error linearly
increasing from \( e(h) \) to \( 50\% \) as \( k \) increases from \(
0\% \) to \( 100\% \).
The second extreme occurs when the model uses all features. It is
difficult to state how this will affect accuracy without making
assumptions about the learning algorithm. Assume that a decision tree
is used on each disjoint feature set with the resulting tree outputs
passed through a majority voting function and that errors by one
decision tree are independent of errors by all other decision trees.
From \( k = 50\%-100\% \), the error will increase from \( e(h) \) to
\( 50\% \) at a rate dependent on the value of \( q \), the number of
feature sets. Each decision tree error can be viewed as flipping a
random coin with some bias. If enough coins come up with the wrong
label, they will overwhelm the good predictors in the vote. The
probability of error is distributed as the cumulative distribution of
a binomial tail.
The algorithm for the second extreme dominates the first one, always
doing at least as well under any \( k\% \) feature set corruption and
often doing significantly better. This motivates us to create a
learning algorithm using a mixture of experts with each expert
focusing on a different set of features.
\takeout{The second extreme case occurs when the model uses all features to
predict the label. It's difficult to know how this will effect
accuracy without making assumptions about the learning algorithm.
Assume that a decision tree is used on each feature with the resulting
tree outputs passed through a majority voting function and that errors
by one decision are independent of errors by all other decision trees.
From \( k = 0\%-100\% \), the error will increase from \( e(h) \) to
\( 50\% \) at a rate dependent on the value of \( m \), the number of
features. Each decision tree error can be viewed as flipping a random
coin with some bias. If enough random coins come up with the wrong
label, they will overwhelm the good predictors in the vote. The
probability of error is distributed as the cumulative distribution of
a binomial tail.
The second learning algorithm dominates the first one, always doing at
least as well under any \( k\% \) feature corruption and often doing
significantly better. We are motivated to create a learning algorithm
using a mixture of experts with each expert focusing on different
features.}
\section{FeatureBoost}
\label{sec:featureboost}
The algorithm we present is a variant of boosting where features are
boosted rather than examples. In boosting, a base learning algorithm
{\sc Learn} is called multiple times. Each time it is presented with
a different distribution over the training examples. After each step,
this distribution is altered to increase the emphasis on ``harder''
parts of the space. At the end, the different hypotheses are combined
into a single hypothesis. Boosting algorithms alter the distribution
by emphasizing particular training examples. FeatureBoost alters the
distribution by emphasizing particular features. Think of examples as
a matrix where each row is an example. Where boosting emphasizes rows
(examples) in the matrix, FeatureBoost emphasizes columns (features)
in the matrix instead. Similar work with k nearest neighbors has
appeared \cite{knnrandomfeatsel}, though the goals and algorithms
differ considerably.
The goal of FeatureBoost (see Table~\ref{t:featureboost}) is to search
for alternate hypotheses amongst the features. A distribution over
{\em features} $p_t$ is updated at each iteration $t$ by conducting a
sensitivity analysis on the features used by the model learned in the
current iteration. The distribution is used to increase the emphasis
on unused features in the next iteration in an attempt to produce
different sub-hypotheses. This is repeated, and the sub-hypotheses
are combined into a meta-hypothesis that should be more robust. The
intuition behind this combination is as for AdaBoost
\cite{AdaBoost} (though we are unable to provide strong
theoretical guarantees as for AdaBoost). The update factor
$\beta_t$ decreases with $\epsilon_t$, and in turn increases the
weight $\ln(1/\beta_t)$ associated with the final hypothesis. Thus,
more accurate hypotheses have more influence on the final
meta-hypothesis.
\begin{table}[t]
\caption{Pseudocode describing the FeatureBoost algorithm.}
\label{t:featureboost}
\vskip 0.15in
\begin{center}
\codebox{%
\< $\procdecl{FeatureBoost}((x,y)^M,\proc{Learn},T)$
\\ {\bf Input}: \>\> $(x,y)^M$: $M$ examples with $N$ features
\\ \>\>$\proc{Learn}$: Learning algorithm
\\ \>\>$T$: Number of iterations
\\ \li \For $i \in \{1...N\}$ {\bf do} $w^1_i$ $\gets$ $\frac{1}{N}$
\\
\\ \li \For $t$ $\in$ $\{1...T\}$ {\bf do}
\\ \li \> $p^t$ $\gets$ $w^t/{\sum^N_{i=1} w^t_i}$
\\ \li \> $(dx,y)^M,\epsilon_{fm}$ $\gets$
\\ \> \hspace{0.1in} $\proc{Deemphasize}((x,y)^M,p^t,t,T,500,\epsilon_{t-1},h_{t-1})$
\\ \li \> $h_t$ $\gets$ $\proc{Learn}((dx,y)^M)$
\\ \li \> $I^t$ $\gets$ $\proc{Importance}((x,y)^M,h_t,500)$
\\ \li \> $\epsilon_t$ $\gets$ $\sum^M_{i=1} | h_t(x_i) - y_i |$
\\ \li \> $\beta_t$ $\gets$ $\max({\epsilon_t}/({1-\epsilon_t}),0.01)$
\\ \li \> \For $n \in \{1...N\}$ {\bf do}
$w^{t+1}_n$ $\gets$ $w^{t}_n$ + $I^t_n*(\epsilon_{fm} - \epsilon_t)$
\\
\\ {\bf Output}: \>\>~$h_f(i) = \sum^T_{t=1}(\log \frac{1}{\beta_t})h_t(i)
\ge
\frac{1}{2}\sum^T_{t=1}\log \frac{1}{\beta_t}$
}
\end{center}
\vskip -0.35in
\end{table}
We calculate the importance of individual features by repeatedly
picking a random training example and assigning a random value to the
feature according to the distribution of values for that feature in
the training set. The error of the hypothesis on this example is then
calculated. After many iterations (we use $500$), the change in the
average error of the hypothesis is detected. These error changes
yield an importance vector over features that is scaled to have
entries in $ (0,1) $. Pseudocode for the importance calculation is
provided in Table~\ref{t:importance}.
\begin{table}[t]
\caption{Pseudocode describing the importance calculation algorithm.}
\label{t:importance}
\vskip 0.15in
\begin{center}
\codebox{%
\< $\procdecl{Importance}((x,y)^M, h, T)$
\\ {\bf Input}: \>\> $(x,y)^M$: $M$ examples with $N$ features
\\ \>\>$h$: A hypothesis labeling examples
\\ \>\>$T$: Number of iterations
\\ \li \For $n \in \{1...N\}$ {\bf do} $c_n$ $\gets$ $0$
\\
\\ \li \For $n$ $\in$ $\{1...N\}$ {\bf do}
\\ \li \> \For $t$ $\in$ $\{1...T\}$ {\bf do}
\\ \li \> \> $x_1$ $\gets$ a random example from $x^M$
\\ \li \> \> $v$ $\gets$ $h(x_1)$
\\ \li \> \> $x_2$ $\gets$ a random example from $x^M$
\\ \li \> \> $x_1[n]$ $\gets$ $x_2[n]$
\\ \li \> \> \If $h(x_1) \neq v$ {\bf then}
\\ \li \> \> \> $c_n$ $\gets$ $c_n + 1$
\\ \li \For $n \in \{1...N\}$ {\bf do}
$c_n \gets c_n / T$
\\
\\ {\bf Output}: \>\>\> $c$: the ``importance'' of each feature
}
\end{center}
\vskip -0.35in
\end{table}
\takeout{The importance of the individual features is then used to update the
distribution over the features in step 8 of FeatureBoost. Intuitively,
we want to create a difference between the distributions $p^t$ and $p^{t+1}$
so that {\bf john, say this right?: individual hypotheses would be denser
around ``hard'' feature sets, and sparse around ``easy'' feature sets.} }
We have experimented with several approaches to the {\sc Deemphasize}
function for biasing {\sc Learn} by the distribution over the features
$p_t$ in Step 4 of Table~\ref{t:featureboost}. Options range from
``hard'' (e.g., removing features) to ``soft'' (e.g., scaling or
adding noise to individual features). In this paper we use a ``hard''
method for progressive feature removal that is applicable to any
learning algorithm.
We calculate the current full marginalization error $\epsilon_{fm}$ from
$h_{t-1}$ before calling {\sc Learn}. $\epsilon_{fm}$
is defined as the error when every feature is picked randomly. This
gives an upper bound on the error of $h_{t-1}$. We then marginalize
features from the training data, weighted by the distribution over the
features, until the marginalized performance of $h_{t-1}$ is worse
than \( ((T-t)*\epsilon_{t-1} + t*\epsilon_{fm})/T \). This process
seeks a feature set with marginalized performance worse than
$\epsilon_{t-1}$, and approaching $\epsilon_{fm}$. In practice, we
observe that occasionally it is too easy or too hard to do worse than
this threshold, so we also require that at least 15\percent\ of the
features are removed, and at least 10\percent\ of the features remain.
The pseudocode for the {\sc Deemphasize} algorithm and the helper
function {\sc Marginalize} are provided in Table~\ref{t:deemphasize}
and Table~\ref{t:marginalize}.
\begin{table}[!htp]
\vskip -0.2in
\caption{Pseudocode describing marginalize algorithm}
\label{t:marginalize}
\vskip 0.15in
\begin{center}
\codebox{%
\< $\procdecl{Marginalize}((x,y)^M, p, K, S, h)$
\\ {\bf Input}: \>\> $(x,y)^M$: $M$ examples with $N$ features
\\ \>\>$p$: A distribution over features
\\ \>\>$K$: number of features to marginalize over
\\ \>\>$S$: Total number of statistical iterations
\\ \>\>$h$: previous hypothesis
\\ \li $\epsilon_{m}$ $\gets$ $0$
\\
\\ \li \For $t$ $\in$ $\{1...S\}$ {\bf do}
\\ \li \> $x_1$ $\gets$ random example from $x^M$
\\ \li \> $v$ $\gets$ $h(x_1)$
\\ \li \> \For $n$ $\in$ $\{1...K\}$ {\bf do}
\\ \li \> \> $f$ $\gets$ draw feature without replacement from $p$
\\ \li \> \> $x_2$ $\gets$ random example from $x^M$
\\ \li \> \> $x_1[f]$ $\gets$ $x_2[f]$
\\ \li \> \If $h(x_1) \neq v$ {\bf then}
\\ \li \> \> $\epsilon_{m}$ $\gets$ $\epsilon_{m} + 1$
\\ \li $\epsilon_{m}$ $\gets$ $\epsilon_{m} / S$
\\
\\ {\bf Output}: \>\>\> $\epsilon_m$: the marginalized error
}
\end{center}
\vskip -0.35in
\end{table}
\begin{table}[!htp]
\caption{Pseudocode describing deemphasize algorithm}
\label{t:deemphasize}
\vskip 0.15in
\begin{center}
\codebox{%
\< $\procdecl{Deemphasize}((x,y)^M, p, t, T, S,\epsilon,h)$
\\ {\bf Input}: \>\> $(x,y)^M$: $M$ examples with $N$ features
\\ \>\>$p$: A distribution over features
\\ \>\>$t$: iteration number
\\ \>\>$T$: Total number of featureboost iterations
\\ \>\>$S$: Total number of statistical iterations
\\ \>\>$\epsilon$: previous hypothesis error
\\ \>\>$h$: previous hypothesis
\\ \li $\epsilon_{fm}$ $\gets$ $\proc{Marginalize}((x,y)^M,p,N,S,h)$
\\ \li $\epsilon_{m}$ $\gets$ $\epsilon$
\\ \li $K$ $\gets$ $0$
\\
\\ \li \While $\epsilon_m < ((T-t)*\epsilon + t*\epsilon_{fm})/T$ {\bf and} $\frac{K}{N} < 0.90$
\\ \> \> \>{\bf or} $\frac{K}{N} < 0.15$ {\bf do}
\\ \li \> $K$ $\gets$ $K+1$
\\ \li \> $\epsilon_m$ $\gets$ $\proc{Marginalize}((x,y)^M,p,K,S,h)$
\\ \li $(dx,y)^M$ $\gets$ $(x,y)^M$ with $K$ features removed using $p$
\\
\\ {\bf Output}: \>\>\> $(dx,y)^M, \epsilon_{fm}$
}
\end{center}
\vskip -0.35in
\end{table}
\section{Empirical Results}
We now demonstrate FeatureBoosting of artificial neural nets ({\sc
ANN}), k-nearest neighbor ({\sc kNN}), and decision trees ({\sc DT}).
For {\sc DT} we used IND \cite{buntine}. For {\sc ANN} we use
three layer backprop nets with 5 hidden units, conjugate gradient
descent, and early stopping with hold-out sets. For {\sc kNN} we use
unweighted Euclidean distance with $k=2$. We will contrast
FeatureBoost with the meta-learning algorithms {\sc Mixture} (a simple
mixture of experts) and {\sc AdaBoost} \cite{AdaBoost}.
%%\ref{f:res2}
\begin{figure*}[t]
\vskip 0.2in
\begin{tabular}{rccc}
& {\sc kNN}&{\sc ANN}&{\sc DT} \\
\rotate[l]{\hspace{0.2in} \sc Pneumonia}
&
\hbox{\psfig{file=knn_pneumoniaa.eps,width=2.0in,height=1.4in}} &
\hbox{\psfig{file=nn_pneumoniaa.eps,width=2.0in,height=1.4in}} &
\hbox{\psfig{file=ind_pneumoniaa.eps,width=2.0in,height=1.4in}} \\
\rotate[l]{\hspace{0.2in} \sc Threshold}&
\hbox{\psfig{file=knn_thresholda.eps,width=2.0in,height=1.4in}} &
\hbox{\psfig{file=nn_thresholda.eps,width=2.0in,height=1.4in}} &
\hbox{\psfig{file=ind_thresholda.eps,width=2.0in,height=1.4in}} \\
\rotate[l]{\hspace{0.3in} \sc Voting} &
\hbox{\psfig{file=knn_votinga.eps,width=2.0in,height=1.4in}} &
\hbox{\psfig{file=nn_votinga.eps,width=2.0in,height=1.4in}} &
\hbox{\psfig{file=ind_votinga.eps,width=2.0in,height=1.4in}} \\
\end{tabular}
\vskip 0.1in
\caption{Percent improvement over learning in three
domains ({\sc Pneumonia}, {\sc Threshold}, and {\sc Voting}) for three
learners ({\sc kNN}, {\sc ANN} and {\sc DT}) and for three ways
of generating experts. Results are presented with 95\percent
confidence intervals from 100 or more trials using 20
experts. The x axis is the percent random corruption of the individual
features. The y axis is the percent difference between the error of
{\sc Learn} and the error of the other methods.\label{f:res2}}
\vskip -0.2in
\end{figure*}
We test each meta-learning algorithm on three domains: the UCI Vote
domain, a real pneumonia problem, and a synthetic problem we created
to demonstrate FeatureBoosting. The Vote domain consists of a
congressman's voting record on 16 votes. The label is the party,
democrat or republican, which the congressman belongs to. In each
trial, 100 examples were used for training, 335 for testing. The
Pneumonia domain uses real patient data consisting of 30 pre-hospital
and 35 in-hospital features \cite{pneumonia}. The label is the risk
of death. In each trial 1000 examples were used for training, 2000 for
testing. In the synthetic domain, {\sc Threshold}, a threshold
function is drawn uniformly from the interval \([25,75]\).
%% Why not from the interval [0,100]?
Examples are drawn uniformly from \([0,100] \) and example features are
encoded in several ways: with a Gray code, a ``peaks'' code, a binary
code, a unary code, and the value divided by \( 100 \). Gray code
is similar to binary code except that only one feature changes at
a time when counting from \( 0 \) to \( 100 \). Peaks code consists
of \( 100 \) features with values calculated by a Gaussian centered
on the example value \cite{centerlines}. Unary code (``thermometer
code'') outputs as many \( 1 \)'s as the example value, then fills
in \( 0 \)'s to reach 100 features. 100 examples are used for training
and 300 for testing.
Our experiments investigate how robust each learning algorithm is to
random uniform feature corruption of the test data. For each case in
the test set, a random $n$ percent of the features are corrupted. The
features corrupted are chosen independently from case to case.
The results in Figure~\ref{f:res2} suggest that FeatureBoost dominates
AdaBoost and simple mixtures of experts when features in the test set
are corrupted. Simple mixtures of experts improve robustness because a
mixture of experts typically uses more features than {\sc Learn}. The
improvement is less than FeatureBoost, however, because a simple
mixture of experts does not focus learning to use different sets of
features. The graphs also suggest that the benefit of FeatureBoost is
most pronounced for {\sc DT} and least pronounced for {\sc kNN}. This
is expected because {\sc DT} is biased to use few features, whereas
{\sc kNN} with unweighted Euclidean distance uses all features.
\begin{figure*}[t]
\vskip 0.2in
\centerline{
%\hbox{\psfig{file=ind_largethresholda.eps,width=2.5in}}
%\hbox{\psfig{file=ind_largethresholdb.eps,width=2.5in}}
\hbox{\psfig{file=ind_large.eps,width=\textwidth,height=2.0in}}
}
\vskip 0.1in
\caption{The improvement from boosting {\sc DT} by FeatureBoost,
Mixture Models, and AdaBoost on {\sc Threshold}. On the left,
individual features are randomly corrupted. On the right, corruption
happens to entire {\em sets} of features. Results are presented with
95\percent confidence intervals from 100 trials using 20 experts. The
y axis is the percent difference between the error of {\sc Learn}
and the error of other methods (higher is better). \label{f:res1}}
\vskip -0.2in
\end{figure*}
The synthetic domain ({\sc Threshold}) allows us to corrupt features
in different ways. Here the input is encoded in several redundant
ways: with a Gray code, a ``peaks'' code, a binary code, a unary code,
and the value divided by \( 100 \). Thus there are multiple disjoint
subsets of features that can predict the target. Figure~\ref{f:res1}
shows the performance of FeatureBoost on {\sc Threshold} using
decision trees. The graphs compare the error of ({\sc DT}) to that of
the voting algorithms, and show how robust each method is to corrupted
test data. However, the test sets are now corrupted two different
ways: the left hand graph is as before, where each example has a
percentage $n$ of its features selected at random for corruption. In the
right hand graph, each example has some percentage $n$ of the feature
{\em sets} (e.g., Gray code, binary code, etc.) corrupted. This feature
set corruption simulates clusters of features tending to be occluded
or corrupted together, as when parts of an image are occluded, or when
an event such as not being admitted to the hospital causes an entire
set of features to be missing. Both the AdaBoost and Mixture voting
methods are hit hard by such grouped mutilation --- FeatureBoost,
while weakened, still demonstrates its robustness.
\section{Discussion}
FeatureBoost addresses the problem of how to benefit from redundancy
in input features. Most machine learning methods are lazy and learn
``abridged'' models. Where there is redundancy, a learning algorithm
should be capable of learning accurate models using any of a number of
different subsets of features. But most do not. Instead, they learn
the simplest models that are accurate on the training data. Because
simple models often depend on fewer features, and fail to exploit
redundancy in the input features, they are not very robust when
some features are missing, occluded, or corrupted.
FeatureBoost is a meta-learning algorithm that makes the underlying
learning algorithm less abridged by learning multiple models that use
different (possibly overlapping) sets of features. The ultimate
unabridged learning procedure would be to train separate models on the
power set of the input features, and combine these models weighted by
their estimated accuracy. This approach is impractical, of course,
for problems having more than a few attributes as the power set is too
large.
Despite the success of FeatureBoost on the test problems, there are
difficulties using it. The optimal schedule for biasing feature use
depends on the learning algorithm and target function. FeatureBoost
can be considered a heuristic search through the space of ``ideal''
subsets of features. The goal of this search is to find diverse sets
of explanations for the output label. If, as with the synthetic
threshold function, features are encoded with multiple redundant
disjoint subsets of inputs, we would not want to use the same feature in
two different models. In real world problems this separation is rarely
so clean and it may be best to use overlapping subsets of features.
We presented a ``hard'' version of {\sc Deemphasize} in
FeatureBoost. We can also conceive of ``soft'' versions. All the
learning algorithms we examined allow other ways of deemphasizing features.
\begin{ecompact}
\item For Neural Nets, multiply the value of a feature by its
emphasis. While it is theoretically possible that this alteration
does not change the learned hypothesis, in practice the hypothesis does
vary.
\item For k-nearest neighbor, use a weighted inner product.
\item For decision trees, multiply the information gain of a feature
by that feature's weight before comparing it to other features.
\end{ecompact}
%% John: We didn't actually do the right thing for a decision tree.
%% Do we want to rerun the decision tree experiments doing the right
%% thing?
The difference between corrupted features and missing features is
important. If values are missing, a technique similar to ``sleeping
experts \cite{sleeping}'' may be more effective. In this setting,
there are many sub-hypotheses that taken together are combined to form
a more robust hypothesis using weighted majority where only hypotheses
with no missing values vote. A softer form of this where
sub-hypotheses vote with strength proportional to their confidence may
work well in real world settings.
\takeout{
Just as there are several algorithms for boosting examples, we believe
there will be several algorithms for boosting features. FeatureBoost
is one way of creating more robust models. FeatureBoost should be most
effective when applied to learning algorithms such as decision trees
that are biased to learn models using a small number of features:
decision trees often use only a fraction of the available features.
Our results, however, show that FeatureBoost also is effective with
artificial neural nets and kNN.
}
Just as there are several algorithms for boosting examples, we believe
there will be several algorithms for boosting features. In fact, we
can augment AdaBoost to make it more robust. If we corrupt {\em
training} examples before using AdaBoost, AdaBoost eventually places
emphasis on examples where the main features have been corrupted, and
thus learns models that depend on other features. We find that
AdaBoosting corrupted training sets performs well when the corruption
in the training set is identical to the corruption in the test set,
but can perform poorly when the test set is corrupted differently than
the training set.
\takeout{AdaBoosting corrupted training data is not always
as robust as we would like.}
Combining AdaBoost with FeatureBoost may
yield the best of both methods. Note that while we have begun to
develop a theoretical framework for robust learning, we do not yet
have theoretical guarantees for FeatureBoost comparable to those for
AdaBoost.
\section{Related Work}
Pomerleau noted that ALVINN nets learned models that failed when
unexpected new features arose (guardrails) or expected features were
obscured (road edges):
\begin{quote}
``The appearance or disappearance of irrelevant features can disrupt a
network's driving when the network's training did not demonstrate
their irrelevance\ldots While they may obscure or replace features
which appear on a `normal' image, there remain enough features in
the image to at least in theory make driving using the same network
possible\ldots The reason that this type of transitory disturbance
causes trouble is that each network is trained over a relatively short
stretch of road ($<2$ miles). As a result, during training the network
is not exposed to all possible driving situations it might encounter
when driving autonomously'' \cite{alvinn}.
\end{quote}
Clearly our motivation for robust learning is very similar to
Pomerleau's. He devised a method called ``structured noise'' that
bears some resemblance to the process of corrupting selected features
with noise in FeatureBoost. His structured noise method added or
subtracted (occluded) localized 2D regions in road images in the
training set as a way of biasing the network to be more robust to
transitory feature inclusion and occlusion. Unlike the FeatureBoost
meta-algorithm, his goal was to train one network to be more robust,
not to train multiple models to use different subsets of features. We
suspect that FeatureBoost could benefit from Pomerleau's method of
corrupting localized 2D regions in images when applying FeatureBoost
to image recognition problems.
\takeout{
One of the reasons why we compare FeatureBoost to AdaBoost is because
AdaBoost should improve robustness in some circumstances. Suppose
when training ALVINN nets to steer, most of the training data contains
unobstructed views of the left and right sides of the road, and only a
few training examples have obstructed views of the sides of the road.
The first models trained with AdaBoost will use the sides of the road
as their main features. AdaBoost, however, will then focus it's
effort on training examples where these models fail. In these training
examples, the centerlines may be the next best features. AdaBoost
probably will not learn to use centerlines as well as FeatureBoost,
however, because AdaBoost will learn about centerlines only from the
small fraction of training examples in which edges are not visible. If
centerlines are visible in most images, including those with
unobstructed views of road edges, FeatureBoost will learn about
centerlines using the full training set. Combining AdaBoost with
FeatureBoost may yield the best of both methods. Note that while we
have begun to develop the theoretical framework for unabridged
learning and FeatureBoost, we do not yet have theoretical guarantees
for FeatureBoost comparable to those for AdaBoost.
}
Robust learning is related to the UAV (unspecified attribute value)
\cite{GKS97} and RFA (restricted focus-of-attention) \cite{BD98}
models of learning studied in the Computational Learning Theory
literature. The UAV model is a query model in which both the training
and test data may have missing features, and given a
partially-specified example, the goal is to answer whether or not all
completions of it have the same label (and to output the correct label
if they do). In RFA learning, the learning algorithm is only allowed
to examine a small number of features in each training example, and
its goal is to produce a hypothesis that has low PAC-style error over
fully specified data. Our framework is similar to the RFA setup,
except the roles of training and test data are reversed, and we do not
assume that the learning algorithm can choose which features are missing.
\section{Future Work}
The framework we present for robust learning does not encompass all
ways in which input features might differ in the train and test
distributions. FeatureBoost assumes that this difference can be
modelled as random corruption and elimination of selected features.
While this is an effective means of focussing the learner's attention
{\em away} from some features, it probably is not an accurate model of
how features get corrupted in the real world. For example, features
in images tend to be corrupted in connected regions. In medicine,
procedures may return results for dozens of measurements at a time.
In both examples, features come or go in {\em clumps}. The way in which
corrupted features clump depends on the domain. It may be useful to
devise specialized versions of robust learning where the data
corruption model better fits the processes that affect features in
each domain.
An alternate approach to robust learning that we are examining uses
feature selection to train models on compact sets of features. It
then removes the selected features, and trains additional models on
the remaining features. This approach may better model domains such
as medicine where features being missing is more common than features
being corrupted or occluded.
\takeout{
Processes that corrupt or hide features are not the only models for
robust learning. Robust learning might also be thought of as PAC
learning with a loss function that varies with the examples. For
example, when learning to drive a car, you don't care so much about
how perfectly smooth the car drives in normal situations, but you care
very much about how well it avoids crashes in difficult situations. A
learning algorithm that is told which examples are riskier might also
create more robust hypotheses.
}
FeatureBoost, which is based on one model of robust learning, has
limitations. One limitation is that the current version of
FeatureBoost is restricted to classification problems. This is why we
were unable to test FeatureBoost on the autonomous vehicle steering
problem that initially motivated our investigation. We are currently
developing an extension to FeatureBoost that can handle regression
problems.
We are not yet able to provide theoretical guarantees for FeatureBoost
similar to those available for boosting algorithms such as AdaBoost.
Part of the difficulty stems from the fact that robust learning is
most useful when the train and test distribution differ in ways that
may be difficult to characterize. One of our goals is to find
restricted (though possibly less realistic) models of robust learning
where we will be able to make strong theoretical guarantees.
\section{Summary}
Most machine learning algorithms extract the minimum information from
the training set they need to accurately predict the labels. Once they
learn a model that performs well on the training data, they stop
learning. This {\em laziness} makes the models they learn less robust
on future test cases that have missing or occluded features.
Often there is redundancy in the inputs that could have been exploited
to learn models more robust to the loss or corruption of some input
features. A learning algorithm that was less lazy would try to learn
as many different models from the available inputs as possible.
Motivated by this, we introduced a general learning framework called
{\em robust learning}. We then presented a meta-learning algorithm
called FeatureBoost that makes the machine learning algorithm it
is applied to more robust. We demonstrated FeatureBoost with three
different learning methods: backprop nets, k-nearest neighbor, and
decision trees. The results suggest FeatureBoost can use these
methods to learn models that are more robust to the loss of important
features in the test data.
\bibliography{ml2krobust}
\bibliographystyle{mlapa}
\end{document}
### scrap
\takeout{
\begin{table}[t]
\caption{
\label{t:res1}
\begin{center}
\small{
\begin{tabular}{|l|c|c|c|c|c|c|c|c|c|c|}
\hline
Domain & Corrupt & \multicolumn{3}{|c|}{\sc kNN}&\multicolumn{3}{|c|}{\sc Neural}&\multicolumn{3}{|c|}{\sc IND} \\
& & F & A & M & F & A & M & F & A & M \\
\hline
&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent\\
{\sc Pneumonia}&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent\\
&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent\\
\hline
&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent\\
{\sc Threshold}&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent\\
&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent\\
\hline
&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent\\
{\sc Voting} &0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent\\
&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent&0.0\percent\\
\hline
\end{tabular}
}
\takeout{
\small{
\begin{tabular}{|l|c|c|c|c|c|c|c|c|c|c|}
\hline
Domain & Corruption & \multicolumn{3}{c}{\sc }&\multicolumn{3}{c}{\sc Neural}&\multicolumn{3}{c}{\sc IND} \\
& & Feature & AdaBoost & Mixture & Feature & AdaBoost & Mixture & Feature & AdaBoost & Mixture \\
\hline
& & & & & & & & & & \\
{\sc Pneumonia} & & & & & & & & & & \\
& & & & & & & & & & \\
\hline
& & & & & & & & & & \\
{\sc Syn-Threshold}& & & & & & & & & & \\
& & & & & & & & & & \\
\hline
& & & & & & & & & & \\
{\sc Voting} & & & & & & & & & & \\
& & & & & & & & & & \\
\hline
\end{tabular}
}
}
\end{center}
\end{table}
}
\takeout{
\codebox{%
\< $\procdecl{FeatureBoost}()$
\\ \< {\bf Input}:
\\ \>$N$ examples each containing $M$ features
\\ \>\> $N$ labels, one for each example
\\ \> Weak Learning algorithm ``Learn''
\\ \> Number of iterations, $T$
\\ \< {\bf Output}:
\\ \>A hypothesis, $H(x_1,...,x_M)$
\\ \< {\bf Algorithm}:
\\ \li $\{W_1,...,W_M\}$ $\gets$ $\{1/M,...,1/M\}$
\\ \li \For $t$ $\gets$ $1..T$
\\ \li \Do $\id{H[t]}$ $\gets$ $\proc{Learn}(e, l)$
}
}
\takeout{
\begin{minipage}[l]{2.5in}
\codebox{%
\< $\procdecl{AdaBoost}(e,D,Learn,T)$
\\ \< {\bf Input}:
\\ \< $N$ examples each containing $M$ features
\\ Distribution $D$ over examples (say, uniform)
\\ \< Weak Learning algorithm ``Learn''
\\ \< Number of iterations, $T$
\\ \< {\bf Algorithm}:
\\ \li $w^1_i$ $\gets$ $D(i)$, $i \in \{1..N\}$
\\ \li \For $t$ $\in$ $\{1..T\}$
\\ \li \Do $p^t$ $\gets$ $\frac{w^t}{\sum^N_{i=1} w^t_i}$
\\ \li \>\> $h_t$ $\gets$ $\proc{Learn}(p^t)$
\\ \li \>\> $\epsilon_t$ $\gets$ $\sum^N_{i=1} p^t_i \mid h_t(i) - c(i) \mid$
\\ \li \>\> $\beta_t$ $\gets$ $\epsilon_t/(1-\epsilon_t)$
\\ \li \>\> $w^{t+1}_i$ $\gets$ $w^{t}_i$ $\beta^{1-\mid h_t(i) - c(i) \mid}_t$
\\ \< {\bf Output}:
\\ \< $h_f(i) = \sum^T_{t=1}(\log \frac{1}{\beta_t})h_t(i)
\ge
\frac{1}{2}\sum^T_{t=1}\log \frac{1}{\beta_t}$
}
\end{minipage}
\begin{minipage}[r]{2.4in}
\codebox{%
\< $\procdecl{FeatureBoost}(e,D,Learn,T)$
\\ \< {\bf Input}:
\\ \< $N$ examples each containing $M$ features
\\ Distribution $D$ over features (say, uniform)
\\ \< Weak Learning algorithm ``Learn''
\\ \< Number of iterations, $T$
\\ \< {\bf Algorithm}:
\\ \li $w^1_i$ $\gets$ $D(i)$, $i \in \{1..M\}$
\\ \li \For $t$ $\in$ $\{1..T\}$
\\ \li \Do $p^t$ $\gets$ $\frac{w^t}{\sum^N_{i=1} w^t_i}$
\\ \li \>\> $h_t$ $\gets$ $\proc{Learn}(p^t)$
\\ \li \>\> $\epsilon_t$ $\gets$ $\sum^N_{i=1} p^t_i \mid h_t(i) - c(i) \mid$
\\ \li \>\> $\beta_t$ $\gets$ $\epsilon_t/(1-\epsilon_t)$
\\ \li \>\> $w^{t+1}_i$ $\gets$ $w^{t}_i$ $\beta^{1-\mid h_t(i) - c(i) \mid}_t$
\\ \< {\bf Output}:
\\ \< $h_f(i) = \sum^T_{t=1}(\log \frac{1}{\beta_t})h_t(i)
\ge
\frac{1}{2}\sum^T_{t=1}\log \frac{1}{\beta_t}$
}
\end{minipage}
}
\subsubsection*{Acknowledgements}
Use an unnumbered third-level heading for the acknowledgements. All
acknowledgements go at the end of the paper, just before the
references.
\subsubsection*{References}
References follow the acknowledgements. Use an unnumbered third-level
heading for the references. APA citation style is preferred, but any
choice of citation style is acceptable as long as you are consistent.
Alspector, J., Gupta, B.~\& Allen, R.~B.~(1989). Performance of a
stochastic learning microchip. In D. S. Touretzky (ed.), {\it Advances
in Neural Information Processing Systems 1}, 748-760. San Francisco, CA:
Morgan Kaufmann.
Rosenblatt, F.~(1962). {\it Principles of Neurodynamics.} Washington,
D.C.: Spartan Books.
Tesauro, G.~(1989). Neurogammon wins computer Olympiad. {\it Neural
Computation} {\bf 1}(3):321-323.
\end{document}