\documentclass[11pt]{article}
\usepackage{fullpage}
\usepackage{subfigure}
\usepackage{graphics}
\usepackage{epsfig}
\begin{document}
\thispagestyle{empty}
\parindent 0pt
\parskip 10pt
\begin{center}
\Huge\bf
Homework \#\,\,1 Answers\\
\Large\bf
15-486/782: Artificial Neural Networks\\
Dave Touretzky, Fall 2006\\[.2in]
\end{center}
{\bf Problem 1.} Using inequalities expressed in terms of the
weights $w_0$, $w_1$, and $w_2$, prove that the perceptron
cannot solve the following classification problem:
$$ Patterns = \left[\begin{array}{ccc}
2 & 1 & 3 \\ 6 & 3 & 9 \end{array}\right]
\quad\quad\quad\quad\quad\quad
Desired = \left[\begin{array}{ccc} 0 & 1 & 1 \end{array}\right] $$
\noindent
{\bf Answer:} the proof is similar to the proof that perceptrons
cannot do XOR. Assume some set of weights exists that performs the
task correctly. Then:
\begin{center}
\begin{tabular}{ll@{\hspace*{.25in}}l}
1. & $ w_0 + 2\cdot w_1 + 6\cdot w_2 \leq 0 $ & pattern 1 \\
2. & $ w_0 + w_1 + 3\cdot w_2 > 0 $ & pattern 2 \\
3. & $ w_0 + 3\cdot w_1 + 9\cdot w_2 > 0 $ & pattern 3 \\
4. & $ 2\cdot w_0 + 4\cdot w_1 + 12\cdot w_2 > 0 $ & (2) + (3) \\
5. & $ 2\cdot w_0 + 4\cdot w_1 + 12\cdot w_2 \leq 0 $ & 2 $\times$ (1) \\
& \\
\multicolumn{2}{l}{Lines 4 and 5 are mutually contradictory.}
\end{tabular}
\end{center}
{\bf Problem 2.} The proof of the perceptron convergence theorem
states that if $\hat{w}$ is a weight vector that correctly classifies
all patterns, and $\overline{w}^{(\tau)}$ is the weight vector at step
$\tau$ of the algorithm, then $\hat{w}\cdot\overline{w}^{(\tau)}$
increases at each step. Modify the \verb+perceptron+ program to
demonstrate this by displaying the value at each step. Turn in your
source code and a sample run.
{\bf Answer:} We are given the values for {\tt slope} and {\tt yint},
which define the decision boundary. For points in class 1 we have
$$ x_2 - \hbox{\tt slope}\cdot x_1 - \hbox{\tt yint} > 0 $$
The perceptron computes $$w_0 + w_1 x_1 + w_2 x_2 > 0$$
From this last equation we can see that $w_0 = -\hbox{\tt yint}$, $w_1
= -\hbox{\tt slope}$, and $w_2 = 1$. So the correct weight vector
$\hat{w}$ is given by [\hbox{\tt-yint}~\hbox{\tt -slope}~1].
The code on the following page displays both the dot product and the
angle in degrees between the correct weight vector and the current
weight estimate. You will note that the dot product increases with
every iteration, as the Perceptron Convergence Theorem requires. The
angle does not always decrease, but the general trend is downward.
\newpage
\begin{verbatim}
% perceptron - Perceptron demo with random training set.
% Calculate and plot the training set.
NPATS = 10+floor(rand(1)*30);
Patterns = rand(2,NPATS)*2-1;
slope = log(rand(1)*10);
yint = rand(1)*2-1;
Desired = Patterns(2,:) - Patterns(1,:)*slope - yint > 0;
%Calculate the correct weight vector
w_hat = [-yint -slope 1];
PlotPats(Patterns,Desired)
Inputs = [ones(1,NPATS); Patterns];
Weights = [0 0 0];
for i = 1:50
% Calculate and display the dot product
dotprod = dot(Weights,w_hat);
angl = acos(dotprod/max(1e-10,(norm(Weights)*norm(w_hat))))/pi*180;
fprintf('%2d. dot=%7.4f, ang=%7.4f Weights = ',i,dotprod,angl);
disp(Weights);
Result = (Weights * Inputs) > 0;
if Result == Desired, break, end
Weights = Weights + (Desired-Result) * Inputs';
PlotBoundary(Weights,i,0)
pause(1)
end
PlotBoundary(Weights,i,1)
\end{verbatim}
\vspace*{1em}
\begin{verbatim}
>> percepans
1. dot= 0.0000, ang=90.0000 Weights = 0 0 0
2. dot= 6.3510, ang=70.3604 Weights = 8.0000 -1.8914 0.9488
3. dot=17.6129, ang=42.7304 Weights = -6.0000 -8.5063 1.3795
4. dot=18.8872, ang=12.3858 Weights = -1.0000 -7.8111 3.1145
5. dot=20.1615, ang=21.5806 Weights = 4.0000 -7.1158 4.8496
6. dot=25.1815, ang=30.1488 Weights = -5.0000 -11.3636 2.9161
7. dot=26.4557, ang= 5.4394 Weights = 0 -10.6683 4.6512
8. dot=26.6684, ang= 8.4328 Weights = 2.0000 -9.7617 6.3326
9. dot=26.8576, ang= 4.9860 Weights = 1.0000 -10.0758 6.0721
\end{verbatim}
{\bf Problem 3.} Run the \verb+bowl+ demo with learning rates of
0.1, 0.142, and 0.15. Hand in a printout of each run. What can you
say about the model's behavior at each learning rate?
{\bf Answer:} See Figure \ref{fig:prob3} for the results of running
\verb+bowl+ with learing rates of 0.01, 0.1, 0.142, and 0.15. With learning
rate = 0.01, changes in the weights, at each iteration, are small.
Using a learning rate of 0.1, the convergence is much faster
(requiring fewer iterations), but otherwise the behavior is similar.
With a learning rate of 0.142, the sucessive weight vectors oscillate
due to an overcorrecting of the error, but eventually the weights
settle to the optimal solution (bottom of the bowl). The algorithm is
unstable with a learning rate of 0.15; each iteration drives the
weights further away from the optimal solution.
{\bf Problem 4.} Consider the function $f(x,y) = \exp
\left(-(0.6x-0.7)^2-(y-0.4)^2\right)$. (a) Train an LMS network to
approximate this function over the unit square ($0\leq x\leq 1$,
$0\leq y\leq 1$). (b) What is the shape of the approximation
function? (c) By looking at the weights of your trained network, you
can see the first degree polynomial that the neural network has
devised to approximate $f$. Write down this polynomial, and hand it
in along with the code you wrote to solve this problem.
{\bf Answer:}
The approximation function is the plane: $z = 0.3488x-0.1368y+0.6431$. Figure
\ref{fig:prob4} shows the function $f(x,y)$ and its linear
approximation. The code is presented on a subsequent page.
{\bf Problem 5.}
How much information does it take to describe a two-input perceptron?
The classical description uses a vector of three real-valued
parameters: $\vec{w} = \langle w_0, w_1, w_2 \rangle$. But the
perceptron's decision boundary is a line, which can be uniquely
specified with just two parameters, e.g., slope and intercept.
Jack says: ``A perceptron can be described with less information than
three real numbers. Here's how I would do it: set $s_0=w_0/w_2$, and
$s_1=w_1/w_2$. From the description $\langle s_0, s_1 \rangle$, I can
construct a weight vector $\langle s_0, s_1, 1\rangle$ that describes
a line with the same slope and intercept as the one described by
$\vec{w}$. So it should behave the same as $\vec{w}$ for all
inputs.''
Jill replies: ``A perceptron requires more than that to describe. A
line is not an inequality. Consider the case where $w_2$ is negative.
How will that effect the results of your transformation?''
(a) Whose claim is correct, and why? (b) How much information does it
really take to correctly describe a two-input perceptron? (Don't worry
about the case where $w_2=0$.)
{\bf Answer.} Jill's claim is closer to correct. Jack has the right
idea initially when he says that three real numbers is overkill; the
description of the line is overdetermined since $w_0$, $w_1$ and $w_2$
are not independent. But in settling on just two real numbers he is
throwing away too much information, because the vectors $\vec{w}$ and
$-\vec{w}$ have the same representation in Jack's scheme, even though
they produce opposite results. (When $w_2$ is negative, dividing
through by $w_2$ requires us to exchange $\leq$ for $>$ and vice
versa, otherwise the resulting perceptron will misclassify every
training point.) So it would appear to take two real numbers plus a
sign bit to describe a two-input perceptron: two real numbers to
describe the line, as Jack proposes, and one bit to specify which side
of the line is class 1 and which is class 0.
However, there is an alternative solution which can do the job with
two real numbers: the decision boundary can be described by an angle
$\theta$ plus a $y$-intercept. Negating $\vec{w}$ alters $\theta$ by
$180^\circ$.
\begin{figure}
\centering
\begin{tabular}{c c}
\subfigure[Learning rate = 0.01]{\epsfig{file=prob3_01.eps,width=7cm}} &
\subfigure[Learning rate = 0.1]{\epsfig{file=prob3_1.eps,width=7cm}} \\
\subfigure[Learning rate = 0.142]{\epsfig{file=prob3_142.eps,width=7cm}} &
\subfigure[Learning rate = 0.15]{\epsfig{file=prob3_15.eps,width=7cm}}
\end{tabular}
\caption{Printouts for each learning rate for Problem 3.}
\label{fig:prob3}
\end{figure}
\begin{figure}
\centering
\epsfig{file=prob4.eps,width=14cm}
\caption{The function $f(x,y)$ and its linear approximation}
\label{fig:prob4}
\end{figure}
\newpage
\begin{verbatim}
% Answer to homework 1, problem 4.
[x,y] = meshgrid(0 : 0.1 : 1);
z = exp(-(0.6*x-0.7).^2-(y-0.4).^2);
Patterns = [x(:) y(:)]';
Desired = z(:)';
NINPUTS = size(Patterns,1);
NPATS = size(Patterns,2);
NUNITS = size(Desired,1);
Inputs = [ones(1,NPATS); Patterns];
InitWeights = rand(NUNITS,1+NINPUTS) - 0.5;
Weights = InitWeights;
PlotPats3D(Patterns,Desired)
xlabel('x'), ylabel('y'), zlabel('z')
surf(x,y,z)
axis([0 1 0 1 0 1]), view(110,30)
LearnRate = 0.01;
TSS = Inf;
for epoch = 0:2000,
if rem(epoch,5) == 0
PlotBoundarySurface(Weights)
pause(0.25)
end
NetIn = Weights * Inputs;
Result = NetIn;
Error = Result - Desired;
OldTSS = TSS;
TSS = sum(sum(Error.^2));
fprintf('%2d. TSS=%8.5f, Weights =',epoch,TSS);
disp(Weights);
if abs(TSS-OldTSS) < 0.00001
PlotBoundarySurface(Weights)
break;
end;
dW = - (Error * Inputs');
Weights = Weights + LearnRate * dW;
end
\end{verbatim}
\end{document}