\documentclass[12pt]{article}
\usepackage{scribe}
\Scribe{Ryan O'Donnell} \Lecturer{Ryan O'Donnell} \LectureNumber{2}
\LectureDate{Jan.~18, 2005} \LectureTitle{Linearity and the Fourier
Expansion}
\begin{document}
\MakeScribeTop
\section{Linearity}
What does it mean for a boolean function to be \emph{linear}? For
the question to make sense, we must have a notion of adding two
binary strings. So for now, when we think of boolean functions $f :
\{0,1\}^n \to \{0,1\}$ we will treat $\{0,1\}$ as the field $\F_2$.
This just means that $0 + 0 = 1 + 1 = 0$, $0 + 1 = 1 + 0 = 1$, $1
\cdot 1 = 1$, etc.
Now there are two well-known classical notions of being linear:
\begin{definition}\
(1)~$f$ is \emph{linear} iff $f(x+y) = f(x) + f(y)$ for all $x,y \in
\{0,1\}^n$.\\
(2)~$f$ is \emph{linear} iff there are some $a_1, \dots, a_n \in
\F_2$ such that $f(x_1, \dots, x_n) = a_1 x_1 + \cdots + a_n x_n$\\
\indent \qquad \qquad \quad\ \ $\Leftrightarrow$ there is some $S
\subseteq [n]$ such that $f(x) = \sum_{i \in S} x_i$.
\end{definition}
\noindent (Sometimes in (2) one allows an additive constant; we
won't, calling such functions \emph{affine}.)\\
Since these definitions sound equally good we may hope that they're
equivalent; happily, they are. Now $(2) \Rightarrow (1)$ is easy:
\[
\boldsymbol{(2) \Rightarrow (1):} \quad f(x + y) = \littlesumx_{i
\in S} (x+y)_i = \littlesumx_{i \in S} x_i + \littlesumx_{i \in S}
y_i = f(x) + f(y).
\]
But $(1) \Rightarrow (2)$ is a bit more interesting. The easiest
proof:
\begin{center}
\parbox{5in}{
$\boldsymbol{(1) \Rightarrow (2):}$ \quad Define $\alpha_i =
f(\overbrace{0, \dots, 0, 1, 0, \dots, 0}^{e^i})$. Note that
condition (1) forces $f(0) = f(0+0) = f(0) + f(0) = 0$, so $f(x_i
e^i) = x_i f(e^i)$ whether $x_i$ is $0$ or $1$. Repeated use of
condition (1) also implies $f(x^1 + x^2 + \cdots + x^n) = f(x^1) +
\cdots + f(x^n)$, so indeed
\[
f((x_1, \dots, x_n)) =
f(\littlesum x_i e^i) = \littlesum x_i f(e^i) = \littlesum \alpha_i
x_i.
\]
}
\end{center}
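Since everything here is finite, the equivalence can also be checked
mechanically for small $n$. Here is a brute-force Python sketch (all
names are ours) verifying it over every boolean function on $3$ bits:
\begin{verbatim}
from itertools import product

n = 3
points = list(product([0, 1], repeat=n))

def is_additive(f):
    # Condition (1): f(x + y) = f(x) + f(y), addition over F_2.
    return all(f[tuple((a + b) % 2 for a, b in zip(x, y))]
               == (f[x] + f[y]) % 2
               for x in points for y in points)

def is_parity(f):
    # Condition (2): f(x) = sum_{i in S} x_i (mod 2) for some S,
    # where S is encoded by its 0/1 indicator vector.
    return any(all(f[x] == sum(s * a for s, a in zip(S, x)) % 2
                   for x in points)
               for S in product([0, 1], repeat=n))

for values in product([0, 1], repeat=2 ** n):
    f = dict(zip(points, values))       # one of the 256 functions
    assert is_additive(f) == is_parity(f)
\end{verbatim}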
\subsection{Approximate Linearity}
Nothing in this world is perfect, so let's ask: What does it mean
for $f$ to be \emph{approximately linear}? Here are the natural
first two ideas:
\begin{definition}\
(1$'$)~$f$ is \emph{approximately linear} if $f(x+y) = f(x) + f(y)$
for \emph{most} pairs $x,y \in \{0,1\}^n$.\\
(2$'$)~$f$ is \emph{approximately linear} if there is some $S
\subseteq [n]$ such that $f(x) = \sum_{i \in S} x_i$ for \emph{most}
$x \in \{0,1\}^n$.
\end{definition}
Are these two equivalent? It's easy to see that $(2') \Rightarrow
(1')$ still essentially holds: If $f$ has the right value for both
$x$ and $y$ (which happens for most pairs), the equation in the $(2)
\Rightarrow (1)$ proof holds up.\\
The reverse implication is not clear: Take any linear function and
mess up its values on $e^1, \dots, e^n$. Now $f(x + y) = f(x) +
f(y)$ still holds whenever $x$ and $y$ are not $e^i$'s, which is
true for almost all pairs. But now the equation in the $(1)
\Rightarrow (2)$ proof is going to be wrong for very many $x$'s. So
this proof doesn't work --- but actually our $f$ \emph{does} satisfy
$(2')$, so maybe a different proof will work.\\
We will investigate this shortly, but let's first decide on $(2')$
as our official definition:
\begin{definition} $f, g : \{0,1\}^n \to \{0,1\}$ are
\emph{$\eps$-close} if they agree on at least a $(1-\eps)$-fraction
of the inputs $\{0,1\}^n$. Otherwise they are \emph{$\eps$-far}.
\end{definition}
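With functions given as callables on $\{0,1\}^n$, closeness is just
an agreement count; a small Python sketch (names ours):
\begin{verbatim}
from itertools import product

def agreement(f, g, n):
    # Fraction of x in {0,1}^n on which f and g agree.
    pts = list(product([0, 1], repeat=n))
    return sum(f(x) == g(x) for x in pts) / len(pts)

def eps_close(f, g, n, eps):
    # eps-close means agreement on >= a (1 - eps)-fraction.
    return agreement(f, g, n) >= 1 - eps
\end{verbatim}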
\begin{definition} $f$ is \emph{$\eps$-close to having property
$\calP$} if there is some $g$ with property $\calP$ such that $f$
and $g$ are $\eps$-close.
\end{definition}
A ``property'' here can really just be any collection of functions.
For our current discussion, $\calP$ is the set of $2^n$ linear
functions.\\
\subsection{Testing Linearity}
Given that we've settled on definition $(2')$, why worry about
definition $(1')$? Imagine someone hands you some black-box
software $f$ that is supposed to compute \emph{some} linear
function, and your job is to test it --- i.e., try to identify bugs.
You can't be sure $f$ is perfect unless you ``query'' its value on
all $2^n$ inputs, but perhaps you can become convinced $f$ is
$\eps$-close to being linear with many fewer queries.\\
If you knew \emph{which} linear function $f$ was supposed to be
close to, you could just check it on $O(1/\eps)$ many random values
--- if you found no mistakes, you'd be quite convinced $f$ was
$\eps$-close to linear.\\
Now if you just look at definition $(2')$, you might think that all
you can do is make $n$ linearly independent queries to first
determine which linear function $f$ is supposed to be, and then do
the above. (We imagine that $n \gg 1/\eps$.) But it's kind of
silly to use complexity $n$ to ``test'' a program that can itself be
implemented with complexity $n$. If $(1') \Rightarrow (2')$,
however, we would get a much more efficient test. Such a test was
suggested and proved correct by M.~Blum, Luby, and Rubinfeld in 1990:
\begin{definition} The ``BLR Test'': Given an unknown $f : \{0,1\}^n \to
\{0,1\}$:
\begin{itemize}
\item Pick $\x$ and $\y$ independently and uniformly at random from
$\{0,1\}^n$.
\item Set $\z = \x + \y$.
\item Query $f$ on $\x$, $\y$, and $\z$.
\item ``Accept'' iff $f(\z) = f(\x) + f(\y)$.
\end{itemize}
\end{definition}
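For concreteness, one round of the test in Python might look like
this (a minimal sketch; the names are ours, and \texttt{f} is any
black box we can query on $n$-bit tuples):
\begin{verbatim}
import random

def blr_round(f, n):
    # Pick x, y uniformly, set z = x + y over F_2, make 3 queries,
    # and accept iff f(z) = f(x) + f(y).
    x = tuple(random.randint(0, 1) for _ in range(n))
    y = tuple(random.randint(0, 1) for _ in range(n))
    z = tuple((a + b) % 2 for a, b in zip(x, y))
    return f(z) == (f(x) + f(y)) % 2
\end{verbatim}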
Clearly:
\begin{fact} If $f : \{0,1\}^n \to \{0,1\}$ is a linear function,
the probability that the BLR Test passes is $1$.
\end{fact}
In this lecture we will try to prove:
\begin{theorem} \label{thm:blr} Suppose $f$ passes the BLR Test with probability at
least $1 - \eps$. Then $f$ is $\eps$-close to being linear.
\end{theorem}
We'll finish the proof in the next lecture.\\
Given this theorem, suppose we do the BLR test $O(1/\eps)$ times.
If it never fails, we can be quite sure the true probability $f$
passes the test is at least $1 - \eps$ and thus that $f$ is
$\eps$-close to
being linear.\\
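Reusing \texttt{blr\_round} from the sketch above, a full tester
just repeats it; the constant $10$ and the example functions below
are merely illustrative choices for the $O(1/\eps)$ bound:
\begin{verbatim}
def blr_test(f, n, eps):
    # O(1/eps) independent rounds; reject as soon as one fails.
    return all(blr_round(f, n) for _ in range(int(10 / eps)))

parity = lambda x: (x[0] + x[2]) % 2            # a true linear function
majority = lambda x: int(sum(x) > len(x) // 2)  # far from linear
print(blr_test(parity, 5, 0.1))    # always True
print(blr_test(majority, 5, 0.1))  # almost always False
\end{verbatim}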
NB: BLR originally proved a slightly weaker result than
Theorem~\ref{thm:blr} (they lost a constant factor). We present the
'95 proof due to Bellare, Coppersmith, H{\aa}stad, Kiwi, and Sudan.
\section{The Fourier Expansion}
Suppose $f$ passes the BLR test with high probability. We want to
try showing that $f$ is $\eps$-close to some linear function. But
which one should we pick?\\
There's a trick answer to this question: We should pick the closest
one! But given $f : \{0,1\}^n \to \{0,1\}$, how can we decide which
linear function $f$ is closest to?\\
Imagine we stack the $2^n$ values of $f(x)$ in, say, lexicographical
order, and treat them as a vector in $2^n$-dimensional space,
$\R^{2^n}$:
\[
f = \left[\begin{array}{c} 0 \\ 1 \\
1 \\ 1\\ 0 \\ \vdots \\ 1\end{array}\right].
\]
\bigskip
\noindent Do the same for all $2^n$ linear (Parity) functions:
\[
\chi_\emptyset = \left[\begin{array}{c} 0 \\ 0 \\
0 \\ 0\\ 0 \\ \vdots \\ 0\end{array}\right], \chi_{\{1\}} = \left[\begin{array}{c} 0 \\ 1 \\
0 \\ 1\\ 0 \\ \vdots \\ 1\end{array}\right], \dots, \chi_{[n]} = \left[\begin{array}{c} 0 \\ 1 \\
1 \\ 0\\ 1 \\ \vdots \\ \ \end{array}\right]
\]
\begin{notation} $\chi_S$ is Parity on the coordinates in
set $S$. We also write $[n] = \{1, 2, \dots, n\}$.
\end{notation}
Now it's easy to see that the closest Parity to $f$ is the
physically closest vector.
\begin{figure}[h]
\begin{center}
\includegraphics[angle = 270, trim = 1in 0in 1.5in 0in, clip, width =
3in]{vects.eps}\\
$f$ is closest to $\chi_{S_1}$
\end{center}
\end{figure}
It's extra-convenient if we replace $0$ and $1$ with $1$ and $-1$;
then the \emph{dot product} of two vectors measures their closeness
(the bigger the dot product, the closer). This motivates the Great
Notational Switch we'll use 99\% of the time.
\bigskip
\begin{center}
{\bf Great Notational Switch:} \qquad 0/False $\rightarrow +1$,
\quad 1/True $\rightarrow -1$.
\end{center}
\noindent We think of $+1$ and $-1$ here as \emph{real numbers}. In
particular, we now have:
\begin{center}
Addition (mod 2) $\rightarrow$ Multiplication (in $\R$).
\end{center}
\noindent Under this notation, a generic boolean function is written
$\fisafunc$, and the Parity-on-bits-$S$ function is $\chi_S : \bn
\to \bits$ given by
\[
\chi_S(x) = \prod_{i \in S} x_i.
\]
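In the $\pm 1$ convention, $\chi_S$ is literally a product of signs.
A tiny Python sketch (coordinates $0$-indexed, names ours):
\begin{verbatim}
from math import prod  # Python 3.8+

def chi(S, x):
    # Parity on the coordinates in S, in the +-1 convention.
    return prod(x[i] for i in S)

print(chi({0, 2}, (-1, 1, -1)))  # (-1) * (-1) = 1
\end{verbatim}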
We now have:
\begin{fact} The dot product of $f$ and $\chi_S$, as vectors in
$\bits^{2^n}$, equals
\[
\text{(\# $x$'s such that $f(x) = \chi_S(x)$)} - \text{(\# $x$'s
such that $f(x) \neq \chi_S(x)$)}.
\]
\end{fact}
\begin{definition} For any $f, g : \bn \to \R$, we write
\begin{eqnarray*}
\la f, g\ra &=& \frac{1}{2^n}(\text{dot product of $f$ and $g$ as
vectors}) \\
& = & \avg_{\x \in \bn} [f(\x)g(\x)] = \Ex_{\x \in \bn}[f(\x)g(\x)].
\end{eqnarray*}
We also call this the \emph{correlation} of $f$ and
$g$\footnote{This doesn't agree with the technical definition of
correlation in probability, but never mind.}.
\end{definition}
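Numerically, this inner product is just an average over the $2^n$
points of the discrete cube; a brute-force Python sketch (names
ours):
\begin{verbatim}
from itertools import product

def inner(f, g, n):
    # <f, g> = E_x[f(x) g(x)], x uniform over {-1, +1}^n.
    pts = list(product([1, -1], repeat=n))
    return sum(f(x) * g(x) for x in pts) / len(pts)
\end{verbatim}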
\begin{remark} $\la \cdot , \cdot \ra$ is \emph{linear} in each of
its arguments; i.e., $\la f + g, h \ra = \la f, h \ra + \la g, h \ra$,
etc. This is just because dot-product has the same property: $(u +
v) \cdot w = u \cdot w + v \cdot w$, etc.
\end{remark}
\begin{fact}
If $f$ and $g$ are boolean-valued, $f, g : \bn \to \bits$, then $\la
f, g \ra \in [-1,1]$. Further, $f$ and $g$ are $\eps$-close iff
$\la f, g \ra \geq 1 - 2\eps$.
\end{fact}
Now in our linearity testing problem, given $\fisafunc$ we are
interested in the Parity function having maximum correlation with
$f$. Let's give notation for these correlations:
\begin{notation} For $S \subseteq [n]$, we write
\[
\hat{f}(S) = \la f, \chi_S \ra.
\]
\end{notation}
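Combining the \texttt{inner} and \texttt{chi} sketches above, the
Fourier coefficients become one-liners; the sanity checks below
preview the orthogonality proposition that comes next:
\begin{verbatim}
def fourier_coefficient(f, S, n):
    # hat f(S) = <f, chi_S>.
    return inner(f, lambda x: chi(S, x), n)

print(fourier_coefficient(lambda x: chi({0, 1}, x), {0, 1}, 3))  # 1.0
print(fourier_coefficient(lambda x: chi({0, 1}, x), {2}, 3))     # 0.0
\end{verbatim}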
Now with the switch to $-1$ and $1$, something interesting happens
with the $2^n$ Parity functions; they become orthogonal vectors:
\begin{proposition} If $S \neq T$ then $\chi_S$ and $\chi_T$ are orthogonal; i.e., $\la \chi_S, \chi_T \ra =
0$.
\end{proposition}
\begin{proof}
Let $i \in S \Delta T$ (the symmetric difference of these sets);
without loss of generality, say $i \in S \setminus T$. Pair up all
$n$-bit strings: $(x, x^{(i)})$, where $x^{(i)}$ denotes $x$ with
the $i$th bit flipped.
Now the vectors $\chi_S$ and $\chi_T$ look like this on
``coordinates'' $x$ and $x^{(i)}$:
\[
\chi_S = [ \qquad \qquad a \qquad -a \qquad \qquad ]
\]
\[
\chi_T = [ \qquad \qquad b \qquad \ \ \ \ \,b \qquad \qquad ]
\]
\[
\hspace{.8in} \nwarrow x \qquad \nwarrow x^{(i)}
\]
for some $a, b \in \bits$. In the inner product, these coordinates
contribute $ab - ab = 0$. Since we can pair up all coordinates like
this, the overall inner product is $0$.
\end{proof}
\begin{corollary}
The $2^n$ vectors $(\chi_S)_{S \subseteq [n]}$ form a
\emph{complete orthogonal basis} for $\R^{2^n}$.
\end{corollary}
\begin{proof}
We have $2^n$ mutually orthogonal nonzero vectors in a space of
dimension $2^n$.
\end{proof}
\begin{fact} If $\fisafunc$, $\text{``}\|f\|\text{''} =
\sqrt{\la f, f \ra} = 1$.
\end{fact}
\begin{corollary} The functions $(\chi_S)_{S \subseteq [n]}$ form an \emph{orthonormal
basis} for $\R^{2^n}$.
\end{corollary}
\noindent In other words, these Parity vectors are just a rotation
of the standard basis.
As a consequence, the most basic linear algebra implies that every
vector in $\R^{2^n}$ --- in particular, any $\fisafunc$ --- can be
written uniquely as a linear combination of these vectors:
\[
f = \sum_{S \subseteq [n]} c_S \chi_S \qquad \text{as vectors, for
some $c_S \in \R$}.
\]
Further, the coefficient on $\chi_T$ is just the length of $f$'s
projection onto $\chi_T$; i.e., $\la f, \chi_T\ra$:
\[
(\hat{f}(T) = )\quad \la f, \chi_T \ra = \la \littlesumx_S c_S
\chi_S, \chi_T \ra = \littlesumx_S c_S \la \chi_S, \chi_T \ra = c_T.
\]
I.e., we've shown:
\begin{theorem} Every function $f : \bn \to \R$ --- in particular,
every boolean-valued function $f : \bn \to \bits$ --- is uniquely
expressible as a linear combination (over $\R$) of the $2^n$ Parity
functions:
\begin{equation} \label{eqn:fourier}
f = \sum_{S \subseteq [n]} \hat{f}(S) \chi_S.
\end{equation}
(This is a pointwise equality of functions on $\bn$.)
The real numbers $\hat{f}(S)$ are called the \emph{Fourier
coefficients} of $f$, and~\eqref{eqn:fourier} the \emph{Fourier
expansion} of $f$.
\end{theorem}
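As a numerical check of the theorem, one can compute all $2^n$
coefficients by brute force and confirm the pointwise
equality~\eqref{eqn:fourier}; a Python sketch reusing the helpers
above (\texttt{maj} is our example function):
\begin{verbatim}
from itertools import combinations, product

def fourier_expansion(f, n):
    # All 2^n coefficients hat f(S), indexed by frozenset S.
    subsets = [frozenset(S) for k in range(n + 1)
               for S in combinations(range(n), k)]
    return {S: fourier_coefficient(f, S, n) for S in subsets}

n = 3
maj = lambda x: 1 if sum(x) > 0 else -1  # MAJ, +-1 convention
coeffs = fourier_expansion(maj, n)
for x in product([1, -1], repeat=n):
    # f(x) = sum over S of hat f(S) chi_S(x), at every point x.
    assert abs(maj(x) - sum(c * chi(S, x)
                            for S, c in coeffs.items())) < 1e-9
\end{verbatim}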
Recall that for boolean-valued functions $\fisafunc$, $\hat{f}(S)$
is a number in $[-1,1]$ measuring the correlation of $f$ with the
function Parity-on-$S$. In~\eqref{eqn:fourier} we have the property
that for every string $x$, the $2^n$ real numbers
$\hat{f}(S)\chi_S(x)$ ``magically'' always add up to a number that
is either $-1$ or $1$.
\subsection{Examples}
Here are some example functions and their Fourier transforms. In
the Fourier expansions, we will write $\prod_{i\in S}$ in place of
$\chi_S$. Recall that the explicit way to do these calculations is
via
\[
\hat{f}(S) = \Ex_{\x \in \bn}\bigl[f(\x) \littleprodx_{i \in S}
\x_i\bigr].
\]
\bigskip
\begin{center}
\begin{tabular}{c|c}
$f$ & Fourier transform\\
\hline\\
$f(x) = 1$ & $1$\\
&\\
$f(x) = x_i$ & $x_i$ \\
&\\
AND$(x_1, x_2)$ & $\half + \half x_1 + \half x_2 -
\half x_1 x_2$\\
& (i.e., $\hat{f}(\emptyset) = \frac12$, $\hat{f}(\{1\}) = \frac12$,
$\hat{f}(\{2\}) = \frac12$, $\hat{f}(\{1,2\}) =
-\frac12$)\\
&\\
MAJ$(x_1,x_2,x_3)$ & $\half x_1 + \half x_2 + \half x_3 - \half
x_1x_2x_3$\\
&\\
$f : $ \begin{tabular}{c|c}
$+++$ & $+$ \\
$++-$ & $-$ \\
$+-+$ & $+$ \\
$+--$ & $+$ \\
$-++$ & $-$ \\
$-+-$ & $-$ \\
$--+$ & $-$ \\
$---$ & $-$
\end{tabular}\ \ \ \ \ \ \ &
\begin{tabular}{c}
$\hat{f}(\emptyset) = -\frac14$ \\
$\hat{f}(\{1\}) = +\frac34$ \\
$\hat{f}(\{2\}) = -\frac14$ \\
$\hat{f}(\{3\}) = +\frac14$ \\
$\hat{f}(\{1,2\}) = -\frac14$ \\
$\hat{f}(\{1,3\}) = +\frac14$ \\
$\hat{f}(\{2,3\}) = +\frac14$ \\
$\hat{f}(\{1,2,3\}) = +\frac14$
\end{tabular} \\
& $f(x) = -\frac14 + \frac34 x_1 - \frac14 x_2 + \frac14 x_3 -
\frac14 x_1x_2 + \frac14 x_1x_3 +\frac14 x_2x_3 +\frac14 x_1x_2x_3$
\end{tabular}
\end{center}
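Any row of this table can be reproduced with the brute-force helpers
above; e.g., for AND (recall True $\to -1$ under the switch):
\begin{verbatim}
AND = lambda x: -1 if x == (-1, -1) else 1  # True = -1
for S in [frozenset(), frozenset({0}), frozenset({1}),
          frozenset({0, 1})]:
    print(sorted(S), fourier_coefficient(AND, S, 2))
# prints 0.5, 0.5, 0.5, -0.5, matching the table's AND row
\end{verbatim}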
\subsection{Parseval, Plancherel}
We will now prove one of the most important, basic facts about
Fourier transforms:
\begin{theorem} (``Plancherel's Theorem'') Let $f, g : \bn \to \R$.
Then
\[
\la f, g \ra = \Ex_{\x \in \bn}[f(\x)g(\x)] = \sum_{S \subseteq [n]}
\hat{f}(S) \hat{g}(S).
\]
\end{theorem}
This just says that when you express two vectors in an orthonormal
basis, their inner product is equal to the sum of the products of
the coefficients.
\begin{proof}
\begin{eqnarray*}
\la f, g \ra & = & \left\la \sum_{S \subseteq [n]} \hat{f}(S)
\chi_S, \sum_{T \subseteq [n]} \hat{g}(T) \chi_T\right\ra
\\
& = & \sum_{S}\sum_{T} \hat{f}(S)\hat{g}(T) \la \chi_S, \chi_T \ra
\qquad \text{(by
linearity of inner product)} \\
& = & \sum_{S} \hat{f}(S) \hat{g}(S)
\qquad\qquad\qquad\quad\text{(by orthonormality of $\chi$'s).}
\end{eqnarray*}
\end{proof}
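A quick numerical sanity check of Plancherel, for two random
boolean-valued functions on $3$ bits (again reusing the helpers
above):
\begin{verbatim}
import random
from itertools import product

pts = list(product([1, -1], repeat=3))
tf = {x: random.choice([1, -1]) for x in pts}
tg = {x: random.choice([1, -1]) for x in pts}
f, g = (lambda x: tf[x]), (lambda x: tg[x])
lhs = inner(f, g, 3)
rhs = sum(fourier_coefficient(f, S, 3) * fourier_coefficient(g, S, 3)
          for S in fourier_expansion(f, 3))
assert abs(lhs - rhs) < 1e-9  # <f, g> = sum_S hat f(S) hat g(S)
\end{verbatim}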
\begin{corollary} (``Parseval's Theorem'') Let $f : \bn \to \R$.
Then
\[
\la f, f \ra = \Ex_{\x \in \bn}[f(\x)^2] = \sum_{S \subseteq[n]}
\hat{f}(S)^2.
\]
\end{corollary}
This just says that the squared length of a vector, when expressed
in an orthonormal basis, equals the sum of the squares of the
coefficients. In other words, it's the Pythagorean Theorem.\\
One very important special case:
\begin{corollary} If $\fisafunc$ is a boolean-valued function,
\[
\sum_{S \subseteq [n]} \hat{f}(S)^2 = 1.
\]
\end{corollary}
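Continuing with \texttt{maj} from the earlier sketch:
\begin{verbatim}
# Parseval for boolean-valued f: the squared coefficients sum to 1.
print(sum(c ** 2 for c in fourier_expansion(maj, 3).values()))  # 1.0
\end{verbatim}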
\end{document}