\documentclass[12pt]{article}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{epsfig, graphics}
\usepackage{latexsym}
\usepackage{fullpage}
\usepackage[parfill]{parskip}
\usepackage{mysymbols}
\title{10708 Graphical Models: Homework 2\\
{\small Due October 15th, beginning of class}
}
\date{October 1, 2008}
\begin{document}
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
{\bf Instructions}: There are six questions on this assignment. Each
question has the name of one of the TAs beside it, to whom you
should direct any inquiries regarding the question. Please submit
your homework in two parts, one for each TA. Also, please put each
TA's questions in the same order they appeared in the homework. The
last problem involves coding, which should be done in MATLAB. Do
{\it not} attach your code to the writeup. Instead, copy your
implementation to
\begin{center}
\begin{verbatim}
/afs/andrew.cmu.edu/course/10/708/Submit/your_andrew_id/HW2
\end{verbatim}
\end{center}
Refer to the web page for policies regarding collaboration, due dates, and extensions.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Q1
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{I-equivalence {\small [20 pts] [Amr]}}
Let $\mathcal{G}_1$ and $\mathcal{G}_2$ be two graphs over $\mathcal{X}$. In this question
we will explore when $\mathcal{G}_1$ and $\mathcal{G}_2$ are I-equivalent.
\begin{enumerate}
\item \text{[3 pts]} Prove that two network structures ${\cal G}_1$ and ${\cal G}_2$ are I-equivalent if the
following two conditions hold:
\begin{enumerate}
\item The two graphs have the same set of trails, and
\item A trail is active in ${\cal G}_1$ iff it is active in ${\cal G}_2$.
\end{enumerate} (\textit{Hint:} Use the notion of d-separation.)
\label{item:iequiv}
\item \text{[3 pts]} Prove that if ${\cal G}_1$ and ${\cal G}_2$ have the same skeleton and the same set of v-structures then they are I-equivalent. (\textit{Hint:} Use the result from part~\ref{item:iequiv}.)
\label{item:2}
\item \text{[2 pts]} Can part \ref{item:2} be extended to an if and only if statement? If
so, prove the other direction. If not, provide an example of two
I-equivalent graphs $\mathcal{G}_1$ and $\mathcal{G}_2$ that have
the same skeleton, but different v-structures.
\end{enumerate}
Your answers to the above questions should convince you that having the same v-structures, although
sufficient, is not
necessary for I-equivalence. In the following parts, you will provide a condition that precisely
relates I-equivalence and similarity of network structures. We begin with a few definitions
you will need:
\begin{defn}[Minimal Active Trail]
\label{def:min-at} Consider an active trail $T=X_1, X_2, \ldots,
X_m$. We call this active trail \emph{minimal} if no subset of the
nodes in $T$ forms an active trail between $X_1$ and $X_m$. In other
words, $T$ is minimal if no other active trail between $X_1$ and
$X_m$ ``shortcuts'' any of the nodes in $T$.
\end{defn}
\begin{defn}[Triangle]
\label{def:tri}
Consider a trail $T=X_1, X_2, \ldots, X_m$. We call
any three consecutive nodes in the trail a \emph{triangle} if their
undirected skeleton is fully connected (i.e., forms a 3-clique). In
other words, $X_{i-1}, X_i, X_{i+1}$ form a triangle if we have
$X_{i-1} \rightleftharpoons X_i \rightleftharpoons X_{i+1}$ and
$X_{i-1} \rightleftharpoons X_{i+1}$.
\end{defn}
\begin{enumerate}
\setcounter{enumi}{3}
\item \text{[3 pts]} Prove that the only possible triangle in a minimal active trail is
one where $X_{i-1} \leftarrow X_i \rightarrow X_{i+1}$, with an edge
between $X_{i-1}$ and $X_{i+1}$, and where either $X_{i-1}$ or
$X_{i+1}$ is the center of a v-structure in the trail.
(\textit{Hint:} prove by cases.)
\item \text{[4 pts]} Consider two networks $\mathcal{G}_1$ and $\mathcal{G}_2$ that have
the same skeleton and same immoralities. Prove, using the notion of
minimal active trail, that $\mathcal{G}_1$ and $\mathcal{G}_2$ imply
precisely the same conditional independence assumptions, i.e., that
if $X$ and $Y$ are d-separated given $\mathbf{Z}$ in
$\mathcal{G}_1$, then $X$ and $Y$ are also d-separated given
$\mathbf{Z}$ in $\mathcal{G}_2$. (\textit{Hint:} prove by
contradiction.)
%(\textit{Hint:} prove by contradiction.)
\item \text{[5 pts]} Finally, prove that two networks $\mathcal{G}_1$ and $\mathcal{G}_2$
that induce the same conditional independence assumptions must have
the same skeleton and the same immoralities. (\textit{Hint:} prove by
contradiction.)
\end{enumerate}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Q2
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{P-maps, Minimal I-maps and PDAGs {\small [10 pts] [Amr]}}
In this question we will analyze what happens when you apply the
PDAG learning algorithm you implemented in HW1 to a distribution
that does not have a P-map. In this problem, use $d = n-2$.
Consider a probability distribution $\cal P$ over 4 variables,
$(X_1,Y_1,X_2,Y_2)$, that entails the following and only the
following independence assertions: $X_1 \perp X_2 | Y_1,Y_2$ and
$Y_1 \perp Y_2 | X_1,X_2$. From class we know that this distribution
does not have a P-map.
\ben
\item \text{[1 pt]}
Draw the skeleton and final P-DAG resulting from applying the PDAG
learning algorithm, from HW1, using the above independence
assertions.
\item \text{[2 pts]}
Given a P-DAG, we can obtain a DAG (i.e. a Bayesian network)
consistent with it by repeating the following two steps until all
edges are directed: 1) Randomly directing an undirected edge and 2)
Propagating the constraints enforced by the new directed edge to
avoid creating extra immoralities or cycles. How many DAGs can you
obtain from the skeleton learnt in part 1, if any? Either enumerate
all of them or explain why you cannot obtain any DAG.
\item \text{[2 pts]}
While $\cal P$ does not have a P-map, it still has minimal
I-map(s). Draw the minimal I-map for $\cal P$ using each of the
following orders of adding variables to the graph:
\begin{itemize}
\item $X_1, Y_1, Y_2, X_2$
\item $X_1, Y_1, X_2, Y_2$
\end{itemize}
Are these minimal I-maps I-equivalent, and why?
\item \text{[1 pt]}
What is the relationship between the skeleton obtained in 2.1
and the underlying skeletons of the DAGs obtained in 2.3?
\item \text{[4 pts]}
Prove the following statement or provide a counterexample: an edge $W-V$ will appear in the skeleton produced by
\emph{build-skeleton} if and only if $W \leftrightharpoons V$
appears in all minimal I-maps of $\cal P$. (\emph{Hint}: each
minimal I-map is associated with an ordering of adding variables to
the graph)
\een
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Q3
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Decomposable Scores {\small [10 pts] [Dhruv]}}
Decomposable scoring functions are those where the score of a network given data ${\cal D}$ can be represented as the sum of scores of each node given its parents and the data:
\begin{align*}
\textrm{score}({\cal G} : {\cal D}) = \sum_i \textrm{FamScore}(X_i | {\bf Pa}_i^{\cal G} : {\cal D})
\end{align*}
In greedy structure search we explore the space of structures by applying a local operator to an existing Bayes net. Examples of local operators include adding an edge, deleting an edge, and reversing an edge. In this question you will show that if the scoring function is decomposable, then the change in score caused by a local operator can be computed efficiently.
\begin{enumerate}
\item \text{[5 pts]} Prove proposition 17.4.6 (Koller \& Friedman).
\item \text{[5 pts]} Prove proposition 17.4.7 (Koller \& Friedman).
\end{enumerate}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Q4
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Learning Edge Directions {\small [15 pts] [Amr]}}
In this question, we consider a simpler form of structure learning for
BNs: Assume we have a skeleton and want to build a BN from it. For each
edge, we want to either assign a direction to this edge or delete it
from the graph. For this problem, you can assume you are using some
decomposable score, $\textrm{FamScore}(X_i|{\bf Pa}_{X_i})$.
\ben
\item \text{[2 pts]}
Consider the skeleton $X_1 - X_2 - X_3$. What are the possible BNs that we are
considering in this problem? What is the score of each of the graphs?
\item \text{[3 pts]}
Now, consider the skeleton $X_1 - X_2 - X_3 - X_4$. Does the decision
about the edge $X_1 - X_2$ affect the family score of $X_3$? Justify your
answer.
\item \text{[10 pts]}
Using the intuitions above, design a linear-time dynamic programming
algorithm for finding the optimal BN from a chain skeleton
$X_1 - X_2 - X_3 - \cdots - X_n$. \een
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Q5
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Greedy Structure Search {\small [10 pts] [Amr]}}
Suppose we have a general network structure search algorithm, $A$, that takes a set of basic
operators on network structures as a parameter. This set of operators
defines the search space for $A$, as it defines the candidate network
structures that are the ``immediate successors'' of any current candidate
network structure, i.e., the successor states of
any state reached in the search. Thus, for example, if the set of operators is
$[$add an edge not currently in the network$]$, then the successor states of any
candidate network $\mathcal{G}$ are the structures obtained by adding
a single edge anywhere in $\mathcal{G}$ (so long as acyclicity is maintained).
\\
Given a set of operators, $A$ does a simple greedy search over the set of
network structures, starting from the empty network (no edges), using the
BIC scoring function. Now, consider two sets of operators we can use in $A$. Let
$A_{[add]}$ be $A$ using the set of operations $[$add an edge not currently in
the network$]$, and let $A_{[add,delete]}$ be $A$ using the set of operations $[$add
an edge not currently in the network, delete an edge currently in the
network$]$.
\ben
\item \text{[5 pts]}
Show a distribution where, regardless of the amount of data in our
training set (i.e., even with infinitely many samples), the answer
produced by $A_{[add]}$ is worse (i.e., has a lower BIC score) than the
answer produced by $A_{[add,delete]}$. (It's easiest to
represent your true distribution in the form of a Bayesian network, i.e.,
a network from which sample data is generated.)
\item \text{[5 pts]}
Show a distribution where, regardless of the amount of data in our training set,
$A_{[add,delete]}$ will converge to a local maximum. In other words, the answer
returned by the algorithm has a lower score than the optimal (highest-scoring) network.
What can we conclude about the ability of our algorithm to find the optimal
structure?
\een
\textbf{Note:} In 5.1 and 5.2, we are looking for \emph{a full
specification} of each distribution in terms of the graph structure
and the local CPTs. Please also use your \emph{fully} specified
distributions to justify the behavior in 5.1 and 5.2.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Q6
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Tree-Augmented Na\"{i}ve Bayes {\small [35 pts] [Dhruv]}}
In many classification tasks na\"{i}ve Bayes is either competitive
with, or is, the best method, even though na\"{i}ve Bayes ignores
dependencies between features. This seems to be an argument against
structure learning, until one realizes that most structure learning
methods are trying to model the joint distribution, which does not
necessarily correspond to a good estimate of the class-conditional
distribution.
Tree-Augmented Na\"{i}ve Bayes (TAN) is a model that augments na\"{i}ve
Bayes by adding correlations between features such that each feature
has the class node, and one other feature, as parents.
Figure~\ref{fig:tan} is an example of a TAN model. The advantage of TAN over
NB is that
NB assumes full conditional independence for the features, which can
hurt in some cases by double counting information. A TAN model can
reduce some double counting problems by modeling correlation
between features.
\begin{figure}[!h]
\begin{center}
\includegraphics[bb= 0 0 700 500,scale=0.17]{nb.pdf}
\caption{An example of na\"{i}ve Bayes.}
\label{fig:nb}
\includegraphics[bb=0 0 300 200,width=2in]{tan.pdf}
\caption{An example of tree-augmented na\"{i}ve Bayes. Note that the
induced graph on the evidence variables $(w,x,y,z)$ forms a tree.}
\label{fig:tan}
\end{center}
\end{figure}
The algorithm for learning TAN models is a variant of the Chow-Liu algorithm for learning tree-structured Bayes nets. Let $C$ represent the class variable, and $\{X_i\}_{i=1}^{n}$ be the features (non-class variables).
\begin{enumerate}
\item Compute the conditional mutual information given $C$ between each pair of distinct variables, $$I(X_i;X_j | C) = \sum_{x_i,x_j,c}\tilde{P}(x_i,x_j,c)\log\frac{\tilde{P}(x_i,x_j|c)}{\tilde{P}(x_i|c)\tilde{P}(x_j|c)}$$ where $\tilde{P}(\cdot)$ is an empirical distribution (computed using the training data). Intuitively, this quantity represents the gain in information of adding $X_i$ as a parent of $X_j$ given that $C$ is already a parent of $X_j$.
\item Build a complete undirected graph on the features $X_1,\ldots,X_n$ where the weight of the edge between $X_i$ and $X_j$ is $I(X_i;X_j|C)$. Call this graph ${\cal G}_F$.
\item Find a maximum weighted spanning tree\footnote{Kruskal's or Prim's algorithm can be used to find a maximum weighted spanning tree. It is okay to use external implementations, but only for finding the maximum spanning tree.} on ${\cal G}_F$. Call it ${\cal T}_F$.
\item Pick an arbitrary node in ${\cal T}_F$ as the root, and set the direction of all the edges in ${\cal T}_F$ to be outward from the root. Call the directed tree ${\cal T}'_F$. (\textit{Hint:} Use DFS).
\item The structure of the TAN model consists of a na\"{i}ve Bayes model on $C,X_1,\ldots,X_n$ augmented by the edges in ${\cal T}'_F$.
\end{enumerate}
The task is to learn a boolean function, classifying input features as class 0 or 1. The data
can be found in \texttt{corral.mat}, where each row is an instance containing six boolean features,
and the last column is the class. The data is generated in \texttt{gen\_corral\_data.m}, if you
are interested in knowing the true mapping.
\subsection{Structure Learning {\small [20 pts]}}
\label{sec:tan:struct}
Implement the above algorithm for learning the structure of a TAN model, and submit your code as \texttt{tanstruct.m}. Using the corral data, draw the structure (directed acyclic graph) produced using this algorithm in your writeup.
\subsection{Representation}
In this part you will implement the representation of a general Bayesian network. Using this representation, you will learn parameters of this model, and perform a simple inference
step\footnote{Do not implement variable elimination.} to perform classification in the next question.
{\it Hint}: The steps you should take in implementing this are as follows:
\begin{enumerate}
\item A data structure to represent a {\it factor}, a mapping from an assignment of variables to a real value. Conditional probability tables can be viewed as factors. For example, in Figure~\ref{fig:nb}, the conditional probability table for $W$ would map the assignment ($\textrm{W = 1}, \; \textrm{C = 1}$) to some value $c$. The easiest way to encode a factor is as a multidimensional array where each dimension corresponds to a variable. See \texttt{table\_factor.m}.
\item A data structure to represent a Bayesian network. The easiest way to do this is just to store a list of all the conditional probability tables as factors.
\item A data structure that represents an assignment to variables. The easiest way to do this is as a pair of vectors: $\mathit{vars}$ and $\mathit{vals}$. Note that $\mathit{vals}(i)$ is the value assigned to variable $\mathit{vars}(i)$. See \texttt{assignment.m}.
\item A function (\texttt{assign\_prob.m}) that takes a Bayesian network and an assignment to all the variables, and returns the probability of that assignment.
\end{enumerate}
{\it Note}: You will not receive full marks for an implementation that stores the full joint distribution explicitly.
{\it Note 2}: There is nothing to report in this part. You will use (and upload) this code as part of
the next question.
\subsection{Learning and Classification {\small [15 pts]}}
In this question you will compare the classification accuracy of na\"{i}ve Bayes and TAN. Perform
a variant of leave-one-out cross-validation (LOOCV), that is, hold out one instance,
learn the structure and parameters from a random subset (of size $m$) of the rest,
and then classify this datapoint.
\begin{enumerate}
\item Learn the structure of a TAN model and estimate the parameters using the following smoothing estimator. For the parameter corresponding to $P(x_i | {\bf Pa}_i)$ estimate it using
\begin{align*}
\theta_{x_i | {\bf Pa}_i} &= \alpha \tilde{P}(x_i|{\bf Pa}_i) + (1 - \alpha) \tilde{P}(x_i) \\
\alpha &= \frac{m\tilde{P}({\bf Pa}_i)}{m\tilde{P}({\bf Pa}_i) + s}
\end{align*}
where $s$ is a smoothing parameter. For this question, use $s = 5$. This is known as back-off smoothing.
\item Learn a na\"{i}ve Bayes model and estimate the parameters using back-off smoothing.
\item Compare the classification accuracy of na\"{i}ve Bayes and TAN on the test set.
{\it Note}: In order to
classify, you will need to compute $P(C | X_1, X_2, \ldots, X_n)$ where \{$X_i$\} are the features. We will
develop machinery for general inference queries in the next homework. For this question, you should
use Bayes' rule and \texttt{assign\_prob.m} to get the answer to this specific query. Do NOT implement
any general inference algorithm like variable elimination.
\end{enumerate}
Plot the classification accuracy (for the two models) as a function of $m$, the size of the random subset
of training data (vary $m$ from 10 to 63). Submit the code used to run these experiments as \texttt{tancompare.m}. Comment on the trend observed in the plot. Specifically, do you expect TAN to always (meaning, on all datasets) outperform NB?
\end{document}