function [alpha, lambda] = svm(G, y, C, params)
% function [alpha, lambda] = svm(G, y, C, params)
%
% Trains a support vector machine.
%
% G is the data matrix: if K is the kernel function and x_i is the
% i-th training example, then G(i,j) = K(x_i, x_j).
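%
% For example (a sketch, assuming the raw inputs are stored one per
% row in a matrix X), a linear kernel gives G = X * X', and the
% quadratic kernel used in the demo at the end of this file gives
% G = (1 + X * X').^2.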
%
% y is the vector of target outputs, with each entry +1 or -1.
%
% C is the relative weight of error vs. margin (large C means a low
% error is more important, while small C means a wide margin is more
% important).
%
% The last argument, params, is optional. If provided, its fields
% override the following defaults:
% params.iters: maximum iterations (default 75)
% params.verbose: print progress messages (default false)
% params.tol: convergence tolerance (default 1e-6)
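%
% For example, svm(G, y, C, struct('iters', 200, 'tol', 1e-8)) raises
% the iteration cap and tightens the convergence tolerance.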
%
% Returns the learned classifier parameters alpha and lambda. Alpha
% is a vector of weights on the original examples (those with positive
% weights are the support vectors), and lambda is a cutoff. Given a
% new example x, let k be the vector with entries k_i = y_i * K(x, x_i);
% then the predicted class for x is
%
% sign(alpha' * k - lambda)
%
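% In MATLAB terms (a sketch, assuming the linear kernel G = X * X'
% described above and a new input x given as a row vector):
%
%   k = y .* (X * x');                 % k_i = y_i * K(x, x_i)
%   predicted = sign(alpha' * k - lambda);
%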
% For example usage, see the end of the M-file (use 'which svm').
%
% Uses a primal log-barrier algorithm: each iteration takes a damped
% Newton step on the barrier-penalized dual problem, solving the
% linear system
%
%    hess * [dalpha; dlambda] == rhs
%
% for the search direction, where (up to barrier terms added to the
% diagonal of hess and to rhs during the iteration)
%
%    hess = [H -y; -y' 0];
%    rhs  = [1 - H*alpha + lambda*y; alpha'*y];
%    H    = G .* (y*y');
%
% The most expensive step in each iteration of this algorithm is
% solving an (n+1) by (n+1) linear system, where n is the size of the
% training set.
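% (With a dense factorization, that solve costs O(n^3) time per
% iteration.)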

% magic number: never step more than this fraction of the way to a
% constraint boundary
backoff = .99995;
n = length(y);
if (nargin < 4)
  params = struct;
end
if (~isfield(params, 'iters'))
  params.iters = 75;
end
if (~isfield(params, 'verbose'))
  params.verbose = false;
end
if (~isfield(params, 'tol'))
  params.tol = 1e-6;
end
% linear indices of the first n diagonal elements of an (n+1) by
% (n+1) matrix (used below to add the barrier terms to the Hessian
% diagonal)
mask = speye(n+1);
mask(end) = 0;
dg = find(mask);
% multiply class signs into data matrix (to save typing later, since
% we never use the original G by itself)
H = G .* (y * y');
% We will initialize alpha to the analytic center of its feasible
% region. The analytic center will have the components of alpha
% corresponding to y=1 equal to C*(1-p), and the ones corresponding to
% y=-1 equal to C*p, where p is the fraction of components with y=1.
p = sum(y>0) / n;
alpha = repmat(C * p, [n 1]);
alpha(y>0) = C * (1-p);
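% (sanity check: the n*p entries with y=1 contribute +C*(1-p) each to
% alpha'*y, and the other n*(1-p) entries contribute -C*p each, so
% this starting point satisfies the equality constraint alpha'*y == 0)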
% start with a relatively large barrier term (which will be reduced
% during the optimization), and start with the dual variable lambda=0.
mu = 10000;
lambda = 0;
% main optimization loop
for i = 1:params.iters
  % Build the Newton system, leaving out the barrier terms (which
  % we will add in below after preconditioning)
  hess = [H -y; -y' 0];
  grad = [1 - H*alpha + lambda*y; alpha'*y];
  % We will symmetrically precondition the Newton system by
  % multiplying the Hessian on either side with a diagonal matrix. We
  % do the preconditioning on the barrier terms analytically and add
  % them in last, since otherwise we may get a loss of precision.
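  % (The barrier term mu*(log(alpha) + log(C-alpha)) in the dual
  % objective has gradient mu*(1./alpha - 1./(C-alpha)) and curvature
  % mu*(1./alpha.^2 + 1./(C-alpha).^2); multiplying by the entries of
  % precond = alpha.*(C-alpha)/C^2 once for the gradient and twice for
  % the curvature gives exactly the terms added below.)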
  precond = [(alpha/C) .* (1-alpha/C); 1];
  hess = hess .* (precond * precond');
  hess(dg) = hess(dg) + mu * ((C-alpha).^2 + alpha.^2)/C^4;
  rhs = grad .* precond + (mu/C) * [1 - 2*alpha/C; 0];
  % compute the search direction, then undo the preconditioning
  searchdir = hess \ rhs;
  searchdir = searchdir .* precond;
  % compute the maximum possible step size (to keep alpha in [0,C]);
  % steplen first holds the largest of the ratios -adir./alpha and
  % adir./(C-alpha), whose reciprocal is the largest feasible step
  adir = searchdir(1:end-1);
  steplen = max(-adir ./ (alpha+1e-15));
  steplen = max(steplen, max(adir ./ (C-alpha+1e-15)));
  if (steplen <= 0)
    steplen = Inf;
  else
    steplen = 1 / steplen;
  end
  % back off from the max step size according to how far it would have
  % taken us past the boundary
  steplen = min(1, .666 * steplen + (backoff - .666) * steplen^2);
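  % (for a small maximum step this moves about 2/3 of the way to the
  % boundary, while a maximum step of 1 or more lets through nearly
  % the full undamped Newton step)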
  % do the step, pick the next barrier weight mu, and report
  alpha = alpha + steplen * adir;
  lambda = lambda + steplen * searchdir(end);
  oldmu = mu;
  update = sum(abs(searchdir));
  if (steplen > .995)
    mu = mu * .1;
  elseif (steplen > .95)
    mu = mu * .3334;
  end
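  % (a near-unit step suggests the current barrier subproblem is
  % close to solved, so it is safe to shrink the barrier weight)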
  if (params.verbose)
    fprintf('Iteration %3d: step %8.6f, next_mu %8.6f, update %8.6f\n', ...
            i, steplen, mu, update);
  end
  if (oldmu + update < params.tol/2)
    break;
  end
  % avoid letting the barrier weight get too close to zero
  if (mu < params.tol / 10)
    mu = params.tol / 10;
  end
end
if (oldmu + update > params.tol)
  warning('SVM:convergence', 'failed to converge');
end
return
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Example usage
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% generate some training data: inputs are random points in the
% interval [-2,2], target concept is "in [-1,1]", target outputs are
% noisy realizations of the target concept.
noise = 0.3;
xs = sort(rand(100,1)*4-2);
y = (xs.^2 - 1 + noise*randn(size(xs))) < 0;
y = -1 + 2*y;    % map the logical {0,1} labels to {-1,+1}
plot(xs,y,'x')
% learn a quadratic SVM
G = (1 + xs * xs') .^2;
[alpha, lambda] = svm(G, y, 3, struct('verbose', true));
% extract the support vectors and their weights
mask = alpha>1e-5;
sv = xs(mask);
sw = alpha(mask) .* y(mask);
% build and plot the learned quadratic classifier: for this kernel,
% alpha' * k expands to a*x^2 + b*x + c, so the decision boundary is
% where this parabola crosses the horizontal line at height lambda
a = sum(sw.*(sv.^2));
b = sum(2*sv.*sw);
c = sum(sw);
plot(xs, c + b*xs + a*xs.^2, 'x', xs, lambda*ones(size(xs)))
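% a quick sketch of a sanity check: evaluate the learned classifier on
% the training inputs and report the training error
preds = sign(c + b*xs + a*xs.^2 - lambda);
fprintf('training error: %g\n', mean(preds ~= y));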