function theta = f(features, observed, inittheta)
% theta = f(features, observed, inittheta)
%
% Fit a discrete exponential family density to some observed
% data by gradient ascent on the log likelihood. Return the
% parameter theta.
%
% Discrete means we assume our observed data comes from a
% multinomial (that is, that the exponential family is a
% submanifold of the set of multinomial distributions with n
% outcomes, for some n). n can't be too large since we loop
% over it.
%
% Uses the identity that the gradient of the log likelihood is
% the difference between observed and expected values of the
% sufficient statistics; since we step in the direction of that
% gradient, this is ascent on the likelihood (equivalently,
% descent on the negative log likelihood).
%
% The probability model is that observation i has probability
% exp(theta . f_i) / Z(theta), where f_i is the i'th row of the
% feature matrix and Z(theta) is a normalizing constant.
%
% Note this code could use some help in (at least) the
% following areas:
% - adaptive learning rate
% - better termination criterion
% - warn the caller when maxiter is hit without convergence
%
% Args are:
% 1. Feature (or sufficient statistic) matrix. One column per
%    parameter, one row per possible outcome. An example might
%    be [x x.^2] where x = (-3:.1:3)'; this example would fit
%    a discrete truncated Gaussian.
% 2. Observations. One column per parameter. Entry is
%    average value of corresponding feature in training data.
%    That is, if the empirical frequencies are p0, this is
%    features'*p0. In our truncated Gaussian example, we
%    would have [mean; mean^2+variance].
% 3. [optional] Initial value for parameters theta.

% a few parameters
epsilon = 1e-8;   % accuracy requirement for solution (L1 change in pr)
lrate = .2;       % learning rate for the gradient step
maxiter = 100;    % stop after this many iterations

[n, m] = size(features);
% default the initial parameter vector to zeros (uniform distribution)
if (nargin < 3)
  inittheta = zeros(m, 1);
end
theta = inittheta;
iter = 0;
pr = zeros(n,1);
while (iter < maxiter)
  % compute probability distribution corresponding to theta,
  % taking care to avoid overflow/underflow by subtracting the
  % max log-probability before exponentiating
  oldpr = pr;
  logpr = features * theta;
  logpr = logpr - max(logpr);
  pr = exp(logpr);
  pr = pr / sum(pr);
  % compute features we should observe if theta were true;
  % from that compute gradient of the log likelihood wrt theta
  expected = features' * pr;
  grad = observed - expected;
  % gradient step (ascent on log likelihood) and convergence test:
  % stop once the fitted distribution has essentially stopped moving
  theta = theta + lrate * grad;
  if (sum(abs(oldpr-pr)) < epsilon)
    break
  end
  iter = iter + 1;
  % progress trace: iteration, current theta, current expected features
  fprintf('%3d:', iter);
  fprintf(' %8.3f', theta);
  fprintf('    ');
  fprintf(' %8.3f', expected);
  fprintf('\n');
end
plot(pr)