function [theta, alpha] = trainPMM(X, K, theta0, lambda, thetaInit, alphaInit, numEMIters)
% TRAINPMM  Fit theta and alpha by running a fixed number of EM iterations.
%
% Inputs
%   X          : n x V matrix (one row per data point).
%   K          : number of classes.
%   theta0     : prior parameter, forwarded unchanged to emPMM (used there in
%                the theta update).
%   lambda     : regularisation weight, forwarded unchanged to emPMM (used in
%                the alpha update).
%   thetaInit  : K x 1 starting value for theta.
%   alphaInit  : K x V starting value for alpha.
%   numEMIters : (optional) number of EM iterations to run. Defaults to 10
%                when omitted or empty.
%
% Outputs
%   theta : K x 1 vector of class probabilities.
%   alpha : K x V matrix.
%
% Side effect: prints 'EM Iter: <i>' to the console before each iteration.

  % Fall back to the historical fixed iteration count when not supplied.
  if nargin < 7 || isempty(numEMIters)
    numEMIters = 10;
  end

  % Start from the supplied initial values and refine them in place; each
  % call to emPMM consumes the previous estimates and returns new ones.
  theta = thetaInit;
  alpha = alphaInit;
  for emIter = 1:numEMIters
    fprintf('EM Iter: %d\n', emIter);
    [theta, alpha] = emPMM(X, K, theta0, lambda, theta, alpha);
  end

end


% This function performs a single EM iteration (one E-step followed by one M-step).
function [theta, alpha] = emPMM(X, K, theta0, lambda, thetaPrev, alphaPrev)
% EMPMM  One EM iteration: compute responsibilities, then update theta, alpha.
%
% Inputs
%   X         : n x V data matrix.
%   K         : number of classes.
%   theta0    : prior parameter entering the theta update as theta0 + S' - 1.
%   lambda    : regularisation weight passed to newtonRaphsonPMM.
%   thetaPrev : K x 1 class probabilities from the previous iteration.
%   alphaPrev : K x V alpha matrix from the previous iteration.
%
% Outputs
%   theta : K x 1 updated class probabilities (normalised to sum to one).
%   alpha : K x V updated alpha matrix, one row per class.

  n = size(X, 1);
  V = size(X, 2);

  % ---------------------------------------------------------------- E-step
  % Per-class log-likelihood of every data point under the previous alphas.
  classLogLs = zeros(n, K);
  for k = 1:K
    classLogLs(:, k) = classLogLikelihoods(X, alphaPrev(k, :));
  end

  % Adding the log class prior gives the log joint of (data point, class).
  classLogJoints = bsxfun(@plus, classLogLs, log(thetaPrev'));

  % Subtract each row's maximum before exponentiating so the largest term is
  % exp(0) = 1; this avoids underflow and leaves the normalised result
  % unchanged.
  rowMax = max(classLogJoints, [], 2);
  shiftedJoints = exp(bsxfun(@minus, classLogJoints, rowMax));

  % R(i, k): responsibility of class k for data point i (rows sum to one).
  R = bsxfun(@rdivide, shiftedJoints, sum(shiftedJoints, 2));

  % S(k): total responsibility of class k. The dimension is given explicitly
  % because sum(R) would collapse a single-row R (n == 1) to a scalar.
  S = sum(R, 1);

  % ---------------------------------------------------------------- M-step
  % theta: unnormalised update theta0 + S' - 1, then normalise.
  % NOTE(review): presumably the MAP estimate under a Dirichlet(theta0)
  % prior - confirm. Entries can be negative if theta0 + S' < 1.
  theta = theta0 + S' - 1;
  theta = theta / sum(theta);

  % alpha: solve for each class's row independently with Newton-Raphson,
  % weighting the data points by that class's responsibilities.
  alpha = zeros(K, V);
  for k = 1:K
    alpha(k, :) = newtonRaphsonPMM(X, R(:, k), S(k), lambda);
  end

end


% This function implements Newton's Method.
function [alphak] = newtonRaphsonPMM(X, Rk, Sk, lambda)
% NEWTONRAPHSONPMM  Update one class's alpha vector with 10 Newton steps.
%
% Inputs
%   X      : n x V data matrix.
%   Rk     : n x 1 responsibilities of this class for each data point.
%   Sk     : scalar, passed by the caller as the sum of Rk.
%   lambda : weight of the quadratic penalty; contributes -2*lambda*alphak
%            to the gradient and -2*lambda to the Hessian diagonal.
%
% Output
%   alphak : 1 x V alpha vector after the final Newton step.
%
% NOTE(review): nothing here keeps alphak strictly positive. A zero entry in
% the initial point, or a Newton step that overshoots, feeds psi a zero or
% negative argument - confirm the inputs keep the iterates in range.

  numNRIters = 10; % Just use 10 iterations of NR
  m = sum(X, 2);   % row totals of X (n x 1)

  % Initial point: responsibility-weighted column sums of X, normalised to
  % sum to one. The dimension is given explicitly because the default sum
  % would collapse a single-row X (n == 1) to a scalar instead of 1 x V.
  initPt = sum(bsxfun(@times, X, Rk), 1);
  initPt = initPt / sum(initPt);

  nrProgress = zeros(numNRIters, 1);
  alphak = initPt; % alphak in the current iteration
  for nrIter = 1:numNRIters
    Ak = sum(alphak);
    XplusAlpha = bsxfun(@plus, X, alphak);

    % Gradient (1 x V). The first two terms are scalars shared by every
    % coordinate; the remaining terms are per-coordinate.
    g = Sk * psi(Ak) - Rk' * psi(m + Ak) + Rk' * psi(XplusAlpha) ...
        - Sk * psi(alphak) - 2 * lambda * alphak;

    % The Hessian has the form diag(D) + z * ones(V, V):
    %   z is the constant added to every entry,
    %   D is the additional diagonal part (1 x V).
    z = Sk * psi(1, Ak) - Rk' * psi(1, m + Ak);
    D = Rk' * psi(1, XplusAlpha) - Sk * psi(1, alphak) - 2*lambda;

    % Apply the inverse of diag(D) + z * ones(V, V) to g using the
    % Sherman-Morrison identity, so no V x V matrix is formed or inverted.
    gOverD = g ./ D;
    invD = 1 ./ D;
    Hinvg = gOverD - invD * sum(gOverD) / (1/z + sum(invD));

    % Full Newton step (step size 1).
    alphak = alphak - Hinvg;

    % DEBUG: responsibility-weighted log-likelihood after this step. Stored
    % for inspection only; it does not affect the returned value.
    nrProgress(nrIter) = Rk' * classLogLikelihoods(X, alphak);

  end

end

