function[mu, s, w] = emcluster(data, clusters, ploton); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % constants iters = 1000; eps = 0.0001; badValue = 100; end_condition = 1; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % initialize variables [rows,cols] = size(data); mus = rand(clusters, cols); weights = rand(clusters, 1); classes = rand(rows, clusters); s = cell(clusters); likelihood = 0; hoods = ones(1, iters); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % set up s as a cell arrays of daig cov matrixes for i=1:clusters cov = rand(1, cols); s{i} = diag(cov); end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % fix the data/missing data % make it just average avgs = ones(cols, 1); for j=1:cols sum = 0; count = 0; for i=1:rows x = data(i,j); if (x < badValue) sum = sum + x; count = count + 1; end end avgs(j,1) = sum/count; end for j=1:cols for i=1:rows x = data(i,j); if (x > badValue) data(i,j) = avgs(j,1); end end end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% for iter=1:iters fprintf(1, 'iter=%d\n', iter); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %E step, gives us expected classes for i=1:rows X = data(i,:); for j=1:clusters top = 0; bot = 0; Sigma = s{j}; mu = mus(j,:); %multivariate normal p = mvnpdf(X,mu,Sigma); top = p*weights(j,:) ; for myK=1:clusters Sigma = s{myK}; mu = mus(myK,:); %multivariate normal p = mvnpdf(X,mu,Sigma); myGauss = p*weights(myK,:); bot = bot + myGauss; end Z = top/bot; classes(i,j) = Z; end end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %M step, gives us mu, cov and wieght % do the mu's mus_old = mus; for i=1:clusters top = 0; bot = 0; for j=1:rows top = top + classes(j,i)*data(j,:); bot = bot + classes(j,i); end mus(i,:) = top/bot; end % do the cov's for i=1:clusters top = 0; bot = 0; for j=1:rows X = data(j,:); Xsqr = (X - mus(i,:))'.^2; Xsqr = diag(Xsqr); top = top + classes(j,i)*Xsqr; bot = bot + classes(j,i); end s{i} = top/bot; for y=1:cols for z=1:cols if (y == z) if (s{i}(y,z) < eps) s{i}(y,z) = eps; end end end end end % do the weight's for i=1:clusters top = 0; bot = 0; for j=1:rows top = top + classes(j,i); end bot = rows; weights(i) = top/bot; end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % print cur log likelihood (should be montonic increasing) likelihood = 0; for i=1:rows cluster_sum = 0; for j=1:clusters X = data(i,:); Sigma = s{j}; mu = mus(j,:); cluster_sum = cluster_sum + weights(j,:)*mvnpdf(X,mu,Sigma); end likelihood = likelihood + log(cluster_sum); end hoods(1,iter) = likelihood; % check for mono if (iter > 1) if (hoods(1,iter) < hoods(1,iter-1)) disp ('not mono'); return end if (hoods(1,iter) - hoods(1,iter-1) <= end_condition) % end break; end end %%%%%%%%%%%%%%%% % debug debug = 0; if (debug == 1 && iter > 1) iter hoods(1,iter) hoods(1,iter) - hoods(1,iter-1) mus end end % done % do plotting, if neccesary if (ploton == 1) x = 1:iter; y = hoods(1:iter); plot(x,y); xlabel('Iteration'); ylabel('Log likelihood'); title('Log likelihood vs Iteration'); end % prepare output variables mu = mus; temp = ones(clusters, cols); for j=1:clusters temp(j,:) = diag(s{j})'; end s = temp; w = weights;