% Generates a random list of category tuples based on the category sizes.
% This is used to generate random datasets, each with a small number of
% roughly equal-sized categories, for clustering experiments from a large
% dataset of many categories.
% 
% Input:
% labels - the vector of labels of the instances in the original dataset
% numcats - the number of categories per dataset
% listsize - the number of datasets you want to have in the list
% maxratio - largest:smallest category size ratio in any generated dataset
%
% Output:
% list - matrix of the generated datasets; each row is a dataset
% sizes - same dimensions as list, holding the sizes of the corresponding categories
% 
% Author: Frank Lin (frank@cs.cmu.edu)

function [list,sizes]=x_randcats_list(labels,numcats,listsize,maxratio)
% X_RANDCATS_LIST  Randomly sample tuples of distinct, similarly sized categories.
%
%   [list,sizes] = x_randcats_list(labels,numcats,listsize,maxratio)
%
%   labels   - vector of positive integer category labels, one per instance;
%              categories are taken to be 1..max(labels)
%   numcats  - number of distinct categories in each tuple
%   listsize - number of distinct tuples to return
%   maxratio - a tuple is accepted only if
%              (largest category size)/(smallest category size) <= maxratio
%
%   list     - listsize-by-numcats matrix; each row holds the category ids
%              of one tuple in ascending order, and no two rows are equal
%   sizes    - listsize-by-numcats matrix; sizes(i,j) is the number of
%              instances labelled list(i,j)
%
%   Categories with no instances are never selected (their size ratio is
%   Inf or NaN, which fails the maxratio test).
%
%   NOTE: the function keeps sampling until listsize distinct tuples have
%   been found, so it does not terminate if fewer than listsize distinct
%   tuples satisfy maxratio.

if isempty(labels)
    error('x_randcats_list:emptyLabels','labels must not be empty.');
end

totalcats=max(labels(:));

% A tuple needs numcats distinct categories; without this guard the
% sampling loop below could never accept a candidate and would spin forever.
if numcats>totalcats
    error('x_randcats_list:tooManyCats', ...
        'numcats (%d) exceeds the number of categories (%d).', ...
        numcats,totalcats);
end

fprintf('loading category sizes...\n');
% Count instances per label id. accumarray keeps counts aligned with ids
% 1..totalcats even when some ids (including 1) never occur, whereas
% hist(labels,totalcats) spaces its bins between min(labels) and max(labels).
catsizes=accumarray(double(labels(:)),1,[totalcats 1])';

list=zeros(listsize,numcats);

count=0;
while count<listsize
    fprintf('generating permutations...\n');
    % Each column is an independent random permutation of all category
    % ids, so each row is a random candidate tuple (possibly with repeats).
    cands=zeros(totalcats,numcats);
    for i=1:numcats
        cands(:,i)=randperm(totalcats)';
    end
    for i=1:totalcats
        cand=unique(cands(i,:));   % sorted; shorter than numcats if repeats
        candsizes=catsizes(cand);
        if length(cand)==numcats&&max(candsizes)/min(candsizes)<=maxratio
            count=count+1;
            list(count,:)=cand;
        end
    end

    if count>=listsize
        % Drop duplicate tuples but keep the survivors in the order they
        % were generated. Sorting here (plain unique) followed by the final
        % truncation would favour lexicographically small tuples.
        [~,firstidx]=unique(list(1:count,:),'rows','first');
        list=list(sort(firstidx),:);
        count=size(list,1);
    end
end

list=list(1:listsize,:);
% reshape keeps sizes the same shape as list even when list is a column
% vector (numcats==1), where plain indexing would return a row.
sizes=reshape(catsizes(list),size(list));

end