% loadKBcsv(entityType, ncols, filename)
%
% Load a KB which has been saved in comma-separated values format, with first line specifying the
% fields and remaining lines specifying values (one entity per line).  entityType is a string name
% of the entity that is the generalization of all entities loaded from the KB.  ncols is an integer
% giving the number of columns in the CSV file (i.e., the number of slots defined for each entity).
% filename is the full path of the csv file to be loaded.  
%
% Returns: a cell array containing the names of the loaded entities
%
% Efficiency: requires 118 sec to load and create 10,000 entities
% with 9 slots each. 
%
% This function temporarily sets THEO.maintainDiskHierarchy to zero while
% loading and restores it afterwards.  When loading large files, also set
% THEO.maintainDiskKB to zero beforehand for efficiency.
%
% Example: following loads data from income.csv, assuming each
% example entity has 9 relevant slot values
%
% loadKBcsv('personWithIncome',9,'income.csv');
%
% Created 12/30/08 Tom
%

% entityType='personWithIncome'; ce('personWithIncome','everything'); ncols=9
% filename='/Users/tommitchell/Documents/MATLAB/Theo/TheoKBs/incomeRawData/income.tiny.10.discrete.txt'
% filename='/Users/tommitchell/Documents/MATLAB/Theo/TheoKBs/incomeRawData/income.train.10k.discrete.txt'
% filename='/Users/tommitchell/Documents/MATLAB/Theo/TheoKBs/incomeRawData/income.train.1k.discrete.txt'
% filename='/Users/tommitchell/Documents/MATLAB/Theo/TheoKBs/incomeRawData/income.train.250.discrete.txt'
% loadKBcsv(entityType, ncols, filename)


function loadedEntities = loadKBcsv(entityType, ncols, filename)
% Load entities from a CSV file whose first line names the slots and whose
% remaining lines each give the slot values of one entity.
%
%   entityType - string name of the entity that generalizes every loaded entity
%   ncols      - number of columns (slots) in the CSV file
%   filename   - path of the CSV file to read
%
% Returns a 1-by-N cell array with the names of the created entities
% (an empty cell array if the file contains no data rows).
%
% Raises an error if the file cannot be opened.  If loading fails part way
% through, THEO.maintainDiskHierarchy is restored before the error propagates.
global THEO
loadedEntities={};

% 1. read in the csv file. The result is a cell array C, where C{k}
% is a cell array containing the kth column in the file.
% The file is read before any global state is touched, so an I/O failure
% leaves THEO unchanged.
readstr=['%s', repmat(' %s',1,ncols-1)];

[fid, openMsg] = fopen(filename);
if fid < 0
  error('loadKBcsv:cannotOpenFile', 'Could not open file %s: %s', ...
        filename, openMsg);
end
try
  C = textscan(fid, readstr, 'delimiter',',');
catch readErr
  fclose(fid);  % do not leak the file handle when parsing fails
  rethrow(readErr);
end
fclose(fid);

% textscan can return columns of unequal length when the last line of the
% file is incomplete; only rows present in every column are usable.
colLengths=cellfun(@numel, C);
nRows=min(colLengths);
if any(colLengths ~= nRows)
  warning('loadKBcsv:raggedFile', ...
          'Last row of %s is incomplete and will be ignored.', filename);
end

% Number of entities represented by each progress dot; at least 1 so that
% small files still report progress and mod() never receives a zero divisor.
dotStep=max(1, round(nRows/50));
fprintf('loading %s\n Each ''.'' indicates loading of %d entities.\n', ...
        filename, dotStep);

% 2. for each row except the first (which gives slot names), create an
% entity and add its slot values
nEnts=nRows-1;
if nEnts > 0
  loadedEntities=cell(1,nEnts);
end

maintainDiskHierarchy_originalVal=THEO.maintainDiskHierarchy;
THEO.maintainDiskHierarchy=0; % don't update the html ontology page
                              % until the kb is fully loaded into memory
try
  for k=1:nEnts
    % the next two lines are a bit of a hack to avoid overwriting existing
    % entities. Avoiding such overwriting should actually be handled inside
    % createEntity?
    entNum=1+gv('maxGensymNum',entityType);
    pv('maxGensymNum',entityType,entNum);

    ent=[entityType, int2str(entNum)];
    createEntity(ent,entityType);
    loadedEntities{k}=ent;
    for c=1:ncols
      % C{c}{1} is the slot name from the header row; row k+1 holds the
      % value of that slot for the kth entity.
      putValue(C{c}{1}, ent, C{c}{k+1});
    end
    if mod(k-1,dotStep)==0
      fprintf('.');
    end
  end
catch loadErr
  % Restore the global flag so a failed load does not leave hierarchy
  % maintenance permanently switched off.
  THEO.maintainDiskHierarchy=maintainDiskHierarchy_originalVal;
  rethrow(loadErr);
end
fprintf('\n');

% Finished loading the KB.
% Next, restore the global var and perhaps update HTML summary of KB
THEO.maintainDiskHierarchy=maintainDiskHierarchy_originalVal;
if THEO.maintainDiskHierarchy
  webdisplayHierarchy('everything');
end
