/* 

  ****************   NO WARRANTY  *****************

Since the Aspirin/MIGRAINES system is licensed free of charge,
the MITRE Corporation provides absolutley no warranty. Should
the Aspirin/MIGRAINES system prove defective, you must assume
the cost of all necessary servicing, repair or correction.
In no way will the MITRE Corporation be liable to you for
damages, including any lost profits, lost monies, or other
special, incidental or consequential damages arising out of
the use or inability to use the Aspirin/MIGRAINES system.

  *****************   COPYRIGHT  *******************

This software is the copyright of The MITRE Corporation. 
It may be freely used and modified for research and development
purposes. We require a brief acknowledgement in any research
paper or other publication where this software has made a significant
contribution. If you wish to use it for commercial gain you must contact 
The MITRE Corporation for conditions of use. The MITRE Corporation 
provides absolutely NO WARRANTY for this software.

   January, 1992 
   Russell Leighton
   The MITRE Corporation
   7525 Colshire Dr.
   McLean, Va. 22102-3481

*/

/*---------------------------------------------------------------------------*/
/*                   CDA - CANONICAL DISCRIMINANTS ANALYSIS                  */
/*---------------------------------------------------------------------------*/
/* 
	This program performs canonical discriminants analysis. Canonical
discriminants analysis takes groups of points in a high-dimensional space and
determines the major directions of variation such that each group of points is
maximally separated. The canonical variates are labeled 0-(n-1) where n is the 
dimensionality of the space (e.g. the number of hidden units). 

The method for finding the canonical variates is divided into 4 steps:
	(1) finding the within-sum-of-squares for each group of points
        (2) finding the between-sum-of-squares of all groups
	(3) finding the eigenvectors (x) [the canonical variates] and 
 	    eigenvalues (@) [the canonical values] which satisfy 
            the equation:      Bx = @Wx
            where B is the between-sum-of-squares matrix and
                  W is the within-sum-of-squares matrix
	    The canonical variate with the greatest canonical value corresponds
	    to the direction in which the ratio of the between-group distance 
	    to the within-group distance is maximised.
	(4) projecting each of the initial input points onto the canonical
            variates.
Options:

-f <filename> - this is the file name in which the hidden unit patterns
                reside. The -f flag must be specified. Note that this
		name is also used as a root file for the canonical variates and
		canonical values files which the program creates. The filenames
 		are <filename>.canonical_variates and 
 		<filename>.canonical_values respectively.
		The canonical_values indicate the magnitude of the ratio of
		inter- to intra-distance. It provides a guide as to which 
		variates are most useful to analysis.
		If a file <filename>.canonical_variates already exists then cda
		assumes that this contains the canonical variates and reads this
		rather than recalculating them each time. The implication is 
		that if you change the hidden units file you must delete the 
		<filename>.canonical_variates file else cda will continue to 
		use the old canonical variates.

-i <filename> - this option allows you to project an arbitrary file on to
		the canonical variates. If the -i option is not specified
		the hidden units file will be used. This option is useful
		when you only want to view a subset of the total inputs.

-g <grpsfile> - the program assumes that there is a groups file called
		<grpsfile> which contains the name of the group to which
		each point belongs. The default filename is the name of 
		the hidden units file with ".groups" appended. If this 
		file does not exist the program will abort with an error 
		message.

-l <labsfile> - the program assumes that there is a labels file called
		<labsfile> which contains labels which will be appended
		to each of the projected points in the output. This allows
		the output of cda to be directly piped into graph for display
		purposes. Note there must be at least as many labels as there
		are points in the hidden units file.

-x <number>   - these specify which variates to project the hidden unit 
-y <number>     patterns onto to produce the x, y and z outputs. An error will
-z <number>     occur if the y is specified but the x isn't or if the z is
		specified but the y isn't.
*/

#include <errno.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Dimension of the square work matrices and of the per-row vectors.  The
   readers stop with an error once a row reaches MAXUNITS-1 values. */
#define MAXUNITS 200
/* Number of slots in the group table and in the per-group accumulators. */
#define MAXGROUPS 50
/* Character code compared against lookahead() results to recognise the end
   of a line (10 is the ASCII line feed). */
int	nl = 10;

/* Between-groups sum-of-squares matrix, filled by CalculateSSMatrices(). */
double BetweenSSMatrix[MAXUNITS][MAXUNITS];
/* Within-groups sum-of-squares matrix.  The Jacobi routines later work on it
   in place, and SaveEigenValues() writes out its diagonal. */
double WithinSSMatrix[MAXUNITS][MAXUNITS];
/* Scratch matrix holding products of the column totals. */
double TempSSMatrix[MAXUNITS][MAXUNITS];
/* GroupColTotals[g][c] is the sum of column c over the points of group g. */
double GroupColTotals[MAXGROUPS][MAXUNITS];
/* ColumnTotals[c] is the sum of column c over all groups. */
double ColumnTotals[MAXUNITS];
/* Project() and getlen() read vector number v as EigenVectors[i][v]. */
double EigenVectors[MAXUNITS][MAXUNITS];
/* Number of points seen per group, kept as a double because it is used as a
   divisor in floating-point expressions. */
double NrInGroups[MAXGROUPS];

/* Number of newline-terminated rows read from the hidden units file. */
int	MaximumRow = 0;
/* Widest row seen while reading the hidden units file, or the number of
   lines read from the canonical variates file. */
int	MaximumColumn = 0;
/* Values of the row currently being read by CalculateSSMatrices(). */
double	CurrentRow[1000];
/* Variate numbers selected with -x, -y and -z; 999 means "not requested". */
int	Xcomponent = 999;
int	Ycomponent = 999;
int	Zcomponent = 999;
/* Hidden units file (-f).  The stream is re-pointed at the -i file, when one
   is given, before the projection step. */
FILE	*HiddenUnitFile;
char	HiddenUnitFileName[100];
int	HiddenUnitFileFlag = 0;		/* non-zero once -f has been seen */
/* "<hidden units file>.canonical_variates" */
FILE	*EigenVectorsFile;
char	EigenVectorsFileName[100];
/* "<hidden units file>.canonical_values" */
FILE	*EigenValuesFile;
char	EigenValuesFileName[100];
/* Groups file (-g, or "<hidden units file>.groups" by default). */
FILE	*GroupsFile;
char	GroupsFileName[100];
int	GroupsFileFlag = 0;		/* non-zero once -g has been seen */
/* Labels file (-l). */
FILE	*LabelsFile;
char	LabelsFileName[100];
int	LabelsFileFlag = 0;		/* non-zero once -l has been seen */
/* Alternative file to project (-i). */
char	InputFileName[100];
int	InputFileFlag = 0;		/* non-zero once -i has been seen */
/* Incremented when getopt() reports an unrecognised option. */
int	ErrorFlag = 0;

/* Table of the group names met so far; a name's index in Group[] is the
   group number returned by GetGroupNr(). */
struct  GroupTable {
        char    Group[MAXGROUPS][100];
        int     NrOfGroups;
};
 
 
struct GroupTable       Groups;
 
#define TRUE 1
#define FALSE 0

/* Peek at the next character of `file` without consuming it.
   Returns the character, or EOF when nothing is left to read.  Pushing EOF
   back with ungetc() is a no-op, so the stream is left untouched in that
   case. */
int
lookahead(file)
FILE	*file;
{
	int	peeked = getc(file);

	(void) ungetc(peeked, file);
	return peeked;
}

/* Clear the per-group column sums so that accumulation starts from zero. */
InitGroupColTotals()
{
	int	group, unit;

	group = 0;
	while (group < MAXGROUPS) {
		unit = 0;
		while (unit < MAXUNITS) {
			GroupColTotals[group][unit] = 0.0;
			unit++;
		}
		group++;
	}
}

/* Clear every entry of ColumnTotals. */
InitColumnTotals()
{
	int	unit = MAXUNITS;

	while (unit-- > 0)
		ColumnTotals[unit] = 0.0;
}

/* Clear the whole between-groups sum-of-squares matrix. */
InitBetweenSSMatrix()
{
	int	row, col;

	for (row = 0; row < MAXUNITS; row++) {
		col = 0;
		while (col < MAXUNITS) {
			BetweenSSMatrix[row][col] = 0.0;
			col++;
		}
	}
}


/* Clear the whole within-groups sum-of-squares matrix. */
InitWithinSSMatrix()
{
	int	row, col;

	row = 0;
	while (row < MAXUNITS) {
		for (col = 0; col < MAXUNITS; col++)
			WithinSSMatrix[row][col] = 0.0;
		row++;
	}
}

/* Reset the per-group point counters to zero. */
InitNrInGroups()
{
	int	group = 0;

	while (group < MAXGROUPS) {
		NrInGroups[group] = 0.0;
		group++;
	}
}

/* Load the identity matrix into the leading MaximumColumn x MaximumColumn
   corner of EigenVectors; entries outside that corner are not touched. */
InitJacobi()
{
	int	row, col;

	for (row = 0; row < MaximumColumn; row++)
		for (col = 0; col < MaximumColumn; col++)
			EigenVectors[row][col] = (row == col) ? 1.0 : 0.0;
}

/* Open the groups file for reading and store the stream in GroupsFile.
   When no -g option was given (GroupsFileFlag is zero) the file name
   defaults to "<HiddenUnitFileName>.groups".  Prints a message and exits
   with status -1 if that default name would not fit in GroupsFileName or
   if the file cannot be opened. */
OpenGroupsFile()
{
	static char	suffix[] = ".groups";

	if (!GroupsFileFlag) {
		/* refuse to build a name that would overrun GroupsFileName */
		if (strlen(HiddenUnitFileName) + strlen(suffix) >= sizeof(GroupsFileName)) {
			printf("Groups file name is too long.\n");
			exit(-1);
		}
		sprintf(GroupsFileName, "%s%s", HiddenUnitFileName, suffix);
	}
	GroupsFile = fopen(GroupsFileName, "r");
	if (GroupsFile == NULL) {
		printf("%s does not exist.\n", GroupsFileName);
		exit(-1);
	}
}

/* Debug aid: dump the leading MaximumColumn x MaximumColumn part of the
   between-groups matrix to stdout, one matrix row per output line. */
PrintBetweenSSMatrix()
{
	int	row, col;

	row = 0;
	while (row < MaximumColumn) {
		col = 0;
		while (col < MaximumColumn) {
			printf("%10.3lg ", BetweenSSMatrix[row][col]);
			col++;
		}
		printf("\n");
		row++;
	}
}
		
/* Debug aid: dump the leading MaximumColumn x MaximumColumn part of the
   within-groups matrix to stdout, one matrix row per output line. */
PrintWithinSSMatrix()
{
	int	row, col;

	for (row = 0; row < MaximumColumn; row++) {
		col = 0;
		while (col < MaximumColumn) {
			printf("%10.3lg ", WithinSSMatrix[row][col]);
			col++;
		}
		printf("\n");
	}
}

PrintNrInGroups()
{
	int	i;

	for (i=0; i < Groups.NrOfGroups; i++)
		printf("%s %f\n", Groups.Group[i], NrInGroups[i]);
}

int     GetGroupNr(groupname)
        char    groupname[100];
 
{
        int     i = 0;
        int     found = 0;
 
        while (!found && i < Groups.NrOfGroups) {
            found = (strcmp(groupname, Groups.Group[i]) == 0);
            i++;
        };
 
        if (found)
            return(i-1);
        else {
            strcpy(Groups.Group[i], groupname);
            Groups.NrOfGroups++;
            return(i);
        };
 
}

/*****************************************************************************/
/*                              INITIALIZE				     */
/*****************************************************************************/

/* Reset the accumulators used by CalculateSSMatrices(): the group counts,
   the column totals (overall and per group) and the within-groups matrix.
   The four resets are independent of one another. */
Initialize()
{
	InitNrInGroups();
	InitColumnTotals();
	InitGroupColTotals();
	InitWithinSSMatrix();
}


/*****************************************************************************/
/*      CALCULATE WITHIN-SUM-OF-SQUARES and BETWEEN-SUM-OF-SQUARES MATRICES  */
/*****************************************************************************/

CalculateSSMatrices()
/* Calculates the within-sum-of-squares and between-sum-of-squares matrices.

   Reads HiddenUnitFile one line (point) at a time and, for each point, one
   group name from GroupsFile.  While reading it accumulates
	- NrInGroups[g]        the number of points in group g
	- GroupColTotals[g][c] the sum of column c over group g
	- WithinSSMatrix[i][j] the raw sum of products x_i * x_j
   and records the row count (MaximumRow) and widest row (MaximumColumn).
   Afterwards, with T_g the column totals of group g, n_g its size, T the
   overall column totals and N = MaximumRow:
	BetweenSSMatrix[i][j] = sum_g T_g[i]*T_g[j]/n_g - T[i]*T[j]/N
	WithinSSMatrix[i][j]  = sum x_i*x_j - sum_g T_g[i]*T_g[j]/n_g

   Exits with status -1 on malformed input: too few group names, a missing
   or over-long group name, a non-numeric value, or too many columns.
*/

{
	double	Number;
	int	col, groupNr, i, j, k;
	char	groupname[100];

	while (lookahead(HiddenUnitFile) != EOF){
 		if (lookahead(GroupsFile) == EOF){
			printf("The group file does not identified all points in the hidden units file.\n");
			exit(-1);
		}

		/* Get group identifier from group file.  The field width keeps
		   the scan inside groupname[100]; a result other than 1 means
		   the line held no name at all. */
		if (fscanf(GroupsFile, "%99[^\n]", groupname) != 1){
			printf("Error in groups file - group name missing.\n");
			exit(-1);
		}
		/* anything left before the newline did not fit in the buffer */
		if (lookahead(GroupsFile) != nl && lookahead(GroupsFile) != EOF){
			printf("Error in groups file - group name too long.\n");
			exit(-1);
		}
		/* skip the newline and any whitespace that follows it */
		(void) fscanf(GroupsFile, "\n");

		groupNr = GetGroupNr(groupname);
		NrInGroups[groupNr] = NrInGroups[groupNr] + 1.0;

		col = 0;
		while ((lookahead(HiddenUnitFile) != EOF) && (lookahead(HiddenUnitFile) != nl)) {
			if (fscanf(HiddenUnitFile, "%lg", &Number) != 1){
				printf("Error in hidden unit file.\n");
				exit(-1);
			}

			/* Sigma Xi */
			GroupColTotals[groupNr][col] = GroupColTotals[groupNr][col] + Number;

			/* Add to row memory */
			CurrentRow[col] = Number;

			/* Calculate XiXj terms, keeping the matrix symmetric */
			for (i=0; i<= col; i++){
				WithinSSMatrix[col][i] += CurrentRow[i] * Number;
				if (col != i) 
					WithinSSMatrix[i][col] += CurrentRow[i] * Number;
			}
			col = col + 1;
			if (col >= (MAXUNITS-1)){
				printf("cda can handle a maximum of %d hidden units.\n", MAXUNITS);
				exit(-1);
			}
			/* soak up non newline whitespace */
			while (lookahead(HiddenUnitFile) == ' ' || lookahead(HiddenUnitFile) == '\t')
				getc(HiddenUnitFile);
		}
		if (lookahead(HiddenUnitFile) == nl){
			getc(HiddenUnitFile); /* soak up nl */
			if (col > MaximumColumn)
				MaximumColumn = col;
			MaximumRow = MaximumRow + 1;
		}
	}

	/* Calculate Column totals.  Only the rows and columns that actually
	   hold data are visited, so nothing is read beyond the last group. */
	for (i=0; i< MaximumColumn; i++)
		for (j=0; j< Groups.NrOfGroups; j++)
			ColumnTotals[i] += GroupColTotals[j][i];

	/* Calculate sqr(Sigma X) */
	for (i=0; i< MaximumColumn; i++)
		for (j=0; j< MaximumColumn; j++)
			TempSSMatrix[i][j] = ColumnTotals[i]*ColumnTotals[j];

	/* subtract Sigma Xi Sigma Xj / N  */
	for (i=0; i< MaximumColumn; i++)
		for (j=0; j < MaximumColumn; j++) {
			BetweenSSMatrix[i][j] = - TempSSMatrix[i][j] / (double) MaximumRow;
			for (k=0; k < Groups.NrOfGroups; k++) {
				BetweenSSMatrix[i][j] = (BetweenSSMatrix[i][j] + GroupColTotals[k][i] * GroupColTotals[k][j]/ NrInGroups[k]);
				WithinSSMatrix[i][j] = (WithinSSMatrix[i][j] - GroupColTotals[k][i] * GroupColTotals[k][j]/ NrInGroups[k]);
			}
		}
}

/* Read previously saved canonical variates from EigenVectorsFileName.
   Line number j of the file supplies EigenVectors[0..][j], i.e. each line is
   stored as one column, mirroring SaveEigenVectors().  MaximumColumn is set
   to the number of newline-terminated lines read.
   Exits with status -1 if the file cannot be opened, holds a non-numeric
   value, or has more values per line or more lines than the matrix can
   hold. */
LoadEigenVectors()
{
	double	Number;
	int	i, j;

	j = 0;
	EigenVectorsFile = fopen(EigenVectorsFileName, "r");
	if (EigenVectorsFile == NULL){
		printf("%s does not exist.\n", EigenVectorsFileName);
		exit(-1);
	}

	while (lookahead(EigenVectorsFile) != EOF){
		i = 0;
		while ((lookahead(EigenVectorsFile) != EOF) && (lookahead(EigenVectorsFile) != nl)) {
			/* a file with too many lines must not run off the matrix */
			if (j >= MAXUNITS){
				printf("cda can handle a maximum of %d hidden units.\n", MAXUNITS);
				exit(-1);
			}
			if (fscanf(EigenVectorsFile, "%lg", &Number) != 1){
				printf("Error in canonical_variates file.\n");
				exit(-1);
			}
			EigenVectors[i][j] = Number;
			i = i + 1;
			if (i >= (MAXUNITS-1)){
				printf("cda can handle a maximum of %d hidden units.\n", MAXUNITS);
				exit(-1);
			}
			/* soak up non newline whitespace */
			while (lookahead(EigenVectorsFile) == ' ' || lookahead(EigenVectorsFile) == '\t')
				getc(EigenVectorsFile);
		}
		if (lookahead(EigenVectorsFile) == nl){
			getc(EigenVectorsFile); /* soak up nl */
			j = j + 1;
			MaximumColumn = j;
		}
	}

	fclose(EigenVectorsFile);
}

/* Write the canonical variates to EigenVectorsFileName, one variate per
   line: line i holds EigenVectors[0..MaximumColumn-1][i].
   Exits with status -1 if the file cannot be created. */
SaveEigenVectors()
{
	int	i, j;

	EigenVectorsFile = fopen(EigenVectorsFileName, "w");
	if (EigenVectorsFile == NULL){
		printf("Unable to create %s.\n", EigenVectorsFileName);
		exit(-1);
	}
	for (i=0; i< MaximumColumn; i++){
		for (j=0; j< MaximumColumn; j++)
			fprintf(EigenVectorsFile, "%lg ", EigenVectors[j][i]);
		fprintf(EigenVectorsFile, "\n");
	}

	fclose(EigenVectorsFile);
}

/* Write the canonical values, i.e. the diagonal of WithinSSMatrix as left by
   the Jacobi routines, on a single line of
   "<HiddenUnitFileName>.canonical_values".
   Exits with status -1 if the file name would not fit in
   EigenValuesFileName or the file cannot be created. */
SaveEigenValues()
{
	static char	suffix[] = ".canonical_values";
	int	i;

	/* refuse to build a name that would overrun EigenValuesFileName */
	if (strlen(HiddenUnitFileName) + strlen(suffix) >= sizeof(EigenValuesFileName)){
		printf("Canonical values file name is too long.\n");
		exit(-1);
	}
	sprintf(EigenValuesFileName, "%s%s", HiddenUnitFileName, suffix);

	EigenValuesFile = fopen(EigenValuesFileName, "w");
	if (EigenValuesFile == NULL){
		printf("Unable to create %s.\n", EigenValuesFileName);
		exit(-1);
	}
	for (i=0; i< MaximumColumn; i++){
		fprintf(EigenValuesFile, "%lg ", WithinSSMatrix[i][i]);
	}
	fprintf(EigenValuesFile, "\n");

	/* the stream used to be left open */
	fclose(EigenValuesFile);
}


/*****************************************************************************/
/*                      FIND EIGENVECTORS AND EIGENVALUES                    */
/*****************************************************************************/

/* Threshold below which a quantity is treated as zero by EvalJacobi(). */
#define accuracy 0.000000001

EvalJacobi()
/* calculate the eigensystem using the jacobi method for symmetric matrices.
   Taken from "COMPACT NUMERICAL METHODS FOR COMPUTERS: linear algebra and
   function minimization" by J. C. Nash

   Works in place on the leading MaximumColumn x MaximumColumn part of
   WithinSSMatrix; on return its diagonal holds the eigenvalues.  Every plane
   rotation is also applied to the columns of EigenVectors, which is NOT
   re-initialised here, so rotations accumulate on top of whatever the
   caller left in it.

   Sweeps over all pairs (i,j), i<j, are repeated until a whole sweep skips
   every pair or the sweep limit is exhausted; in the latter case a warning
   is printed to stdout.
*/
{
	int	i,j, count, m, n, k, limit, oki, okj;
	double	p, q, t, s, c, r;
	int	rotn;

	limit = 50;
	m =0;
	n=MaximumColumn;

	for (count=0; (count <= limit) && (m < (n*(n-1)/2)); count++){
		m = 0; /* init number of rotations skipped during sweep */
		for (i = 0; i < (n-1); i++)
			for (j=(i+1); j<n; j++){
				rotn = TRUE;
				/* identity rotation until a real one is computed */
				c = 1.0;
				s = 0.0;
				p = 0.5 * (WithinSSMatrix[i][j]+WithinSSMatrix[j][i]);
				q = WithinSSMatrix[i][i]-WithinSSMatrix[j][j];
				t = sqrt(4.0*p*p+q*q);
				if (fabs(t) < accuracy){ /* ie t = 0 */
					rotn = FALSE;
				}
				else if (q >= 0.0){
					/* is the off-diagonal element negligible
					   next to both diagonal elements? */
					oki = ((fabs(WithinSSMatrix[i][i])+100.0*fabs(p) - fabs(WithinSSMatrix[i][i]))  < accuracy);
					okj = ((fabs(WithinSSMatrix[j][j])+100.0*fabs(p) - fabs(WithinSSMatrix[j][j]))  < accuracy);
					if (oki && okj)
						rotn = FALSE;
					else {
						c = sqrt((t+q)/(2.0*t));
						s = p/(t*c);
					}
				}
				else {
					s = sqrt((t-q)/(2.0*t));
					if (p<0.0) s = -s;
					c = p/(t*s);
				}

				/* a vanishing sine means the rotation would change
				   nothing; s is only meaningful if one was computed */
				if (rotn && (1.0 + fabs(s) - 1.0) < accuracy)
					rotn = FALSE;

				if (rotn){
					/* perform rotation */
					for (k = 0; k < n; k++){
						q = WithinSSMatrix[i][k];
						WithinSSMatrix[i][k] = c * q + s * WithinSSMatrix[j][k];
						WithinSSMatrix[j][k] = -s *q+c*WithinSSMatrix[j][k];
					}
					for (k=0; k <n; k++){
						q = WithinSSMatrix[k][i];
						WithinSSMatrix[k][i] = c*q+s*WithinSSMatrix[k][j];
						WithinSSMatrix[k][j] = -s*q+c*WithinSSMatrix[k][j];
						r = EigenVectors[k][i];
						EigenVectors[k][i] = c*r+s*EigenVectors[k][j];
						EigenVectors[k][j] = -s*r+c*EigenVectors[k][j];
					}
				}
				else
					/* every skipped pair counts, including t = 0,
					   otherwise the sweep loop can never stop early */
					m=m+1;
			}
	}
	/* the last sweep still rotated at least one pair: not converged */
	if (m < (n*(n-1)/2)) {
		printf("Eigen system may not have been calculated accurately. That is the algorithm has not converged. \n");
	}
}

GenEvalJacobi()
/* Solve the generalised eigenproblem  B x = lambda W x  with two passes of
   the symmetric Jacobi routine (after J. C. Nash, "COMPACT NUMERICAL METHODS
   FOR COMPUTERS: linear algebra and function minimization").

   1. EvalJacobi() diagonalises WithinSSMatrix; EigenVectors starts out as
      the identity.
   2. Each column of EigenVectors is divided by the square root of its
      eigenvalue; values that are not positive are first replaced by 1e-7.
   3. WithinSSMatrix is overwritten with  V' * BetweenSSMatrix * V.
   4. EvalJacobi() diagonalises that matrix, continuing to rotate
      EigenVectors.
*/

{
	int	vec, other, a, b;
	double	acc;

	InitJacobi();
	EvalJacobi();

	/* step 2: rescale each eigenvector */
	vec = 0;
	while (vec < MaximumColumn) {
		if (WithinSSMatrix[vec][vec] <= 0.0)
			WithinSSMatrix[vec][vec] = 0.0000001;
		acc = 1.0/sqrt(WithinSSMatrix[vec][vec]);
		for (a = 0; a < MaximumColumn; a++)
			EigenVectors[a][vec] = acc * EigenVectors[a][vec];
		vec++;
	}

	/* step 3: every (vec, other) pair is evaluated and stored in both
	   mirrored positions */
	for (vec = 0; vec < MaximumColumn; vec++) {
		other = 0;
		while (other < MaximumColumn) {
			acc = 0.0;
			for (a = 0; a < MaximumColumn; a++) {
				b = 0;
				while (b < MaximumColumn) {
					acc = acc + EigenVectors[a][vec]*BetweenSSMatrix[a][b]*EigenVectors[b][other];
					b++;
				}
			}
			WithinSSMatrix[vec][other] = acc;
			WithinSSMatrix[other][vec] = acc;
			other++;
		}
	}

	EvalJacobi();
}

/*****************************************************************************/
/*                  PROJECT HIDDEN VECTORS ONTO COMPONENTS                   */
/*****************************************************************************/

/* Return the component of HiddenVector along canonical variate number
   EigenVectorNumber: the dot product of the first MaximumColumn entries of
   HiddenVector with that column of EigenVectors, divided by the column's
   Euclidean length. */
double
Project(HiddenVector, EigenVectorNumber) 
double	HiddenVector[MAXUNITS];
int	EigenVectorNumber;
{
	double	getlen();
	double	norm, dot, result;
	int	unit;

	norm = getlen(EigenVectorNumber);

	dot = 0.0;
	unit = 0;
	while (unit < MaximumColumn) {
		dot += HiddenVector[unit] * EigenVectors[unit][EigenVectorNumber];
		unit++;
	}

	result = dot / norm;
	return(result);
}

/* Return the Euclidean length of column EigenVectorNumber of EigenVectors,
   taken over its first MaximumColumn entries. */
double
getlen(EigenVectorNumber) 
    int	EigenVectorNumber;
{
    double	sumsq = 0.0;
    double	component;
    int		row;

    for (row = 0; row < MaximumColumn; row++) {
	    component = EigenVectors[row][EigenVectorNumber];
	    sumsq += component * component;
    }
    return(sqrt(sumsq));
}

ProjectHiddenUnitPatterns()
{
	double	Number;
	int	col, i, j;
	double	HiddenVector[MAXUNITS];
	char	label[100];

	rewind(HiddenUnitFile);

	while (lookahead(HiddenUnitFile) != EOF){
		col = 0;
		while ((lookahead(HiddenUnitFile) != EOF) && (lookahead(HiddenUnitFile) != nl)) {
			if (fscanf(HiddenUnitFile, "%lg", &Number) != 1){
				printf("Error in the input file. Note if no input file was specified this means there is an error in the hidden units file.\n");
				exit(-1);
				};
			HiddenVector[col] = Number;
			col = col + 1;
			if (col >= (MAXUNITS-1)){
				printf("cda can handle a maximum of %d hidden units.\n", MAXUNITS);
				exit(-1);
				};
			/* soak up non newline whitespace */
			while (lookahead(HiddenUnitFile) == ' ' || lookahead(HiddenUnitFile) == '\t')
				getc(HiddenUnitFile);
		};
		if (lookahead(HiddenUnitFile) == nl){
			getc(HiddenUnitFile); /* soak up nl */
			if (Xcomponent != 999)
				printf("%g ", Project(HiddenVector, Xcomponent));
			if (Ycomponent != 999)
				printf("%g ", Project(HiddenVector, Ycomponent));
			if (Zcomponent != 999)
				printf("%g ", Project(HiddenVector, Zcomponent));
			if (LabelsFileFlag){
				if (fscanf(LabelsFile, "%[^\n]\n", label) == EOF){
					printf("\nNo more labels.\n");
					exit(-1);
					};
				printf("\"%s\"", label);
				};
			if (Xcomponent != 999 || Ycomponent != 999 || Zcomponent != 999 || LabelsFileFlag)
				printf("\n");
			}
		}

}

/*****************************************************************************/
/*                       PROCESS COMMAND LINE OPTIONS                        */
/*****************************************************************************/

/* Parse the command line with getopt() and open the files it names.
	-f <file>   hidden units file (required); opened into HiddenUnitFile
	-i <file>   alternative file to project; only the name is stored here
	-g <file>   groups file; only the name is stored here
	-l <file>   labels file; opened into LabelsFile
	-x/-y/-z n  variate numbers to project onto
   File names are read as a single whitespace-delimited word of at most 99
   characters so that they always fit the 100-byte name buffers.
   An unknown option, an empty file name or an unparsable number prints the
   usage line on stderr and exits with status 1; the other errors print a
   message on stdout and exit with status -1. */
ProcessCommandLineOptions(argc, argv)
int argc;
char **argv;
{
	extern char *optarg;
	extern int optind;
	int c;

	while ((c = getopt(argc, argv, "x:y:z:f:g:l:i:")) != -1)
	    switch (c) {
	    case 'f':
			HiddenUnitFileFlag++;
			if (sscanf(optarg, "%99s", HiddenUnitFileName) != 1)
				ErrorFlag++;
			break;
	    case 'i':
			InputFileFlag++;
			if (sscanf(optarg, "%99s", InputFileName) != 1)
				ErrorFlag++;
			break;
	    case 'g':
			GroupsFileFlag++;
			if (sscanf(optarg, "%99s", GroupsFileName) != 1)
				ErrorFlag++;
			break;
	    case 'l':
			LabelsFileFlag++;
			if (sscanf(optarg, "%99s", LabelsFileName) != 1)
				ErrorFlag++;
			break;
	    case 'x': 
			if (sscanf(optarg, "%d", &Xcomponent) != 1)
				ErrorFlag++;
			break;
	    case 'y': 
			if (sscanf(optarg, "%d", &Ycomponent) != 1)
				ErrorFlag++;
			break;
	    case 'z': 
			if (sscanf(optarg, "%d", &Zcomponent) != 1)
				ErrorFlag++;
			break;
	    case '?':
			ErrorFlag++;
			break;
	    }

	if (ErrorFlag) {
	    (void) fprintf(stderr, "usage: cda -f <HiddenUnitFile> [-i <InputFile>] [-g <GroupsFile>] [-l <LabelsFile>] [-x <number> [-y <number> [-z <number>]]]\n");
	    exit (1);
	}

	if (!HiddenUnitFileFlag){
		printf("No hidden unit file was supplied.\n");
		exit(-1);
	}
	HiddenUnitFile = fopen(HiddenUnitFileName, "r");
	if (HiddenUnitFile == NULL){
		printf("%s does not exist.\n", HiddenUnitFileName);
		exit(-1);
	}

	if (LabelsFileFlag){
		LabelsFile = fopen(LabelsFileName, "r");
		if (LabelsFile == NULL){
			printf("%s does not exist.\n", LabelsFileName);
			exit(-1);
		}
	}

	/* 999 marks a component that was not requested */
	if (Ycomponent != 999 && Xcomponent == 999){
		printf("If Y component is specified X component must be also.\n");
		exit(-1);
	}

	if (Zcomponent != 999 && Ycomponent == 999){
		printf("If Z component is specified then Y component must be also.\n");
		exit(-1);
	}
}

/* Check that each requested component (any value other than the "not
   requested" marker 999) lies in 0 .. MaximumColumn-1.  The first offending
   component is reported on stdout and the program exits with status -1. */
AreComponentsLegal()
{
	if (Xcomponent != 999) {
		if (!(Xcomponent >= 0 && Xcomponent < MaximumColumn)) {
			printf("X component was not legal. Components should be between 0 and %d.\n", MaximumColumn-1);
			exit(-1);
		}
	}

	if (Ycomponent != 999) {
		if (!(Ycomponent >= 0 && Ycomponent < MaximumColumn)) {
			printf("Y component was not legal. Components should be between 0 and %d.\n", MaximumColumn-1);
			exit(-1);
		}
	}

	if (Zcomponent != 999) {
		if (!(Zcomponent >= 0 && Zcomponent < MaximumColumn)) {
			printf("Z component was not legal. Components should be between 0 and %d.\n", MaximumColumn-1);
			exit(-1);
		}
	}
}

/* Program entry point.
   After option processing, the canonical variates are loaded from
   "<hidden units file>.canonical_variates" when that file is readable.
   When it does not exist they are computed from the hidden units and groups
   files and saved together with the canonical values.  Any other access
   failure is reported and ends the program with status 1.  Finally the
   hidden units file, or the -i file when given, is projected onto the
   requested variates.  Returns 0 on success. */
int
main(argc, argv)
int	argc;
char	*argv[];
{
	static char	suffix[] = ".canonical_variates";

	Initialize();

	ProcessCommandLineOptions(argc, argv);

	/* refuse to build a name that would overrun EigenVectorsFileName */
	if (strlen(HiddenUnitFileName) + strlen(suffix) >= sizeof(EigenVectorsFileName)){
		printf("Hidden unit file name is too long.\n");
		exit(-1);
	}
	sprintf(EigenVectorsFileName, "%s%s", HiddenUnitFileName, suffix);

	/* if the canonical variates file exists then load those canonical_variates else 
	   calculate the canonical variates */
	if (access(EigenVectorsFileName, R_OK) == 0)
		LoadEigenVectors();
	else if (errno == ENOENT) {
		OpenGroupsFile();
		CalculateSSMatrices();
		GenEvalJacobi();
		SaveEigenVectors();
		SaveEigenValues();
	}
	else {
		/* the file is there (or its path is unusable) but cannot be read */
		printf("Unable to access canonical variates file %s.\n", EigenVectorsFileName);
		exit(1);
	}

	AreComponentsLegal();
	if (InputFileFlag){
		/* the hidden units stream is no longer needed once replaced */
		fclose(HiddenUnitFile);
		HiddenUnitFile = fopen(InputFileName, "r");
		if (HiddenUnitFile == NULL){
			printf("Input file does not exist.\n");
			exit(-1);
		}
	}
	ProjectHiddenUnitPatterns();
	return 0;
}
