/* 

  ****************   NO WARRANTY  *****************

Since the Aspirin/MIGRAINES system is licensed free of charge,
the MITRE Corporation provides absolutley no warranty. Should
the Aspirin/MIGRAINES system prove defective, you must assume
the cost of all necessary servicing, repair or correction.
In no way will the MITRE Corporation be liable to you for
damages, including any lost profits, lost monies, or other
special, incidental or consequential damages arising out of
the use or inability to use the Aspirin/MIGRAINES system.

  *****************   COPYRIGHT  *******************

This software is the copyright of The MITRE Corporation. 
It may be freely used and modified for research and development
purposes. We require a brief acknowledgement in any research
paper or other publication where this software has made a significant
contribution. If you wish to use it for commercial gain you must contact 
The MITRE Corporation for conditions of use. The MITRE Corporation 
provides absolutely NO WARRANTY for this software.

   January, 1992 
   Russell Leighton
   The MITRE Corporation
   7525 Colshire Dr.
   McLean, Va. 22102-3481

*/
/*



                   PCA - PRINCIPAL COMPONENTS ANALYSIS

			by Simon Dennis 5/4/91

*/


/* 
	This program performs principal components analysis. Principal
components analysis takes a set of points in a high dimensional space and
determines the major components of variation. The principal components are
labeled 0-(n-1) where n is the dimensionality of the space (i.e. the
number of hidden units). 

Finding the principal components entails 3 steps:
	(1) calculate the covariance matrix
	(2) find the eigenvectors of this matrix (these are the principal 
		components).
	(3) project each of the initial input points onto the principal
		components.
Options:

-f <filename> - this is the file name in which the hidden unit patterns
                reside. The -f flag must be specified. Note that this
		name is also used as a root file for the eigenvectors and
		eigenvalues files which the program creates. 
		<filename>.principal_components - contains the principal 
		components 
		<filename>.principal_values - contains the principal_values 
		The principal_values can be used to give a guide of the amount 
		of variance accounted for by each component. Hence it gives
		a rough guide as to what may be the useful principal
		components to look at.
		If a file <filename>.principal_components already exists then 
		pca assumes that this contains the principal_components and 
		reads this rather than recalculating them each time. The
		implication is that if you change the hidden units file
		you must delete the <filename>.principal_components file else
		pca will continue to use the old eigenvectors.

-i <filename> - this option allows you to project an arbitrary file on to
		the principal components. If the -i option is not specified
		the hidden unit file will be used. This option is useful
		when you only want to view a subset of the total inputs.

-l <labsfile> - the program assumes that there is a labels file called
		<labsfile> which contains labels which will be appended
		to each of the projected points in the output. This allows
		the output of pca to be directly piped into graph for display
		purposes. Note there must be at least as many labels as there
		are points in the hidden unit file.

-x <number>   - these specify which components to project the hidden unit 
-y <number>     patterns onto to produce the x, y and z outputs. An error will
-z <number>     occur if the y is specified but the x isn't or if the z is
		specified but the y isn't.
*/

#include <errno.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Dimension of the fixed-size tables below (columns / hidden units). */
#define MAXUNITS 200
/* Character code the readers treat as end-of-line (10 is ASCII line feed). */
int	nl = 10;

/* Accumulates sums of products while the hidden unit file is read;
   CalculateCovarianceMatrix() converts it to the covariance matrix and
   CalculateEigenSystem() then rotates it in place. */
double CovarianceMatrix[MAXUNITS][MAXUNITS];
/* Running sum of each input column. */
double ColumnTotals[MAXUNITS];
/* Component k is stored down column k, i.e. EigenVectors[i][k]. */
double EigenVectors[MAXUNITS][MAXUNITS];

/* Number of rows counted while reading the hidden unit file. */
int	MaximumRow = 0;
/* Widest row seen (or number of rows in the principal_components file);
   used everywhere as the dimensionality n. */
int	MaximumColumn = 0;
/* Values of the row currently being read by CalculateCovarianceMatrix(). */
double	CurrentRow[1000];
/* Components selected by -x, -y and -z; 999 means "not specified". */
int	Xcomponent = 999;
int	Ycomponent = 999;
int	Zcomponent = 999;
/* Not referenced by any code visible in this file. */
int	SummaryFlag = 0;
/* Stream being read for patterns: the -f file, later replaced by the -i
   file in main() when one is given. */
FILE	*HiddenUnitFile;
char	HiddenUnitFileName[100];
/* Non-zero once -f has been seen. */
int	HiddenUnitFileFlag = 0;
/* <HiddenUnitFileName>.principal_components */
FILE	*EigenVectorsFile;
char	EigenVectorsFileName[100];
/* <HiddenUnitFileName>.principal_values */
FILE	*EigenValuesFile;
char	EigenValuesFileName[100];
/* Optional labels file given with -l. */
FILE	*LabelsFile;
char	LabelsFileName[100];
/* Optional alternative input file given with -i. */
char	InputFileName[100];
int	InputFileFlag = 0;
int	LabelsFileFlag = 0;
/* Incremented when getopt reports a bad option; triggers the usage text. */
int	ErrorFlag = 0;

#define TRUE 1
#define FALSE 0

int
lookahead(file)    /* peek at the next character of the stream; the
                      character (or EOF) is returned and left unread. */
FILE	*file;
{
	int	peeked = getc(file);

	/* Hand the character straight back to the stream so the next read
	   sees it again. */
	ungetc(peeked, file);
	return peeked;
}

/*****************************************************************************/
/*                             INITIALIZATION                                */
/*****************************************************************************/

InitColumnTotals()
{
	/* Zero every per-column running sum, walking the table backwards. */
	int	unit = MAXUNITS;

	while (unit-- > 0)
		ColumnTotals[unit] = 0.0;
}

InitCovarianceMatrix()
{
	int	row, cell;
	double	*cells;

	/* Clear the whole MAXUNITS x MAXUNITS accumulator, one row at a time. */
	for (row = 0; row < MAXUNITS; row++) {
		cells = CovarianceMatrix[row];
		for (cell = 0; cell < MAXUNITS; cell++)
			cells[cell] = 0.0;
	}
}

Initialize()
{
	/* Reset both accumulators; they are independent of each other, so
	   the order of the two calls does not matter. */
	InitColumnTotals();
	InitCovarianceMatrix();
}


/*****************************************************************************/
/*                       CALCULATE COVARIANCE MATRIX                         */
/*****************************************************************************/

CalculateCovarianceMatrix()
/* Read every pattern from HiddenUnitFile (one pattern per line, numbers
   separated by spaces or tabs) and leave the covariance matrix of the
   columns in CovarianceMatrix[0..MaximumColumn-1][0..MaximumColumn-1]:

       cov(i,j) = (Sigma XiXj - Sigma Xi * Sigma Xj / N) / N

   where N is the number of patterns (MaximumRow).

   Side effects: adds to ColumnTotals and CovarianceMatrix (both are
   expected to be zero on entry), overwrites CurrentRow, and updates
   MaximumRow and MaximumColumn.

   A line only counts as a pattern when it holds at least one number, so
   blank lines do not inflate N, and a final line that lacks a trailing
   newline is still counted.

   Exits with status -1 on a malformed number or when a row is too wide.
   Note that the width check fires as soon as col reaches MAXUNITS-1.
*/
{
	double	Number;
	int	col, i, j;

	while (lookahead(HiddenUnitFile) != EOF){
		col = 0;

		/* soak up leading blanks so that a whitespace-only line is seen
		   as empty instead of letting fscanf run on into the next line */
		while (lookahead(HiddenUnitFile) == ' ' || lookahead(HiddenUnitFile) == '\t')
			getc(HiddenUnitFile);

		while ((lookahead(HiddenUnitFile) != EOF) && (lookahead(HiddenUnitFile) != nl)) {
			if (fscanf(HiddenUnitFile, "%lg", &Number) != 1){
				printf("Error in hidden unit file.\n");
				exit(-1);
				};

			/* Sigma Xi */
			ColumnTotals[col] = ColumnTotals[col] + Number;

			/* Add to row memory */
			CurrentRow[col] = Number;

			/* Calculate XiXj terms; the matrix is kept symmetric by
			   updating both [col][i] and [i][col] */
			for (i=0; i<= col; i++){
				CovarianceMatrix[col][i] += CurrentRow[i] * Number;
				if (col != i)
					CovarianceMatrix[i][col] += CurrentRow[i] * Number;
				};
			col = col + 1;
			if (col >= (MAXUNITS-1)){
				printf("pca can handle a maximum of %d hidden units.\n", MAXUNITS);
				exit(-1);
				};
			/* soak up non newline whitespace */
			while (lookahead(HiddenUnitFile) == ' ' || lookahead(HiddenUnitFile) == '\t')
				getc(HiddenUnitFile);
		};

		if (lookahead(HiddenUnitFile) == nl)
			getc(HiddenUnitFile); /* soak up nl */

		/* Count the row only if it contributed data.  This also covers a
		   last row terminated by EOF rather than by a newline, whose values
		   have already been added to the sums above. */
		if (col > 0){
			if (col > MaximumColumn)
				MaximumColumn = col;
			MaximumRow = MaximumRow + 1;
			}
		}


		/* subtract Sigma Xi Sigma Xj / N  (never executed when no rows
		   were read, because MaximumColumn is then still 0) */
		for (i=0; i< MaximumColumn; i++)
			for (j=0; j < MaximumColumn; j++)
				CovarianceMatrix[i][j] = (CovarianceMatrix[i][j] - ColumnTotals[i] * ColumnTotals[j]/ (double) MaximumRow) / (double) MaximumRow;
}

LoadEigenVectors()
/* Read the principal components from EigenVectorsFileName into
   EigenVectors.  Line j of the file holds component j, whose elements are
   stored down column j (EigenVectors[i][j]).  MaximumColumn is set to the
   number of newline-terminated lines read.

   Exits with status -1 if the file cannot be opened, contains a malformed
   number, or has more rows or columns than the table can hold.
*/
{
	double	Number;
	int	i, j;

	j = 0;
	EigenVectorsFile = fopen(EigenVectorsFileName, "r");
	if (EigenVectorsFile == NULL){
		printf("%s does not exist.\n", EigenVectorsFileName);
		exit(-1);
		};


	while (lookahead(EigenVectorsFile) != EOF){
		i = 0;
		while ((lookahead(EigenVectorsFile) != EOF) && (lookahead(EigenVectorsFile) != nl)) {
			if (fscanf(EigenVectorsFile, "%lg", &Number) != 1){
				printf("Error in principal_components file.\n");
				exit(-1);
				};
			/* the line count is otherwise unbounded: refuse to write past
			   the last column of EigenVectors */
			if (j >= MAXUNITS){
				printf("pca can handle a maximum of %d hidden units.\n", MAXUNITS);
				exit(-1);
				};
			EigenVectors[i][j] = Number;
			i = i + 1;
			if (i >= (MAXUNITS-1)){
				printf("pca can handle a maximum of %d hidden units.\n", MAXUNITS);
				exit(-1);
				};
			/* soak up non newline whitespace */
			while (lookahead(EigenVectorsFile) == ' ' || lookahead(EigenVectorsFile) == '\t')
				getc(EigenVectorsFile);
		};
		if (lookahead(EigenVectorsFile) == nl){
			getc(EigenVectorsFile); /* soak up nl */
			j = j + 1;
			MaximumColumn = j;
			}
		}

	fclose(EigenVectorsFile);
}

SaveEigenVectors()
/* Write the principal components to EigenVectorsFileName, one component
   per line: line i lists EigenVectors[0..MaximumColumn-1][i] separated by
   spaces.  This is the layout LoadEigenVectors() reads back.

   Exits with status -1 if the file cannot be created or written.
*/
{
	int	i, j;

	EigenVectorsFile = fopen(EigenVectorsFileName, "w");
	if (EigenVectorsFile == NULL){
		/* previously a NULL stream was passed straight to fprintf */
		printf("Unable to create %s.\n", EigenVectorsFileName);
		exit(-1);
		};

	for (i=0; i< MaximumColumn; i++){
		for (j=0; j< MaximumColumn; j++)
			fprintf(EigenVectorsFile, "%lg ", EigenVectors[j][i]);
		fprintf(EigenVectorsFile, "\n");
		};

	/* fclose flushes the buffered output, so this is where a write error
	   (e.g. a full disk) shows up */
	if (fclose(EigenVectorsFile) != 0){
		printf("Error writing %s.\n", EigenVectorsFileName);
		exit(-1);
		};
}

SaveEigenValues()
/* Write the principal values to <HiddenUnitFileName>.principal_values as a
   single line of space-separated numbers.  The values are taken from the
   diagonal of CovarianceMatrix, which CalculateEigenSystem() has rotated
   in place.

   Exits with status -1 if the file name does not fit its buffer or the
   file cannot be created or written.
*/
{
	static char	suffix[] = ".principal_values";
	int	i;

	/* the name buffer is fixed-size; make sure root + suffix + NUL fits
	   before formatting into it */
	if (strlen(HiddenUnitFileName) + strlen(suffix) >= sizeof(EigenValuesFileName)){
		printf("%s: file name is too long.\n", HiddenUnitFileName);
		exit(-1);
		};
	sprintf(EigenValuesFileName, "%s%s", HiddenUnitFileName, suffix);

	EigenValuesFile = fopen(EigenValuesFileName, "w");
	if (EigenValuesFile == NULL){
		printf("Unable to create %s.\n", EigenValuesFileName);
		exit(-1);
		};

	for (i=0; i< MaximumColumn; i++){
			fprintf(EigenValuesFile, "%lg ", CovarianceMatrix[i][i]);
		};
	fprintf(EigenValuesFile, "\n");

	/* the stream used to be left open; close it and report a failed flush */
	if (fclose(EigenValuesFile) != 0){
		printf("Error writing %s.\n", EigenValuesFileName);
		exit(-1);
		};
}


/*****************************************************************************/
/*                      FIND EIGENVECTORS AND EIGENVALUES                    */
/*****************************************************************************/

/* Threshold below which a quantity is treated as zero by the Jacobi sweep. */
#define accuracy 0.000000001

CalculateEigenSystem()
/* calculate the eigensystem using the jacobi method 
   taken from "COMPACT NUMERICAL METHODS FOR COMPUTERS: linear algebra and
   function minimization" by J. C. Nash

   Works on the leading MaximumColumn x MaximumColumn part of
   CovarianceMatrix, which is rotated in place; the same rotations are
   accumulated in EigenVectors, which starts as the identity.  Afterwards
   the diagonal of CovarianceMatrix is what SaveEigenValues() writes out as
   the principal values.  No sorting of the components is done here.

   Sweeps over all (i,j) pairs with i < j are repeated until one whole
   sweep skips every rotation, or until limit+1 sweeps have been made, in
   which case a warning is printed.
*/
{
	int	i,j, count, m, n, k, limit, oki, okj;
	int	pairs;
	double	p, q, t, s, c, r;
	int	rotn;

	limit = 50;
	m =0;
	n=MaximumColumn;
	pairs = n*(n-1)/2;	/* number of off-diagonal pairs visited per sweep */
	/* initialize the EigenVector matrix to the identity */
	for (i= 0; i < n; i++){
		for (j=0; j<n; j++)
			EigenVectors[i][j] = 0.0;
		EigenVectors[i][i] = 1.0;
		};
	for (count=0; (count <= limit) && (m < pairs); count++){
		m = 0; /* init number of rotations skipped during sweep */
		for (i = 0; i < (n-1); i++)
			for (j=(i+1); j<n; j++){
				rotn = TRUE;
				/* start from the identity rotation so that s is never
				   read while indeterminate on the skip path below */
				s = 0.0;
				c = 1.0;
				p = 0.5 * (CovarianceMatrix[i][j]+CovarianceMatrix[j][i]);
				q = CovarianceMatrix[i][i]-CovarianceMatrix[j][j];
				t = sqrt(4.0*p*p+q*q);
				if (fabs(t) < accuracy){ /* ie t = 0 */
					rotn = FALSE;
					}
				else {
					if (q >= 0.0){
						/* skip when the off-diagonal term is negligible
						   next to both diagonal elements */
						oki = ((fabs(CovarianceMatrix[i][i])+100.0*fabs(p) - fabs(CovarianceMatrix[i][i]))  < accuracy);
						okj = ((fabs(CovarianceMatrix[j][j])+100.0*fabs(p) - fabs(CovarianceMatrix[j][j]))  < accuracy);
						if (oki && okj) rotn = FALSE;
						else rotn = TRUE;

						if (rotn){
							c = sqrt((t+q)/(2.0*t));
							s=p/(t*c);
							};
						}
					else {
						rotn = TRUE;
						s = sqrt((t-q)/(2.0*t));
						if (p<0.0) s = -s;
						c=p/(t*s);
						};

					/* a rotation through a negligible angle is skipped */
					if ((1.0 + fabs(s) - 1.0) < accuracy) rotn = FALSE;

					if (rotn){
						/* perform rotation */
						for (k = 0; k < n; k++){
							q = CovarianceMatrix[i][k];
							CovarianceMatrix[i][k] = c * q + s * CovarianceMatrix[j][k];
							CovarianceMatrix[j][k] = -s *q+c*CovarianceMatrix[j][k];
							};
						for (k=0; k <n; k++){
							q = CovarianceMatrix[k][i];
							CovarianceMatrix[k][i] = c*q+s*CovarianceMatrix[k][j];
							CovarianceMatrix[k][j] = -s*q+c*CovarianceMatrix[k][j];
							r = EigenVectors[k][i];
							EigenVectors[k][i] = c*r+s*EigenVectors[k][j];
							EigenVectors[k][j] = -s*r+c*EigenVectors[k][j];
							};
						}
					else
						m=m+1;
				};
			};
	};
	/* The loop stops either because the last sweep skipped every pair
	   (m == pairs, converged) or because the sweep limit was exceeded.
	   Testing count against limit cannot tell the two apart: count is
	   limit+1 after a failed run and can equal limit after a successful
	   one.  Testing m is exact. */
	if (m < pairs) {
		printf("Eigen system may not have been calculated accurately. That is the algorithm has not converged. \n");
	};
}


/*****************************************************************************/
/*                  PROJECT HIDDEN VECTORS ONTO COMPONENTS                   */
/*****************************************************************************/

/* Return the scalar projection of HiddenVector onto principal component
   number EigenVectorNumber: the dot product of the two over the first
   MaximumColumn elements, divided by the component's length. */
double
Project(HiddenVector, EigenVectorNumber) 
double	HiddenVector[MAXUNITS];
int	EigenVectorNumber;
{
	double	getlen();
	double	dot = 0.0;
	int	unit;

	for (unit = 0; unit < MaximumColumn; unit++)
		dot += HiddenVector[unit] * EigenVectors[unit][EigenVectorNumber];

	return dot / getlen(EigenVectorNumber);
}

/* Return the Euclidean length of principal component number
   EigenVectorNumber, measured over its first MaximumColumn elements. */
double
getlen(EigenVectorNumber) 
    int	EigenVectorNumber;
{
    double	sumsq = 0.0;
    double	element;
    int	row;

    for (row = 0; row < MaximumColumn; row++) {
	    element = EigenVectors[row][EigenVectorNumber];
	    sumsq += element * element;
    }
    return(sqrt(sumsq));
}

ProjectHiddenUnitPatterns()
/* Rewind HiddenUnitFile and, for each pattern (line) in it, print its
   projection onto the components selected with -x, -y and -z, followed by
   the next label from LabelsFile in double quotes when -l was given.

   Rows shorter than the component vectors are padded with zeros before
   being projected.  Lines holding no numbers produce no output and use up
   no label; a last line without a trailing newline is projected like any
   other.

   Exits with status -1 on a malformed number, on a row that is too wide,
   or when the labels run out.
*/
{
	double	Number;
	int	col, i, status;
	double	HiddenVector[MAXUNITS];
	char	label[100];

	rewind(HiddenUnitFile);

	while (lookahead(HiddenUnitFile) != EOF){
		col = 0;

		/* soak up leading blanks so that a whitespace-only line is seen
		   as empty instead of letting fscanf run on into the next line */
		while (lookahead(HiddenUnitFile) == ' ' || lookahead(HiddenUnitFile) == '\t')
			getc(HiddenUnitFile);

		while ((lookahead(HiddenUnitFile) != EOF) && (lookahead(HiddenUnitFile) != nl)) {
			if (fscanf(HiddenUnitFile, "%lg", &Number) != 1){
				printf("Error in the input file. Note if no input file was specified this means there is an error in the hidden units file.\n");
				exit(-1);
				};
			HiddenVector[col] = Number;
			col = col + 1;
			if (col >= (MAXUNITS-1)){
				printf("pca can handle a maximum of %d hidden units.\n", MAXUNITS);
				exit(-1);
				};
			/* soak up non newline whitespace */
			while (lookahead(HiddenUnitFile) == ' ' || lookahead(HiddenUnitFile) == '\t')
				getc(HiddenUnitFile);
		};

		if (lookahead(HiddenUnitFile) == nl)
			getc(HiddenUnitFile); /* soak up nl */

		if (col == 0)
			continue;	/* no numbers on this line: nothing to project */

		/* Project() reads MaximumColumn elements; clear the ones this row
		   did not supply so no stale or indeterminate value is used */
		for (i = col; i < MAXUNITS; i++)
			HiddenVector[i] = 0.0;

		if (Xcomponent != 999)
			printf("%g ", Project(HiddenVector, Xcomponent));
		if (Ycomponent != 999)
			printf("%g ", Project(HiddenVector, Ycomponent));
		if (Zcomponent != 999)
			printf("%g ", Project(HiddenVector, Zcomponent));
		if (LabelsFileFlag){
			/* the field width keeps the label inside label[100] */
			status = fscanf(LabelsFile, "%99[^\n]", label);
			if (status == EOF){
				printf("\nNo more labels.\n");
				exit(-1);
				};
			/* an empty line matches nothing and leaves label untouched;
			   treat it as an empty label */
			if (status != 1)
				label[0] = '\0';
			/* drop whatever did not fit, then the newline and any
			   following whitespace (as the old trailing "\n" did) */
			fscanf(LabelsFile, "%*[^\n]");
			fscanf(LabelsFile, " ");
			printf("\"%s\"", label);
			};
		if (Xcomponent != 999 || Ycomponent != 999 || Zcomponent != 999 || LabelsFileFlag)
			printf("\n");
		}

}

/*****************************************************************************/
/*                       PROCESS COMMAND LINE OPTIONS                        */
/*****************************************************************************/

/* Parse the command line (-f, -i, -l, -x, -y, -z; each takes an argument),
   record the results in the corresponding globals, and open the hidden
   unit file and, if requested, the labels file.

   argc, argv - as received by main().

   Prints the usage line to stderr and exits with status 1 on an unknown
   option, a missing option argument, or a component that is not a number.
   Exits with status -1 when -f is missing, a file name does not fit its
   buffer, -y is given without -x, -z is given without -y, or a file
   cannot be opened.

   999 is the "not specified" sentinel for the components, so passing 999
   explicitly is indistinguishable from omitting the option.
*/
ProcessCommandLineOptions(argc, argv)
int argc;
char **argv;
{
	extern char *optarg;
	extern int opterr;
	int c;

	opterr = 0;	/* getopt stays quiet; the usage line is printed below */
	while ((c = getopt(argc, argv, "x:y:z:f:l:i:")) != -1)
	    switch (c) {
	    case 'f':
			HiddenUnitFileFlag++;
			/* "%s" copies at most strlen(optarg) characters, so this
			   check is enough to keep the copy inside the buffer */
			if (strlen(optarg) >= sizeof(HiddenUnitFileName)){
				printf("The file name given with -f is too long.\n");
				exit(-1);
				};
			sscanf(optarg, "%s", HiddenUnitFileName);
			break;
	    case 'i':
			InputFileFlag++;
			if (strlen(optarg) >= sizeof(InputFileName)){
				printf("The file name given with -i is too long.\n");
				exit(-1);
				};
			sscanf(optarg, "%s", InputFileName);
			break;
	    case 'l':
			LabelsFileFlag++;
			if (strlen(optarg) >= sizeof(LabelsFileName)){
				printf("The file name given with -l is too long.\n");
				exit(-1);
				};
			sscanf(optarg, "%s", LabelsFileName);
			break;
	    case 'x': 
			/* a non-numeric argument used to be ignored silently */
			if (sscanf(optarg, "%d", &Xcomponent) != 1)
				ErrorFlag++;
			break;
	    case 'y': 
			if (sscanf(optarg, "%d", &Ycomponent) != 1)
				ErrorFlag++;
			break;
	    case 'z': 
			if (sscanf(optarg, "%d", &Zcomponent) != 1)
				ErrorFlag++;
			break;
	    case '?':
			ErrorFlag++;
			break;
	    };

	if (ErrorFlag) {
	    (void) fprintf(stderr, "pca -f <datafile> -l <labelsfile> -i <inputfile> -x <xcomponent> -y <ycomponent> -z <zcomponent>\n");
	    exit (1);
	};

	if (!HiddenUnitFileFlag){
		printf("No hidden unit file was supplied.\n");
		exit(-1);
		};

	if (Ycomponent != 999 && Xcomponent == 999){
		printf("If Y component is specified X component must be also.\n");
		exit(-1);
		};

	if (Zcomponent != 999 && Ycomponent == 999){
		printf("If Z component is specified then Y component must be also.\n");
		exit(-1);
		};
	HiddenUnitFile = fopen(HiddenUnitFileName, "r");
	if (HiddenUnitFile == NULL){
		printf("%s does not exist.\n", HiddenUnitFileName);
		exit(-1);
		};

	if (LabelsFileFlag){
		LabelsFile = fopen(LabelsFileName, "r");
		if (LabelsFile == NULL){
			printf("%s does not exist.\n", LabelsFileName);
			exit(-1);
			};
		};
}

AreComponentsLegal()
/* Check each component chosen with -x, -y or -z (999 means the option was
   not given) against the range 0 .. MaximumColumn-1; print a message and
   exit with status -1 at the first one that falls outside it. */
{
	int	highest = MaximumColumn - 1;

	if (Xcomponent != 999){
		if (Xcomponent < 0 || Xcomponent > highest){
			printf("X component was not legal. Components should be between 0 and %d.\n", highest);
			exit(-1);
			}
		}

	if (Ycomponent != 999){
		if (Ycomponent < 0 || Ycomponent > highest){
			printf("Y component was not legal. Components should be between 0 and %d.\n", highest);
			exit(-1);
			}
		}

	if (Zcomponent != 999){
		if (Zcomponent < 0 || Zcomponent > highest){
			printf("Z component was not legal. Components should be between 0 and %d.\n", highest);
			exit(-1);
			}
		}
}

/* Program entry point: parse the options, obtain the principal components
   (from <file>.principal_components if it exists, otherwise by computing
   and saving them), validate the requested components, then print the
   projections of the -i file, or of the hidden unit file when -i is
   absent.  Returns 0 on success; the helpers exit on error. */
main(argc, argv)
int	argc;
char	*argv[];
{
	static char	suffix[] = ".principal_components";

	Initialize();

	ProcessCommandLineOptions(argc, argv);

	/* the name buffer is fixed-size; make sure root + suffix + NUL fits
	   before formatting into it */
	if (strlen(HiddenUnitFileName) + strlen(suffix) >= sizeof(EigenVectorsFileName)){
		printf("%s: file name is too long.\n", HiddenUnitFileName);
		exit(-1);
		};
	sprintf(EigenVectorsFileName, "%s%s", HiddenUnitFileName, suffix);

	/* if the principal_components file exists then load those 
	   principal_components else calculate the principal_components.

	   The old test asked access() for write+execute permission (mode 3),
	   treated success as a failure, and otherwise looked at errno.  It
	   only worked because data files are normally not executable.  Ask
	   for existence and look at errno only after a failure. */

	if (access(EigenVectorsFileName, F_OK) == 0)
		LoadEigenVectors();
	else if (errno == ENOENT){
		CalculateCovarianceMatrix();
		CalculateEigenSystem();
		SaveEigenVectors();
		SaveEigenValues();
		}
	else {
		printf("Unable to access %s: %s\n", EigenVectorsFileName, strerror(errno));
		exit(1);
		};

	AreComponentsLegal();

	/* if an alternative input file (other than the original hidden units
	   file) has been given then project the patterns in that file. */

	if (InputFileFlag){
		/* ProcessCommandLineOptions() exits unless this stream was
		   opened, so it is safe to close it before replacing it */
		fclose(HiddenUnitFile);
		HiddenUnitFile = fopen(InputFileName, "r");
		if (HiddenUnitFile == NULL){
			printf("Input file does not exist.\n");
			exit(-1);
			};
		};

	ProjectHiddenUnitPatterns();

	return(0);
}

