
/****************************************************************/
/* Copyright 1993 : Johns Hopkins University			*/
/*                  Department of Computer Science		*/
/****************************************************************/
/* Contact : murthy@cs.jhu.edu					*/
/****************************************************************/
/* File Name : impurity_measures.c				*/
/* Author : Sreerama K. Murthy					*/
/* Last modified : October 1993					*/
/* Contains modules :	maxminority				*/
/*			summinority				*/
/*			sum_of_impurities			*/
/*			info_gain				*/
/*			gini_index				*/
/*			twoing					*/
/* Uses modules in : 	compute_impurity.c			*/
/*			oc1.h					*/
/* Is used by modules in :	compute_impurity.c		*/
/* Remarks       :	If the user wants to "plug-in" a new	*/
/*			impurity measure, that function should	*/
/*			be written on the lines of the functions*/
/*			in this file. It is also advisable that	*/
/*			such new functions be concatenated to	*/
/*			this file, so that the user needn't 	*/
/*			bother with the global declarations and	*/
/*			changing the makefile.			*/
/****************************************************************/		

#include "oc1.h"

extern int no_of_dimensions;
extern int *left_count,*right_count;
extern int no_of_categories;

int largest_element();


/************************************************************************/
/* Module name :	maxminority					*/ 
/* Functionality :	Computes the impurity of the current hypeplane	*/
/*			location. 				 	*/
/* Parameters : none.							*/
/* Returns :	larger of the minority values on the left and right.	*/
/* 		(minority on a side = sum of the counts of all classes	*/
/*		except the class with the highest count.)		*/
/* Calls modules :	largest_element (compute_impurity.c)		*/
/* Is called by modules :	compute_impurity (compute_impurity.c)	*/
/* Important Variables used :	left_count,right_count : arrays holding	*/
/*				the number of points of each category on*/
/*				the left and right of the hyperplane	*/
/*				under consideration, respectively.	*/
/*				These implicitly define the location of	*/
/*				the hyperplane.				*/
/* Remarks :	It is assumed that "left_count" and "right_count" are	*/
/*		set properly before this module is invoked.		*/
/************************************************************************/
float maxminority()
{
 int i,j,lminor=0,rminor=0;

 i = largest_element(left_count,no_of_categories);
 if (i <= no_of_categories)
  {
   for (j=1;j<=no_of_categories ;j++) 
     if (i != j) lminor += left_count[j];
  }


 i = largest_element(right_count,no_of_categories);
 if (i <= no_of_categories)
  {
   for (j=1;j<=no_of_categories ;j++) 
     if (i != j) rminor += right_count[j];
  }

 if (lminor > rminor) 
    return((float)lminor); 
 else return((float)rminor);
}

/************************************************************************/
/* Module name :	summinority					*/ 
/* Functionality :	Computes the impurity of the current hypeplane	*/
/*			location. 				 	*/
/* Parameters : none.							*/
/* Returns :	sum of the minority values on the left and right.	*/
/* 		(minority on a side = sum of the counts of all classes	*/
/*		except the class with the highest count.)		*/
/* Calls modules :	largest_element (compute_impurity.c)		*/
/* Is called by modules :	compute_impurity (compute_impurity.c)	*/
/* Important Variables used :	left_count,right_count : arrays holding	*/
/*				the number of points of each category on*/
/*				the left and right of the hyperplane	*/
/*				under consideration, respectively.	*/
/*				These implicitly define the location of	*/
/*				the hyperplane.				*/
/* Remarks :	It is assumed that "left_count" and "right_count" are	*/
/*		set properly before this module is invoked.		*/
/************************************************************************/
float summinority()
{
 int i,j,lminor=0,rminor=0;

 i = largest_element(left_count,no_of_categories);
 if (i <= no_of_categories)
  {
   for (j=1;j<=no_of_categories ;j++) 
     if (i != j) lminor += left_count[j];
  }


 i = largest_element(right_count,no_of_categories);
 if (i <= no_of_categories)
  {
   for (j=1;j<=no_of_categories ;j++) 
     if (i != j) rminor += right_count[j];
  }


 return((float)(lminor+rminor)); 
}

/************************************************************************/
/* Module name :	sum_of_impurities				*/ 
/* Functionality :	Computes the impurity of the current hypeplane	*/
/*			location. 				 	*/
/* Parameters : none.							*/
/* Returns :	sum of the impurity values on the left and right.	*/
/* 		Impurity on a side = sigma  ((xi - avg)*(xi - avg))	*/
/*				     i=1..k				*/
/* 		where x1,..xk are the classes (categories) of the	*/
/*		points on that side of the hyperplane, and		*/
/*		avg = (x1+x2+..+xk)/k.					*/
/* Calls modules :	none.						*/
/* Is called by modules :	compute_impurity (compute_impurity.c)	*/
/* Important Variables used :	left_count,right_count : arrays holding	*/
/*				the number of points of each category on*/
/*				the left and right of the hyperplane	*/
/*				under consideration, respectively.	*/
/*				These implicitly define the location of	*/
/*				the hyperplane.				*/
/* Remarks :	It is assumed that "left_count" and "right_count" are	*/
/*		set properly before this module is invoked.		*/
/************************************************************************/
float sum_of_impurities()
{
 float lavg=0,ravg=0,lerror = 0,rerror = 0;
 int i,lsum1=0,rsum1=0,lsum2=0,rsum2=0;
 int *temp1=NULL,*temp2=NULL;
 static int si_compare();

 if (no_of_categories > 2)
   /* Renumber categories in descending order of their proportion of
      occurance. This removes the possibility for a biased impurity
      estimate. */
  {
    temp1 = ivector(1,no_of_categories);
    temp2 = ivector(1,no_of_categories);
    for (i=1;i<=no_of_categories;i++)
     {
      temp1[i] = left_count[i];
      temp2[i] = right_count[i];
     } 
    qsort((char *)(left_count+1),no_of_categories,sizeof(int),si_compare);
    qsort((char *)(right_count+1),no_of_categories,sizeof(int),si_compare);
  }
     
 for (i=1;i<=no_of_categories;i++)
  { lsum1 += left_count[i]; 
    lsum2 += i * left_count[i];
    rsum1 += right_count[i]; 
    rsum2 += i * right_count[i];
  }

 if (lsum1 != 0) lavg = (float)lsum2/lsum1;
 if (rsum1 != 0) ravg = (float)rsum2/rsum1;

 for (i=1;i<=no_of_categories;i++)
  {
   lerror += left_count[i] * (i - lavg) * (i - lavg);
   rerror += right_count[i] * (i - ravg) * (i - ravg);
  }

 if (no_of_categories > 2)
    /* Restore original left_count and right_count arrays.
       Remember, they are read_only. */
  {
    for (i=1;i<=no_of_categories;i++)
     {
      left_count[i] = temp1[i];
      right_count[i] = temp2[i];
     } 
    free_ivector(temp1,1,no_of_categories);
    free_ivector(temp2,1,no_of_categories);
  }

 return ((float)(lerror+rerror));

}

/************************************************************************/
static int si_compare(p1,p2)
int *p1,*p2;
{
 int p;

 p = *p1-*p2;
 return(p);
}

/************************************************************************/
/* Module name : info_gain						*/ 
/* Functionality :	Computes the (Quinlan's) information gain of 	*/
/*			the current split. As OC1 tries to minimize the	*/
/*			"impurity" instead of maximizing the information*/
/*			"gain", this module returns the reciprocal of 	*/
/*			the computed gain.				*/
/* Parameters : None.							*/
/* Returns : 	Reciprocal of the information gain.			*/
/* Calls modules :	None.						*/
/* Is called by modules :	compute_impurity (compute_impurity.c)	*/
/* Important Variables used :	left_count,right_count : arrays holding	*/
/*				the number of points of each category on*/
/*				the left and right of the hyperplane	*/
/*				under consideration, respectively.	*/
/*				These implicitly define the location of	*/
/*				the hyperplane.				*/
/* Remarks :	It is assumed that "left_count" and "right_count" are	*/
/*		set properly before this module is invoked.		*/
/************************************************************************/
float info_gain()
{
 float presplit_info=0,postsplit_info=0,left_info=0,right_info=0;
 float ratio,infogain;
 int i,total_count=0,total_left_count=0,total_right_count=0;
 double log2();

 for (i = 1;i<=no_of_categories;i++) 
  {
   total_left_count += left_count[i];
   total_right_count += right_count[i];
  }
 total_count = total_left_count + total_right_count;


 if (total_count)
  for (i = 1;i<=no_of_categories;i++)
  {
   ratio = (float)(left_count[i]+right_count[i])/total_count;
   if (ratio) presplit_info += -1.0 * ratio * (float)log2((double)ratio);
  }

 if (total_left_count)
 {
  for (i = 1;i<=no_of_categories;i++)
  {
   ratio = (float)left_count[i]/total_left_count;
   if (ratio) left_info += -1.0 * ratio * (float)log2((double)ratio);
  }
  postsplit_info += total_left_count * left_info / total_count;
 }

 if (total_right_count)
 {
  for (i = 1;i<=no_of_categories;i++)
  {
   ratio = (float)right_count[i]/total_right_count;
   if (ratio) right_info += -1.0 * ratio * (float)log2((double)ratio);
  }
  postsplit_info += total_right_count * right_info / total_count;
 }

 infogain = presplit_info - postsplit_info;

 if (infogain == 0) /*No information gained due to this split.
                      i.e., Either the region is homogenous or impurity 
                      is as large as it can be. */
   {
    for (i=1;i<=no_of_categories;i++)
      if (left_count[i] + right_count[i] == total_count) return(0);
    return(HUGE);
   }
 else return(1.0/infogain);
}

/************************************************************************/
/* Module name : gini_index						*/ 
/* Functionality :	Computes the gini_index of a node. This stati-	*/
/*			stical measure was described in the context of 	*/
/*			decision trees by Leo Breiman et al in the CART	*/
/*			book.						*/
/* Parameters : None.							*/
/* Returns :	A floating point number, representing the impurity of	*/
/*		a node.							*/
/* Calls modules : None.						*/
/* Is called by modules :	compute_impurity (compute_impurity.c)	*/
/************************************************************************/
float gini_index()
{
 float total_left_count=0,total_right_count=0;
 float total_left_count2,total_right_count2;
 float gini_left=0,gini_right=0;
 int i,j;

 for (i=1;i<=no_of_categories;i++)
  {
   total_left_count += left_count[i];
   total_right_count += right_count[i];
  }

 total_left_count2 = total_left_count * total_left_count;
 total_right_count2 = total_right_count * total_right_count;

 if (total_left_count2)
   for (i=1;i<no_of_categories;i++)
    for (j=i+1;j<=no_of_categories;j++)
      gini_left += left_count[i]*left_count[j]/total_left_count2;

 if (total_right_count2)
   for (i=1;i<no_of_categories;i++)
    for (j=i+1;j<=no_of_categories;j++)
      gini_right += right_count[i]*right_count[j]/total_right_count2;

 return(gini_left+gini_right);
}
 
/************************************************************************/
/* Module name : twoing							*/ 
/* Functionality :	Computes, by twoing rule, the goodness of a 	*/
/*			node. This measure is described in detail  	*/
/*			by Leo Breiman et al in the CART book.		*/
/* Parameters : None.							*/
/* Returns :	A floating point number, representing the impurity of	*/
/*		a node.							*/
/* Calls modules : None.						*/
/* Is called by modules :	compute_impurity (compute_impurity.c)	*/
/* Remarks : 	As twoing rule, in a sense, measures the goodness of	*/
/*		a split, this routine returns the inverse of the	*/
/*		calculated number.					*/
/************************************************************************/
float twoing()
{
 float total_left_count=0,total_right_count=0,total_count;
 float goodness=0,temp;
 int i;

 for (i=1;i<=no_of_categories;i++)
  {
   total_left_count += left_count[i];
   total_right_count += right_count[i];
  }

 total_count = total_left_count + total_right_count;
 if (!total_count) return(HUGE);

 for (i=1;i<=no_of_categories;i++)
  {
   temp = 0;
   if (total_left_count) temp = left_count[i]/total_left_count;
   if (total_right_count) temp -= right_count[i]/total_right_count;

   if (temp < 0) goodness += -1.0 * temp;
   else goodness += temp;
  }

 total_left_count /= total_count;
 total_right_count /= total_count;

 return( 4 / (total_left_count * total_right_count * goodness * goodness));
} 
   
/************************************************************************/
/************************************************************************/
