#!/usr/local/bin/perl5 -w
#
##########################################################################
# $Id: kMeans.pm,v 1.10 2004/07/09 15:31:06 lw2j Exp $
##########################################################################
# $Log:	kMeans.pm,v $
# Revision 1.10  2004/07/09  15:31:06  lw2j
# Fixed batch updates so non-changing values get handled correctly.
# 
# Revision 1.2  2004/01/09  17:14:55  lw2j
# Should now automatically randomly reseed vanished components.
#
# Revision 1.1  2004/01/08  14:28:45  lw2j
# Initial revision
#
##########################################################################
#
# Purpose:
#   To provide a simple method for multivariate k-Means clustering.
#
##########################################################################
#
# Package Header
#
##########################################################################

package kMeans;
use strict;
use lib '/usr/lw2j/private/Stat/Perl';

require ArrayFoo;
require Exporter;
require Heap;
require util;



##########################################################################

BEGIN {
  use Exporter ();
  use Symbol;
  use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

  $VERSION     = 1.0;
  @ISA         = qw(Exporter);
  @EXPORT      = qw();
  %EXPORT_TAGS = qw();
  @EXPORT_OK   = qw(
    new
    cluster

    set_k
    load_data
    load_data_shallow

    init_means_random
    init_means

    k_iterate
  );

  return 1;
}


sub new($@) {
  my $class = shift;
  my $self  = +{};

  bless $self, $class;

  return $self;
}



# Given data, cluster it using k-means.  A self parameter is
# not mandatory.  'data' should be a reference to an array of
# either scalars (univariate only) or refs to arrays of equal
# length (multidimensional).  'k' is number of clusters to use.
#
# Return value will be two list refs:
#  means (k tuples, each a data instance; either scalar or list ref)
#  assignments (one cluster index per datum; 0..(k-1))
#  covariance matrices
sub cluster($$;$) {
  my $self = shift;
  my $data = shift;
  my $k    = shift;

  if ((ref($self) ne 'kMeans') && (!defined($k))) {
    $k    = $data;
    $data = $self;
    $self = kMeans::new kMeans();
  }

  $self->{'k'} = $k;

  # Not safe for multithreaded use.
  $self->load_data_shallow($data);

  if (!defined($self->init_random())) {
    return undef;
  }

  if (!defined($self->k_iterate())) {
    return undef;
  }

  my $means       = util::deep_copy($self->{'means'});
  my $assignments = util::deep_copy($self->{'assignments'});
  my $covariances = util::deep_copy($self->{'covariances'});

  return ($means, $assignments, $covariances);
}



# If you want to bypass 'cluster', go ahead.
sub set_k($$) {
  my $self = shift;
  my $k    = shift;

  # Don't be absurd.
  ($k == int($k)) || die;

  # Use at least 2...
  ($k > 1) || die;

  $self->{'k'} = $k;
}


# Makes a COPY.
sub load_data($$) {
  my $self = shift;
  my $data = shift;

  my $n = scalar (@$data);
  ($n > 0) || die;
  $self->{'n'} = $n;

  $self->{'data'} = util::deep_copy($data);

  my $d = undef;

  if (ref($data->[0])) {
    $d = scalar @{$data->[0]};
  } else {
    $d = 1;
  }

  $self->{'d'} = $d;
  ($d > 0) || die;
}



# Shallow copy -- duplicate the reference only.
sub load_data_shallow($$) {
  my $self = shift;
  my $data = shift;

  my $n = scalar (@$data);
  ($n > 0) || die;
  $self->{'n'} = $n;

  $self->{'data'} = $data;

  my $d = undef;
  if (ref($data->[0])) {
    $d = scalar @{$data->[0]};
  } else {
    $d = 1;
  }

  $self->{'d'} = $d;
  ($d > 0) || die;
}



# Initialize to 'k' random data items without replacement.
# Data must have been already loaded, k must have been set.
sub init_random($) {
  my $self = shift;

  my $k = $self->{'k'};
  my $data = $self->{'data'};

  defined($k) || die;
  defined($data) || die;

  my %picked = ();
  my $means  = +[];
  my $n      = $self->{'n'};
  my $d      = $self->{'d'};

  ($n == scalar(@$data)) || die;

  # Don't be absurd.  Well, $n == $means is also silly, but possible.
  ($n >= $k) || die;

  my $minheap = Heap::new Heap();

  for (my $i=0; $i < $n; $i++) {
    $minheap->insert(rand(), $i);
  }

  my $prev_mean = undef;

  find_mean:
  for (my $m_idx=0; $m_idx < $k; $m_idx++) {
    my ($pri, $idx) = $minheap->remove();

    if (!(defined($pri) && defined($idx))) {
      return undef;
    }

    my $value = $data->[$idx];

    defined($value) || die;

    # ensure uniqueness
    foreach (my $other_m=0; $other_m < $m_idx; $other_m++) {
      my $dist = 0;

      if ($d == 1) {
        $dist = abs($means->[$other_m] - $value);
      } else {
        for (my $j=0; $j < $d; $j++) {
          $dist += ($means->[$other_m]->[$j] - $value->[$j])**2;
        }
        $dist = sqrt($dist);
      }

      if ($dist < 1e-8) {
        redo find_mean;
      }
    }


    push @$means, util::deep_copy($value);
  }

  (scalar(@$means) == $k) || die;
  $self->{'means'} = $means;
}



# Do so manually.  Either give it a list reference containing the means,
# or list 'em outright.
sub init_means($@) {
  my $self  = shift;
  my $k     = $self->{'k'};
  my @means = @_;

  if (scalar(@means) == 1) {
    # Note: k >= 2.
    @means = @{$means[0]};
  }

  (scalar(@means) == $k) || die;
  $self->{"means"} = +[ util::deep_copy(@means) ];
}



# data, k, means must have already been loaded.
#
# Iterate until assignments converge.
#
# Return list references giving means, assignments, and
# cluster [co]variances.

sub k_iterate($) {
  my $self = shift;
  my $data = $self->{'data'};
  my $d    = $self->{'d'};
  my $n    = $self->{'n'};
  my $k    = $self->{'k'};
  my $means       = $self->{'means'};
  my $assignments = +[ (undef) x $n ];
  my $covariances = +[];

  loop:  {
    my $changed = 0;

    my $new_assignments = util::deep_copy($assignments);

    # Compute assignments.
    for (my $idx=0; $idx < $n; $idx++) {
      my $datum   = $data->[$idx];
      my $dist_best    = undef;
      my $cluster_best = undef;
      my $old_assignment = $assignments->[$idx];

      for (my $c_idx=0; $c_idx < $k; $c_idx++) {
        my $mean = $means->[$c_idx];
        my $dist = undef;
        defined($mean) || next;

        # Euclidean distance.  Do your scaling beforehand, or edit this
        # module to use a different function.

        if (!ref($datum)) {
          $dist = abs($datum - $mean);
        } else {
          my $sum = 0;

          map {
            $sum += ($datum->[$_] - $mean->[$_]) ** 2
          } (0..($d-1));

          $dist = sqrt($sum);
        }

        if ((!defined($dist_best)) || ($dist < $dist_best)) {
          $dist_best    = $dist;
          $cluster_best = $c_idx;
        }
      }


      defined($cluster_best) || die;

      $new_assignments->[$idx] = $cluster_best;
      if ((!defined($old_assignment)) || ($old_assignment != $cluster_best)) {
        # New assignment.
        $changed = 1;
      }

    }

    if (!$changed) {
      last loop;
    }



    {
      # Changed.  Use new means.  Prepare new array.  Batch update.
      $assignments = $new_assignments;

      my @counts = (0) x $k;

      if ($d == 1) {
        my @sums = (0) x $k;
        $means       = +[(0) x $k];

        for (my $i=0; $i < $n; $i++) {
          my $c_idx = $assignments->[$i];
          my $val   = $data->[$i];

          $sums[$c_idx]+=$val;
          $counts[$c_idx]++;
        }

        for (my $c_idx=0; $c_idx < $k; $c_idx++) {
          if ($counts[$c_idx] > 0) {
            $means->[$c_idx] = $sums[$c_idx]/$counts[$c_idx];
          } else {
            return undef;
          }
        }
      } else {
        my @sums = ();
        $means   = +[];

        for (1..$k) {
          push @sums, +[ (0) x $d ];
          push @$means, +[ (0) x $d ];
        }

        for (my $i=0; $i < $n; $i++) {
          my $c_idx = $assignments->[$i];
          my $val   = $data->[$i];

          foreach (0..($d-1)) {
            $sums[$c_idx]->[$_] +=$val->[$_];
          }

          $counts[$c_idx]++;
        }

        for (my $c_idx=0; $c_idx < $k; $c_idx++) {
          if ($counts[$c_idx] > 0) {
            foreach (0..($d-1)) {
              $means->[$c_idx]->[$_] = $sums[$c_idx]->[$_]/$counts[$c_idx];
            }
          } else {
            return undef;
          }
        }
      }
    }
    redo loop;
  }



  {
    # Compute covariance matrices.  First, partition data according
    # to assignments.

    my @clusters = ();

    foreach (1..$k) {
      push @clusters, +[];
    }

    for (my $i=0; $i < $n; $i++) {
      my $datum = $data->[$i];
      my $assign = $assignments->[$i];

      push @{$clusters[$assign]}, $datum;
    }

    for (my $i=0; $i < $k; $i++) {
      my @cluster = @{$clusters[$i]};
      my $nc      = scalar @cluster;

      if ($nc == 0) {
        die;
      } elsif ($nc == 1) {
        if ($d == 1) {
          $covariances->[$i] = 1e-4;
        } else {
          $covariances->[$i] = Numerical::matrix_new($d, $d);
          for (0..($d-1)) {
            $covariances->[$i]->[$_]->[$_] = 1e-4;
          }
        }
      } else {
        my ($mean, $covariance) = SEM::normal_est(\@cluster);
        defined($mean) || die;
        if ($d == 1) {
          # Sanity check:  mean should match.
          (abs($mean - $means->[$i]) <= 1e-5) || die;
          $covariances->[$i] = $covariance;
        } else {
          # Verify mean again.
          for (0..($d-1)) {
            (abs($mean->[$_] - $means->[$i]->[$_]) <= 1e-5) || die;
          }
          $covariances->[$i] = $covariance;
        }
      }
    }
  }

  $self->{'means'}       = $means;
  $self->{'assignments'} = $assignments;
  $self->{'covariances'} = $covariances;

  return ($means, $assignments, $covariances);
}


return 1;
