#!/usr/bin/perl

use strict;

# number of subs to examine in the trace
my $NUM_GROUPS  = 1500000;
# number of clusters to start off with (max)
my $NUM_CLUSTERS = 100;
# number of sets to output
my $NUM_SETS = 10;

# min size of initial groups
my $INIT_SIZE_MIN_THRESH = 18-8; # avg - stddev
# max size of initial groups
my $INIT_SIZE_MAX_THRESH = 18+8; # avg + stddev

# max amount of overlap on all existing sets for picking an initial cluster
my $INIT_OVERLAP_TOTAL_THRESH = 4;
# max amount of overlap on any one existing set for picking an initial cluster
my $INIT_OVERLAP_MAX_THRESH = 1;

# list of each sub group
my @subsets;

# stats
my $total    = 0;
my $count    = 0;
my $max_size = 0;
my $min_size = 1e100;

$| = 1;

##### load
print "loading: ";
open(IN, "<$ARGV[0]") || die $!;
while (<IN>) {
    if (/^\d+ sub \d+ \d+ (.*)$/) {
	my @nodes = split(/ ?1 node == int /, $1);
	shift @nodes; # first elt is null in split
	#foreach my $n (@nodes) {
	#    if (!$n) {
	#	print "$_: $n\n"; exit 0;
	#    }
	#}
	push @subsets, \@nodes;
	$total += scalar(@nodes);
	if (scalar(@nodes) > $max_size) {
	    $max_size = scalar(@nodes);
	}
	if (scalar(@nodes) < $min_size) {
	    $min_size = scalar(@nodes);
	}
	$count++;
	print "." if ($count % int($NUM_GROUPS/100) == 0);
    }

    last if $count > $NUM_GROUPS;
}
print "\n";

print "\nstats:\n";

my $avg_size = $total/$count;
my $stddev   = 0;

for (my $i=0; $i<@subsets; $i++) {
    $stddev += (scalar(@{$subsets[$i]})-$avg_size)**2;
}
$stddev = sqrt($stddev/$count);

print "avg: $avg_size $stddev\n";
print "min: $min_size\n";
print "max: $max_size\n";

@subsets = sort { -(scalar(@{$a}) <=> scalar(@{$b})) } @subsets;

# ( { nodes in region i }, ... )
my @regions;
# ( node => { region this node is in } )
my %nodemap;

my $count = 0;

##### init clusters
print "finding initial clusers: ";
for (my $i=0; $i<@subsets; $i++) {
    my @set = @{$subsets[$i]};
    if (scalar(@set) >= $INIT_SIZE_MIN_THRESH &&
	scalar(@set) <= $INIT_SIZE_MAX_THRESH) {
	my ($count_total, $count_max, $argmax) = overlap(\@set);
	if ($count_total <= $INIT_OVERLAP_TOTAL_THRESH &&
	    $count_max   <= $INIT_OVERLAP_MAX_THRESH) {
	    my $region = make_region(\@set);
	    push @regions, $region;
	    foreach my $elt (@set) {
		add_to_region($elt, $region);
	    }
	}
    }

    if (scalar(@regions) >= $NUM_CLUSTERS) {
	last;
    }

    print "." if ($count++ % int($NUM_GROUPS/100) == 0);
}
print "\n";

print "\ninit_regions:\n";
print_regions();

my $count = 0;

##### grow
print "\ngrowing clusters: ";
for (my $i=0; $i<@subsets; $i++) {
    print "." if ($count++ % int($NUM_GROUPS/100) == 0);

    my @set = @{$subsets[$i]};
    my ($count_total, $count_max, $argmax) = overlap(\@set);

    next if @set == 0;

    if ($count_max == 0) {
	# doesn't overlap any region... probably won't happen
	# and if it does this is some tiny crevace somewhere which
	# no one ever goes to... add it as a new cluster
	my $region = make_region(\@set);
	push @regions, $region;
	foreach my $elt (@set) {
	    add_to_region($elt, $region);
	}
	next;
    }

    # greedily grow a region by adding a subset to it if it matches the
    # max number of existing nodes in that region (but do not modify 
    # existing regions.
    foreach my $elt (@set) {
	if (!defined $nodemap{$elt}) {
	    add_to_region($elt, $argmax);
	}
    }
}
print "\n";

print "\nfinal_regions:\n";
print_regions();

my @assignments;
for (my $i=0; $i<$NUM_SETS; $i++) {
    $assignments[$i] = [];
}

##### assign to servers
print "\nassigning regions: ";
for (my $i=0; $i<@regions; $i++) {
    my $index = argmin(\@assignments);
    my @nodes = keys %{$regions[$i]};
    $assignments[$index] = [@{$assignments[$index]}, @nodes];
}

$count = 0;

foreach my $a (@assignments) {
    my @elts = @{$a};
    my $size = scalar(@elts);
    print "$count\[$size\]: "; $count++;
    foreach my $elt (@elts) {
	print "$elt ";
    }
    print "\n";
}

open(OUT, ">region.map") || die $!;
$count = 0;
foreach my $a (@assignments) {
    my @elts = @{$a};
    foreach my $elt (@elts) {
	print OUT "$elt $count\n";
    }
    $count++;
}
close(OUT);

###############################################################################

# converts array into hash
sub make_region {
    my @set = @{shift(@_)};
    my %hash;
    foreach my $elt (@set) {
	$hash{$elt} = 1;
    }

    return \%hash;
}

# ( number of nodes in @set that we already have mapped to a region,
#   max mapped to the same region, the region the max mapped to )
sub overlap {
    my @set = @{shift(@_)};
    my $count  = 0;
    my %counts = ();
    my $max    = 0;
    my $argmax = undef;
    foreach my $elt (@set) {
	if (defined $nodemap{$elt}) {
	    $count++;
	    if (++$counts{$nodemap{$elt}} > $max) {
		$max = $counts{$nodemap{$elt}};
		$argmax = $nodemap{$elt};
	    }
	}
    }
    return ( $count, $max, $argmax );
}

# add or move $node to @region
sub add_to_region {
    my $node = shift;
    my $region = shift;

    if (defined $nodemap{$node}) {
	delete $nodemap{$node}->{$node};
    }
    $nodemap{$node} = $region;
    if (! $region->{$node} ) {
	$region->{$node} = 1;
    }
}

# get the index with min array size
sub argmin {
    my @arrays = @{shift(@_)};
    my $min = 1e100;
    my $argmin = -1;
    for (my $i=0; $i<@arrays; $i++) {
	my $size = scalar( @{$arrays[$i]} );
	if ($size < $min) {
	    $min = $size;
	    $argmin = $i;
	}
    }

    return $argmin;
}

# print the current region clusters
sub print_regions {
    my $count = 0;
    foreach my $region (@regions) {
	my @elts = keys %{$region};
	my $size = scalar(@elts);
	print "$count\[$size\]: "; $count++;
	foreach my $elt (@elts) {
	    print "$elt ";
	}
	print "\n";
    }
}
