#!/usr/bin/perl

BEGIN {
    # This block runs at compile time, i.e. before the `use Travertine`
    # line further down is processed.  Some CMU machines still only speak
    # SSH protocol 1, so both versions are allowed ("2,1" is presumably a
    # preference order -- confirm against Travertine).
    $ENV{TRAVERTINE_SSHVERSION} = '2,1';

    # Debugging aid: uncomment to set TRAVERTINE_PRINTCMDS.
    #$ENV{TRAVERTINE_PRINTCMDS} = 1;
}

use strict;
use Getopt::Std;
use vars qw($opt_T $opt_I $opt_k $opt_o $opt_R);
use lib "./planetlab";
use Travertine;
use PlanetLabConf;

# Parse the leading command-line switches: -R, -o, -T and -I take a value,
# -k is a plain flag.  Results land in the $opt_* package variables.
getopts("R:o:T:I:k");

# Fixed tunables.  The first two are interpolated into the RunV2.pl option
# string built further down; the third disables the join poll when true.
our $MAX_PARALLEL         = 10;
our $LOGROTATELINES       = 1000000;
our $SKIP_BOOTSTRAP_CHECK = 1;

# Upper bound (roughly in seconds) on how long the join poll near the end of
# this script keeps retrying; overridable with -T.
our $WAITTIME = 600;
$WAITTIME = $opt_T if defined $opt_T;

# Comma-separated host names to leave out of the run (-I); empty by default.
our $IGNORES = "";
$IGNORES = $opt_I if defined $opt_I;

# True when -k was given.
our $KILL = defined($opt_k);

# An application config argument is mandatory.  Without one, describe the
# command line on STDERR and exit with status 1.
#
# The option letters and their arguments mirror the getopts() spec
# "R:o:T:I:k": every letter followed by a colon takes a value, so -T is
# shown with its argument here.
if (@ARGV < 1) {
    print STDERR <<'USAGE';
usage: PlanetLabRun.pl [options] <app_config> [app_options]

       ([app_options] are transparently passed to RunV2.pl)

       -o file  experiment file (default /tmp/plab.exp)
       -T secs  time to wait for everybody to join (default 600)
       -I list  comma-separated list of to-be ignored nodes
       -R list  comma-separated list of nodes to run on, in place of hosts.txt (reboot)
       -k       kill experiment
USAGE
    exit 1;
}

# The first positional argument names the application config; everything
# after it is forwarded, space-joined, to RunV2.pl.
our $app_config = shift @ARGV;
die unless defined $app_config;
our $app_args = join(" ", @ARGV);

# Option string shared by every RunV2.pl invocation in this script.  The
# pieces are concatenated as-is, so the spacing inside them is significant.
our $args = join("",
    " -B -t $TOPDIR -l $LOGDIR -L -A -v 1 -P $MAX_PARALLEL ",
    " --mercports $MERCPORT --termpass '$TERMPASS'",
    " --outlogrot $LOGROTATELINES $app_args");

# Hosts to run on.  With -R the comma-separated list given on the command
# line is used verbatim; otherwise the list comes from GetHosts()
# (presumably hosts.txt -- see the usage text) with the bootstrap node
# placed first.
our @hosts;

if ($opt_R) {
    $SKIP_BOOTSTRAP_CHECK = 1;
    @hosts = split(/,/, $opt_R);
}
else {
    @hosts = ($BOOTSTRAP, GetHosts());
}
my $num_nodes = scalar(@hosts);

# Nothing to do without at least one host.
tdie("not enough hosts! (" . scalar(@hosts) . ")") unless @hosts;

our @logins;
our @logins_plain;

# Lookup set of the host names excluded via -I.
my %IGNORES;
$IGNORES{$_} = 1 foreach split(/,/, $IGNORES);

# Build two parallel views of every host that is not ignored:
#   @logins        "user@host:host" strings, later passed to RunV2.pl
#   @logins_plain  [user, host] pairs
foreach my $host (@hosts) {
    next if $IGNORES{$host};

    push @logins, $USERNAME . '@' . $host . ':' . $host;
    push @logins_plain, [ "$USERNAME", "$host" ];
}

###############################################################################

# -k was given: invoke RunV2.pl with -k against the selected logins and stop
# here instead of starting a new experiment.
if ($KILL) {
    tinfo "* killing experiment";
    my $targets = join(" ", @logins);
    psystem("./RunV2.pl $app_config $args -k $targets");
    exit 0;
}

###############################################################################

# File that receives RunV2.pl's standard output: -o, or the default path.
my $exp = $opt_o || "/tmp/plab.exp";

## Start the experiment
tinfo "* starting experiment";
{
    # Build the command line once so that what is logged is exactly what
    # gets executed.
    my $launch = "./RunV2.pl $app_config $args " . join(" ", @logins) . " > $exp";
    tinfo($launch);
    psystem($launch);
}

## Wait for experiment to start
tinfo "* experiment file is: $exp";

## Unless the check is disabled, poll the first host (the bootstrap node
## when the host list was not given with -R) until it reports that all
## nodes have joined, giving up after about $WAITTIME seconds.
if (!$SKIP_BOOTSTRAP_CHECK) {
    # $ev doubles as status and counter: 0 means "all joined"; any other
    # value means failure.  It starts at 1 and is bumped on every
    # "giving up" reply, so the loop bails out after the ninth such reply.
    my $ev = 1;
    my $period = 10;    # seconds slept between two polls

    for (my $i = 0; $i < $WAITTIME; $i += $period) {
        # Run chkjoin2 on the first host; only its stdout is inspected.
        my $out = `ssh -n $USERNAME\@$hosts[0] LD_LIBRARY_PATH=$TOPDIR/Merc/ $TOPDIR/Merc/build/chkjoin2 --bsaddr localhost:15000 2>/dev/null`;
        chomp $out;

        if ($out =~ /all joined/) {
            $ev = 0;
            last;
        }
        elsif ($out =~ /giving up/) {
            twarn "looks like bootstrap is dead or stuck";
            $ev++;
            last if ($ev >= 10);
        }
        else {
            tinfo "waiting for joining";
        }
        sleep($period);
    }

    if ($ev != 0) {
        twarn "ERROR in joining -- all nodes do not seem to have joined!";
        # This script defines no `cleanup` label, so jumping to one would
        # only abort with "Can't find label cleanup".  Report the failure
        # through the exit status instead.  The experiment itself is not
        # torn down here; re-run with -k to kill it.
        exit 1;
    }
    else {
        tinfo "all nodes joined SUCCESSFULLY! starting timer...";
    }
}

tinfo "* experiment started! exp file in $exp";
exit 0;

###############################################################################
