#!/usr/bin/perl

BEGIN {
    # This runs at compile time, before the "use Travertine" line below is
    # processed.  Both ssh protocol versions are allowed because the CMU
    # machines still speak ssh v1.
    # NOTE(review): presumably Travertine reads this variable when it builds
    # its ssh command lines -- confirm in Travertine.pm.
    $ENV{'TRAVERTINE_SSHVERSION'} = '1,2';

    # Debugging aid, disabled by default.
    #$ENV{'TRAVERTINE_PRINTCMDS'} = 1;
}

use strict;
use Getopt::Std;
use vars qw($opt_v $opt_x $opt_g $opt_o $opt_t $opt_l $opt_T $opt_I $opt_N $opt_c $opt_P $opt_E $opt_C $opt_d $opt_k $opt_s $opt_S);
use lib "./iris";
use Travertine;
use LocalConf;

# Handed to RunV2.pl as its -P argument when $args is assembled below.
our $MAX_PARALLEL = 20;

# Parse this script's own switches; whatever remains in @ARGV afterwards is
# treated as <app_config> <num_nodes> [app_options].
getopts("v:t:T:o:xgI:cPE:CdklsBS:");

# -v: virtual servers per node; a missing (or zero) value means 1.
our $VSERVERS = $opt_v || 1;

## where should this be, really?? we need it here for "sleeping"
## for the appropriate amount of time
# NOTE(review): neither time limit is referenced again in the visible part of
# this file.
our $TIMELIM  = 30 * 24 * 3600;   # huge time (seconds)
our $TIMELIMM = $TIMELIM * 1000;  # the same, in msec

# -T: how long to wait for everybody to join; 10 minutes unless overridden.
our $WAITTIME = 600;
$WAITTIME = $opt_T if defined $opt_T;

# -I: comma-separated list of hosts to skip.
our $IGNORES = "";
$IGNORES = $opt_I if defined $opt_I;

# -c: caching stays enabled unless this switch is given.
our $DISABLE_CACHING = 0;
$DISABLE_CACHING = 1 if defined $opt_c;

# Plain on/off switches.
our $PAUSE_FOREVER   = $opt_P;           # -P: never kill the experiment
our $DELETE_OLD_LOGS = !defined $opt_d;  # -d: keep old logs in the save dir
our $CHECK_ONLY      = defined $opt_C;   # -C: only report experiment status
our $KILL            = defined $opt_k;   # -k: kill the experiment
our $GET_LOGS        = defined $opt_l;   # -l: retrieve logs

# -S: rsync destination for the generated status page.
our $STATUS_LOGIN = "$USERNAME\@gs203.sp.cs.cmu.edu:~";
$STATUS_LOGIN = $opt_S if defined $opt_S;

# -o: where the collected logs are pushed to.
our $SAVEDIR = $opt_o || "$PUSHLOGHOST:$PUSHLOGDIR";

# Both positional arguments (<app_config> and <num_nodes>) are mandatory;
# without them, print the usage summary on STDERR and exit with status 1.
# The -S switch is parsed by getopts and used as the rsync target for the
# status page, so it is listed here alongside the other options.
if (@ARGV < 2) {
    print STDERR
        "usage: LocalRun.pl [options] <app_config> <num_nodes> [app_options]\n\n",
        "       ([app_options] are transparently passed to RunV2.pl)\n",
        "       \n",
        "       NOTE!!!! -t <time> and the timelimit for the actual app\n",
        "                have to be correlated!!!!!!!\n",
        "       -x       invoke xterms\n",
        "       -g       invoke gterm\n",
        "       -P       pause forever after execution\n",
        "       -t       time to run for (seconds)\n",
        "       -T       time to wait for everybody to join\n",
        "       -v num   number of virtual servers per node\n",
        "       -o login [user\@]host:/dir to push logs to\n",
        "       -I list  comma-separated list of to-be ignored nodes\n",
        "       -c       DISABLE caching (default enabled)\n",
        "       -E       mode {gdb, valgrind, valgrindmem}\n",
        "       -d       don't delete any old logs on remote mach before rsync\n",
        "       -C       only check the status of the experiments\n",
        "       -s       run the program with sudo\n",
        "       -k       kill experiment\n",
        "       -l       retrieve logs\n",
        "       -S dest  rsync destination for the status page\n";
    exit 1;
}

# Take the two mandatory positional arguments off the front of @ARGV; the
# remainder is forwarded verbatim to RunV2.pl.
our ($app_config, $num_nodes) = splice(@ARGV, 0, 2);
our $app_args = join " ", @ARGV;

unless (defined $app_config and defined $num_nodes) {
    die;
}

# Arguments that every RunV2.pl invocation in this script receives.
our $args = " -t $TOPDIR -l $LOGDIR -L -v $VSERVERS -P $MAX_PARALLEL $app_args"
          . " --mercports $MERCPORT --termpass '$TERMPASS' ";

# Switches that are forwarded only when the matching condition is true,
# appended in this order.
my @optional_flags = (
    [ $DISABLE_CACHING, " -c " ],
    [ $opt_x,           " -x " ],
    [ $opt_g,           " -g " ],
    [ $opt_E,           " -E '$opt_E' " ],
    [ $opt_s,           " -s " ],
);
for my $flag (@optional_flags) {
    my ($enabled, $text) = @$flag;
    $args .= $text if $enabled;
}

# Candidate machines, as reported by GetHosts().
our @hosts = GetHosts();

my $available = scalar @hosts;
if ($num_nodes > $available) {
    tdie "not enough hosts in host file! ($available < $num_nodes)";
}

our @logins;         # "user@host:host" strings handed to RunV2.pl
our @logins_plain;   # [user, host] pairs

# Set of host names excluded via -I.
my %IGNORES;
for my $ignored (split(/,/, $IGNORES)) {
    $IGNORES{$ignored} = 1;
}

# Walk the host list in order, skipping ignored hosts.  Normally stop once
# $num_nodes logins are collected; when killing, keep going so that every
# non-ignored host is included.
for my $name (@hosts) {
    next if $IGNORES{$name};

    push @logins, "$USERNAME\@${name}:${name}";
    push @logins_plain, ["$USERNAME", "$name"];

    if (!$KILL && @logins >= $num_nodes) {
        last;
    }
}

# Split the log destination into login, host and directory.  The usage text
# advertises the form "[user@]host:/dir", so the user part is optional here;
# when it is absent we fall back to $USERNAME, the account this script uses
# for all of its other logins.
my ($login, $host, $dir) = ($SAVEDIR =~ /^(?:([\w.-]+)\@)?([^:\@]+):(.*)$/);
if (!defined $login || $login eq "") {
    $login = $USERNAME;
}
if (!$login || !$host || !$dir) {
    tdie "invalid savedir: $SAVEDIR";
}

# Make sure the destination directory exists.  The path is passed to the
# callback as an argument, like in the other rsystem calls in this file.
# It is deliberately left unquoted so a leading "~" still expands.
rsystem($login, $host, sub {
    my $dir = shift;
    psystem ("mkdir -p $dir");
}, $dir);

###############################################################################

# -C: report the status of the experiment and do nothing else.
if ($CHECK_ONLY) {
    CheckStatus();
    exit 0;
}

# -k: kill the experiment on the selected hosts.  With -l as well, continue
# at the log-collection step further down; otherwise we are done.
if ($KILL) {
    tinfo "* killing experiment";
    my $kill_cmd = "./RunV2.pl $app_config $args -k " . join (" ", @logins);
    psystem ($kill_cmd);

    exit 0 unless $GET_LOGS;
    goto getlogs;
}

###############################################################################

# File that receives RunV2.pl's standard output; it is handed to
# PollServerStats.pl via -e below and removed during cleanup.
my $exp = "/tmp/iris.exp";

## Do preconfiguration
#tinfo "* starting auxiliary functions on emulab";
#ParallelExec2(sub {
#    rsystem($_[0], $_[1], sub {
#	psystem("sudo /home/jeffpang/kern_recv_bump.sh 200000");
#	psystem("sudo killall time-resync.sh >/dev/null 2>&1");
#	psystem("sudo /home/jeffpang/time-resync.sh >/dev/null 2>&1 &");
#    });
#}, @logins_plain);

## Start the experiment
tinfo "* starting experiment";
# Build the command once so the logged text and the executed text cannot
# drift apart.
my $start_cmd = "./RunV2.pl $app_config $args " . join(" ", @logins) . " > $exp";
tinfo($start_cmd);
psystem($start_cmd);

## Wait for experiment to complete
tinfo "* experiment file is: $exp";
tinfo "* sleeping for 5 seconds";
sleep (5);

my $min = 0;        # minutes elapsed since monitoring started
my $interval = 60;  # seconds between two polls

# Nothing below changes between iterations, so it is set up once.
my $err = "/tmp/status.err";
my $htm = "/tmp/status.html";
my $baseport = $MERCPORT + 10000;

# The password is single-quoted for the shell, matching how it is passed to
# RunV2.pl via --termpass when $args is built.
my $poll_cmd = "perl PollServerStats.pl -e $exp -P '$TERMPASS' -p $baseport -m 16 >$htm 2>$err";

# keep monitoring the status of the experiment
while (1) {
    psystem ($poll_cmd);
    tdie "pollserver did not succeed" if (!-f $err or !-f $htm);
    psystem ("cat $err");
    psystem ("rsync -e ssh $htm $STATUS_LOGIN");

    # The poller's stderr output contains "yes" while at least one server
    # is still alive; once it no longer does, stop monitoring.
    if (`grep yes $err` eq "") {
        twarn " ALL SERVERS HAVE DIED!! ";
        last;
    }
    sleep ($interval);
    $min += ($interval/60);
    tinfo sprintf ("* %.2f minutes elapsed", $min);
}

cleanup:
## Wind the experiment down.
if (!$PAUSE_FOREVER) {
    # make sure remote nodes are dead...
    tinfo "* killing experiment";
    my $stop_cmd = "./RunV2.pl $app_config $args -k " . join(" ", @logins);
    psystem($stop_cmd);
}
else {
    # -P: leave everything running and just sit here.
    sleep(1000000000);
}

# The experiment file is no longer needed.
unlink $exp;

## Shutdown auxiliary functions on emulab
#tinfo "* stopping auxiliary functions on emulab";
#ParallelExec2(sub {
#    rsystem($_[0], $_[1], sub {
#	psystem("sudo killall time-resync.sh >/dev/null 2>&1");
#    });
#}, @logins_plain);

## Checking time synchronization
#tinfo "* checking time synchronization";
#psystem("./emulab/EmulabTimeTest.pl -E $exp_name -n $num_nodes > /tmp/TimeSyncInfo.out");

## Collect the log files
getlogs:
tinfo "* collecting log files to local machine";

# Unless -d was given, clear out whatever is already in the save directory.
if ($DELETE_OLD_LOGS) {
    rsystem($login, $host, sub {
        my ($save_path) = @_;
        my @stale = glob("$save_path/*");
        unlink @stale;
    }, $dir);
}

#psystem("rsync -azb -e ssh -v /tmp/TimeSyncInfo.out $SAVEDIR 1>&2");
#unlink "/tmp/TimeSyncInfo.out";

# don't do more than this many rsyncs at once
my $MAX_PARALLEL_RSYNCS = 10;

# Work through the logins in order, one batch of at most
# $MAX_PARALLEL_RSYNCS at a time.
my @remaining = @logins;
while (my @batch = splice(@remaining, 0, $MAX_PARALLEL_RSYNCS)) {
    ParallelExec2(sub {
        # Each entry looks like "user@host:host"; keep only "user@host".
        my ($target) = @_;
        $target =~ s/:.*$//;
        my ($node_user, $node_host) = split(/@/, $target);

        rsystem($node_user, $node_host, sub {
            my ($who, $logdir, $savedir) = @_;

            # Push this node's logs to the save directory and remove them
            # from the node only if the transfer succeeded.
            if (psystem("rsync -v -e ssh -azb $logdir/* $savedir 1>&2")) {
                twarn "rsync of $who:$logdir failed! not deleting logs!";
            }
            else {
                psystem("rm -f $logdir/*");
            }
        }, $target, $LOGDIR, $SAVEDIR);
    }, @batch);
}

sleep(5);

## Checking exp status
tinfo "* checking exp status...";
CheckStatus();

###############################################################################

# CheckStatus
#
# Looks at the OutputLog.* files in the save directory ($login, $host and
# $dir, parsed from $SAVEDIR) and prints a one-line verdict on STDOUT:
#   "ok"                               every inspected log ends normally
#   "error: mising logs"               fewer logs than expected were found
#   "error: N servers didn't finish"   N logs lack the time-limit message
#   "error: could not determine ..."   rsystem produced no result
#
# Takes no arguments and returns nothing useful.  The empty prototype was
# dropped: both calls are compiled before this definition, so it was never
# applied.
sub CheckStatus {

    # Computed here and handed to the callback as an argument, the way the
    # other rsystem callbacks in this file receive their inputs, instead of
    # depending on the callback being able to see this script's variables.
    my $expected = ($num_nodes - scalar(keys %IGNORES))*$VSERVERS;

    ## Check that it succeeded
    my $ret = rsystem($login, $host, sub {
        my ($dir, $expected) = @_;
        my @logs = glob("$dir/OutputLog.*");

        if (scalar(@logs) < $expected) {
            print "expected=", $expected;
            print " found=", scalar(@logs), "\n";
            return -1;
        }

        my $failed = 0;
        foreach my $log (@logs) {
            next if $log =~ /bootstrap/;

            # A server that ran to completion leaves this message within
            # the last 1000 lines of its log.
            my $out = `tail -1000 $log`;
            if ($out !~ /\* hit timelimit\s*\! going home/) {
                $failed++;
            }
        }
        return $failed;
    }, $dir, $expected);

    # NOTE(review): assumes rsystem hands back the callback's return value;
    # guard against an undefined result so that it is not mistaken for 0,
    # which would be reported as "ok".
    if (!defined $ret) {
        print "error: could not determine experiment status\n";
    } elsif ($ret == 0) {
        print "ok\n";
    } elsif ($ret == -1) {
        # The misspelling is kept as-is in case something matches on it.
        print "error: mising logs\n";
    } else {
        print "error: $ret servers didn't finish\n";
    }
}

###############################################################################
