#!/usr/bin/perl
#
# XXX: this script is current BROKEN. DO NOT USE.
#
# Converts a topology file to a netem tc script, to be executed by root.
# Works under linux 2.6 (and newer 2.4). Assumes addresses in the topo file
# are IPs and not host names.
#
# usage: Topology2Netem.pl <topology_file> <local_ip>
#
# This uses the netem qdisc in linux 2.6 traffic control to emulate wide
# area latencies and end-host outbound bottleneck bandwith. It currently
# does not emulate inbound bwidth limitations. Each node can have multiple
# virtual servers which are associated with a set of ports. Each pair of
# vservers can have an unique end2end latency. 
#
# The tc hierarchy looks like this:
#
#            root 1:
#              |
#   ----------prio---------    (two-level prio filtered on sports)
#  /    |    |     ...     \
# v1    v2   v3            vm  (1 band per local vserver)
# .     .    |             .
# .     .   tbf            .   (tbf qdisc for outbound limit [optional])      
# .     .    |             .
#        ---prio--             (two-level prio filtered on real dest ip)
#       /   | ... \
#      h1   h2    hn           (1 band per real dest ip)
#      .    |     .
#      . --prio-- .            (two-level prio filtered on dports)
#       /  | ... \
#      v21 v22   v3n           (1 band per remote vserver)
#      .   |      .
#      .  netem   .            (netem qdisc for wan latency emulation)
#
# Below each prio, there is one additional priority band at the end for
# "everything else" which goes to a default pfifo qdisc.
#
# LIMITS:
#
# * There can not be more than 240 real nodes or more than 240 vservers on
#   a single node. (this is the max number of priorities that can be assigned
#   in two levels of a prio qdisc, with one upper-level prio left over for
#   the default case)
# * There can not be more than ~ 65000/vservers_per_node total vservers since
#   each vserver requires ~1+log(vservers_per_node)/vservers_per_node handles
#   to be allocated for qdiscs and there are only 2^16 bits for each handle. 
#
###############################################################################

use strict;
use Carp;
use Getopt::Std;
use POSIX qw(ceil);
use vars qw($opt_o $opt_b $opt_l $opt_F);

getopts("o:b:l:F");

our $FAIL_FAST = !$opt_F;

our $OUTBOUND_TBF_RATE    = defined $opt_o ? $opt_o : "1500kbps";
our $OUTBOUND_TBF_BURST   = defined $opt_b ? $opt_b : "128kb";
our $OUTBOUND_TBF_LATENCY = defined $opt_l ? $opt_l : "200ms";

our $TOPO_FILE = @ARGV[0];
our $LOCALADDR = @ARGV[1];

our $HACKY_BOOTSTRAP_ID = 99999;

croak "usage: Topology2Netem.pl <topology_file> <local_ip>" if !$TOPO_FILE || !$LOCALADDR;

our $DEV = get_dev($LOCALADDR);
open(I, "<$TOPO_FILE") or croak "can't open $TOPO_FILE: $!";

# map from id => [ [ip,port], ... ]
our %ID_MAP;
# each of the vservers on this host
our %MY_IDS;
# list of all the unique (real) hosts => [ids]
our %HOSTS;

# host to prio map: id => [ handle, { host => [ handle, { id => prio } ] } ]
our %VSERVER2PRIO;

our $HANDLE = 2;

do_init_filters();

my $node_phase = 1;
while (<I>) {
    chomp;
    
    if (/^node\s+(\d+)\s+(.*)/) {
	my $id = $1;
	next if $id eq $HACKY_BOOTSTRAP_ID; # hack: skip the bootstrap!

	my @addrs = map { [ split(/:/, $_) ] } split(/\s+/, $2);
	croak "no addrs for $id? $_" if !@addrs;

	croak "dup id $id? $_" if $ID_MAP{$id};
	$ID_MAP{$id} = \@addrs;
	if (!defined $HOSTS{$addrs[0]->[0]}) {
	    $HOSTS{$addrs[0]->[0]} = [ $id ];
	} else {
	    push @{$HOSTS{$addrs[0]->[0]}}, $id;
	}

	if ($addrs[0]->[0] eq $LOCALADDR) {
	    $MY_IDS{$id} = 1;
	}
    } elsif (/^(\d+),(\d+)\s+([\d\.]+)/) {
	if ($node_phase) {
	    do_host_filters();
	    $node_phase = 0;
	}

	my ($a, $b, $lat) = ($1, $2, $3);

	next if $a eq $HACKY_BOOTSTRAP_ID or $b eq $HACKY_BOOTSTRAP_ID;
	
	if ($MY_IDS{$a}) {
	    do_lat_filter($a, $b, int($lat/2));
	}
	if ($MY_IDS{$b}) {
	    do_lat_filter($b, $a, int($lat/2));
	}
    }
}

###############################################################################

sub do_init_filters()
{
    comment("restarting classifiers");
    reset_all();
}

sub gen_handle {
    return sprintf "%x", $HANDLE++;
}

sub two_level_prio
{
    my $bands  = shift;
    my $parent = shift;

    my $parent_str = defined $parent ? "parent $parent" : "root";
    my $handle     = defined $parent ? gen_handle() : "1";

    my $children = ceil($bands/16);
    my $parent_bands = max($children+1, 3);
    
    qdisc("add", "$parent_str handle $handle: prio bands $parent_bands");

    my @children_handles;
    for (my $i=1; $i<=$children; $i++) {
	push @children_handles, gen_handle();
	my $cbands;
	if ($i < $children) {
	    $cbands = 16;
	} elsif ($children == 1) {
	    $cbands = $bands;
	} else {
	    $cbands = ($bands % (($children-1)*16));
	}
	$cbands = max($cbands, 3);
	qdisc("add", "parent $handle:$i handle " . $children_handles[$i-1] . ": prio bands $cbands");
    }

    return [ $handle, \@children_handles ];
}

sub two_level_prio_info
{
    my $prio  = shift;

    my $prio_level1 = 1 + int(($prio-1)/16);
    my $prio_level2 = $prio - ($prio_level1-1)*16;

    my $prio_level1_hex = sprintf "%x", $prio_level1;
    my $prio_level2_hex = sprintf "%x", $prio_level2;

    return ($prio_level1, $prio_level1_hex,
	    $prio_level2, $prio_level2_hex);
}

sub two_level_prio_parent
{
    my $handle = shift;
    my $prio  = shift;

    my $root_handle = $handle->[0];
    my $children_handles = $handle->[1];

    my ($prio_level1, $prio_level1_hex,
	$prio_level2, $prio_level2_hex) = 
	    two_level_prio_info($prio);
    my $prio_level2_handle = $children_handles->[$prio_level1-1];

    return "$prio_level2_handle:$prio_level2_hex";
}

sub two_level_prio_default_parent
{
    my $handle = shift;
    my $root_handle = $handle->[0];
    my $children_handles = $handle->[1];

    my $nchildren = scalar @{$children_handles};

    my $prio_hex = sprintf "%x", $nchildren+1;

    return "$root_handle:$prio_hex";
}

sub two_level_prio_filter
{
    my $handle = shift;
    my $prio  = shift;
    my $filter = shift;

    my $root_handle = $handle->[0];
    my $children_handles = $handle->[1];

    my ($prio_level1, $prio_level1_hex,
	$prio_level2, $prio_level2_hex) = 
	    two_level_prio_info($prio);
    my $prio_level2_handle = $children_handles->[$prio_level1-1];

    filter("add", "protocol ip parent $root_handle: prio $prio_level1 $filter flowid 1:$prio_level1_hex");
    filter("add", "protocol ip parent $prio_level2_handle: prio $prio_level2 $filter flowid $prio_level2_handle:$prio_level2_hex");
}

sub two_level_prio_default_filter
{
    my $handle = shift;
    my $filter = shift;

    my $root_handle = $handle->[0];
    my $children_handles = $handle->[1];
    my $nchildren = scalar @{$children_handles};

    my $prio     = $nchildren+1;
    my $prio_hex = sprintf "%x", $prio;

    filter("add", "protocol ip parent $root_handle: prio $prio $filter flowid 1:$prio_hex");
}

sub do_host_filters()
{
    my $vserv_bands = (scalar keys %MY_IDS);
    my $host_bands = (scalar keys %HOSTS);
    croak "too many vservers" if $vserv_bands > 15*16;
    croak "too many hosts" if $host_bands > 15*16;

    comment("classify into a class for each local vserver");

    my $root_handle = two_level_prio($vserv_bands, undef);

    # (1) do level-1 local vserver queues
    my $prio = 1;
    foreach my $v (keys %MY_IDS) {
	comment("classifier for $v (" . join(",", map { $_->[1] } @{$ID_MAP{$v}}). ")");

	foreach my $addr (@{$ID_MAP{$v}}) {
	    my $port = $addr->[1];
	    two_level_prio_filter($root_handle, $prio,
				  "u32 match ip sport $port 0xffff");
	    two_level_prio_filter($root_handle, $prio,
				  "u32 match ip protocol 6 0xff match tcp src $port 0xffff");
	}

	# if outbound rate is defined, stick a token bucket infront of this
	# vserver to do rate limiting!
	my $parent = two_level_prio_parent($root_handle, $prio);
	if ($OUTBOUND_TBF_RATE) {
	    my $tbf_handle = gen_handle();
	    qdisc("add", "parent $parent handle $tbf_handle: tbf rate $OUTBOUND_TBF_RATE burst $OUTBOUND_TBF_BURST latency $OUTBOUND_TBF_LATENCY");
	    $parent = "$tbf_handle:1";
	}

	my $vserv_handle = two_level_prio($host_bands, $parent);
	$VSERVER2PRIO{$v} = [ $vserv_handle, {} ];

	$prio++;
    }

    {   # default qdisc
	comment("default queue");
	two_level_prio_default_filter($root_handle, "u32 match ip protocol 0 0x00");
	my $parent = two_level_prio_default_parent($root_handle);
	qdisc("add", "parent $parent pfifo");
    }

    # (2) do level-2 remote host queues
    foreach my $v (keys %VSERVER2PRIO) {
	my $vserv_handle = $VSERVER2PRIO{$v}->[0];

	my $hprio = 1;
	foreach my $h (keys %HOSTS) {
	    next if $h eq $LOCALADDR; # xxx remove me

	    my $target_vserv_bands = scalar @{$HOSTS{$h}};
	    croak "too many target vservers on for $h" if $target_vserv_bands > 15*16;
	    
	    comment("  classifier for $h");
	    two_level_prio_filter($vserv_handle, $hprio, "u32 match ip dst $h/32");
	    my $parent = two_level_prio_parent($vserv_handle, $hprio);
	    my $host_handle = two_level_prio($target_vserv_bands, $parent);
	    $VSERVER2PRIO{$v}->[1]->{$h} = [ $host_handle, {} ];

	    $hprio++;
	}

	{   # default qdisc
	    comment("  default queue for $v");
	    two_level_prio_default_filter($vserv_handle, "u32 match ip protocol 0 0x00");
	    my $parent = two_level_prio_default_parent($vserv_handle);
	    qdisc("add", "parent $parent pfifo");
	}
    }

    # (3) do level-3 remote vserver queues
    foreach my $v (keys %VSERVER2PRIO) {
	foreach my $h (keys %HOSTS) {
	    next if $h eq $LOCALADDR; # xxx remove me

	    my $host_handle = $VSERVER2PRIO{$v}->[1]->{$h}->[0];

	    my $vprio = 1;
	    foreach my $vserv (@{$HOSTS{$h}}) {
		$VSERVER2PRIO{$v}->[1]->{$h}->[1]->{$vserv} = $vprio;
		# each of these qdiscs get added later, as we see the
		# latency pair in the input file (do_lat_filter)
		foreach my $addr (@{$ID_MAP{$vserv}}) {
		    my $port = $addr->[1];
		    two_level_prio_filter($host_handle, $vprio, 
					  "u32 match ip dport $port 0xffff");
		    two_level_prio_filter($host_handle, $vprio, 
					  "u32 match ip protocol 6 0xff match tcp dst $port 0xffff");
		}

		$vprio++;
	    }

	    {   # default prio for this remote vserver
		comment("    default queue for $h");
		two_level_prio_default_filter($host_handle, "u32 match ip protocol 0 0x00");
		my $parent = two_level_prio_default_parent($host_handle);
		qdisc("add", "parent $parent pfifo");
	    }
	}
    }
}

sub get_vserv_str($)
{
    my $v = shift;

    my $ports = $ID_MAP{$v};
    my $host = $ports->[0]->[0];

    my $ret = "$host:" . join(",", map { $_->[1] } @$ports);
    return $ret;
}

sub do_lat_filter($$$)
{
    my ($from, $to, $delay) = @_;

    my $to_ports = $ID_MAP{$to};
    my $to_host = $to_ports->[0]->[0];

    return if !defined $ID_MAP{$to}; # xxx remove me 
    return if $to_host eq $LOCALADDR; # xxx remove me

    my $host_handle = $VSERVER2PRIO{$from}->[1]->{$to_host}->[0];
    my $prio = $VSERVER2PRIO{$from}->[1]->{$to_host}->[1]->{$to};
    croak "missing vserver $to for $to_host" if !defined $prio;

    my $parent = two_level_prio_parent($host_handle, $prio);

    comment("      latency from " . get_vserv_str($from) . 
	    " ($from) to " . get_vserv_str($to) . " ($to)");

    my $handle = gen_handle();

    # xxx add latency variation?
    qdisc("add", "parent $parent handle $handle: netem delay ${delay}ms");
}

###############################################################################

sub reset_all()
{
    filter("del", "root >/dev/null 2>&1", 1);
    qdisc("del", "root >/dev/null 2>&1", 1);
}

sub comment($)
{
    print("# $_[0]\n");
    print("echo \"[35m[$_[0]][m\" 1>&2\n");
}

sub filter($$@)
{
    my $cmd  = shift;
    my $args = shift;
    my $no_fail_fast = shift;

    my $cmdline = "/sbin/tc filter $cmd dev $DEV $args";
    if ($FAIL_FAST && !$no_fail_fast) {
	$cmdline .= " || exit -1";
    }
    print("echo \"[35m$cmdline[m\" 1>&2\n");
    print("$cmdline\n");
}

sub qdisc($$@)
{
    my $cmd  = shift;
    my $args = shift;
    my $no_fail_fast = shift;

    my $cmdline = "/sbin/tc qdisc $cmd dev $DEV $args";
    if ($FAIL_FAST && !$no_fail_fast) {
	$cmdline .= " || exit -1";
    }
    print("echo \"[35m$cmdline[m\" 1>&2\n");
    print("$cmdline\n");
}

sub get_dev($)
{
    my $addr = shift;

    my $curr_dev;

    foreach my $line (split /\n/, `/sbin/ip addr`) {
	if ($line =~ /^\d+:\s+(\w+):/) {
	    $curr_dev = $1;
	} elsif ($line =~ /inet\s+(\d+\.\d+\.\d+\.\d+)\/(\d+)/) {
	    my ($ip, $mask) = ($1, $2);
	    if ($ip eq $addr) {
		croak "don't have curr_dev?" if !$curr_dev;
		return $curr_dev;
	    }
	}
    }
    
    croak "couldn't find dev for $addr";
}

sub max($$)
{
    return $_[0] > $_[1] ? $_[0] : $_[1];
}

sub min($$)
{
    return $_[0] < $_[1] ? $_[0] : $_[1];
}
