Merge branch 'ipv4-Control-SKB-reprioritization-after-forwarding'

Petr Machata says:

====================
ipv4: Control SKB reprioritization after forwarding

After IPv4 packets are forwarded, the priority of the corresponding SKB
is updated according to the TOS field of IPv4 header. This overrides any
prioritization done earlier by e.g. an skbedit action or ingress-qos-map
defined at a vlan device.

Such overriding may not always be desirable. Even if the packet ends up
being routed, which implies this is an L3 network node, an administrator
may wish to preserve whatever prioritization was done earlier on in the
pipeline.

Therefore this patch set introduces a sysctl that controls this
behavior, net.ipv4.ip_forward_update_priority. Its value is 1 by
default to preserve the current behavior.

All of the above is implemented in patch #1.

Value changes prompt a new NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE
notification, so that the drivers can hook up whatever logic may depend
on this value. That is implemented in patch #2.

In patches #3 and #4, mlxsw is adapted to recognize the sysctl. On
initialization, the RGCR register that handles router configuration is
set in accordance with the sysctl. The new notification is listened to
and RGCR is reconfigured as necessary.

In patches #5 to #7, a selftest is added to verify that mlxsw reflects
the sysctl value as necessary. The test is expressed in terms of the
recently-introduced ieee_setapp support, and works by observing how DSCP
value gets rewritten depending on packet priority. For this reason, the
test is added to the subdirectory drivers/net/mlxsw. Even though it's
not particularly specific to mlxsw, it's not suitable for running on
soft devices (which don't support ieee_setapp et al.).

Changes from v1 to v2:

- In patch #1, init sysctl_ip_fwd_update_priority to 1 instead of true.

Changes from RFC to v1:

- Fix wrong sysctl name in ip-sysctl.txt
- Add notifications
- Add mlxsw support
- Add self test
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2018-08-01 09:52:31 -07:00
commit 53dd9652b5
10 changed files with 379 additions and 79 deletions

View File

@ -81,6 +81,15 @@ fib_multipath_hash_policy - INTEGER
0 - Layer 3
1 - Layer 4
ip_forward_update_priority - INTEGER
Whether to update SKB priority from "TOS" field in IPv4 header after the
packet is forwarded. The new SKB priority is mapped from TOS field value
according to an rt_tos2priority table (see e.g. man tc-prio).
Default: 1 (Update priority.)
Possible values:
0 - Do not update priority.
1 - Update priority.
route/max_size - INTEGER
Maximum number of routes allowed in the kernel. Increase
this when using large numbers of interfaces and/or routes.

View File

@ -2436,17 +2436,48 @@ static void mlxsw_sp_router_mp_hash_event_work(struct work_struct *work)
kfree(net_work);
}
static int __mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp);
/* Deferred handler for NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE.
 *
 * Runs __mlxsw_sp_router_init() again for the mlxsw_sp instance recorded in
 * the work item, then frees the work item (it is allocated by
 * mlxsw_sp_router_schedule_work() when the event is queued).
 */
static void mlxsw_sp_router_update_priority_work(struct work_struct *work)
{
	struct mlxsw_sp_netevent_work *net_work;

	net_work = container_of(work, struct mlxsw_sp_netevent_work, work);

	/* The status returned by __mlxsw_sp_router_init() is not inspected. */
	__mlxsw_sp_router_init(net_work->mlxsw_sp);

	kfree(net_work);
}
/* Queue @cb to run later on behalf of the router that owns @nb.
 *
 * @net: namespace the netevent refers to; anything but init_net is ignored.
 * @nb:  the netevent_nb member embedded in a struct mlxsw_sp_router.
 * @cb:  work function; it receives the embedded work_struct and is
 *       responsible for freeing the enclosing mlxsw_sp_netevent_work.
 *
 * Returns NOTIFY_DONE when the event is ignored or the work was queued, and
 * NOTIFY_BAD when the work item could not be allocated.
 */
static int mlxsw_sp_router_schedule_work(struct net *net,
					 struct notifier_block *nb,
					 void (*cb)(struct work_struct *))
{
	struct mlxsw_sp_router *router =
		container_of(nb, struct mlxsw_sp_router, netevent_nb);
	struct mlxsw_sp_netevent_work *deferred;

	if (!net_eq(net, &init_net))
		return NOTIFY_DONE;

	/* NOTE(review): GFP_ATOMIC suggests the notifier may be called from
	 * a context that cannot sleep -- confirm against the callers.
	 */
	deferred = kzalloc(sizeof(*deferred), GFP_ATOMIC);
	if (!deferred)
		return NOTIFY_BAD;

	deferred->mlxsw_sp = router->mlxsw_sp;
	INIT_WORK(&deferred->work, cb);
	mlxsw_core_schedule_work(&deferred->work);

	return NOTIFY_DONE;
}
static int mlxsw_sp_router_netevent_event(struct notifier_block *nb,
unsigned long event, void *ptr)
{
struct mlxsw_sp_netevent_work *net_work;
struct mlxsw_sp_port *mlxsw_sp_port;
struct mlxsw_sp_router *router;
struct mlxsw_sp *mlxsw_sp;
unsigned long interval;
struct neigh_parms *p;
struct neighbour *n;
struct net *net;
switch (event) {
case NETEVENT_DELAY_PROBE_TIME_UPDATE:
@ -2500,20 +2531,12 @@ static int mlxsw_sp_router_netevent_event(struct notifier_block *nb,
break;
case NETEVENT_IPV4_MPATH_HASH_UPDATE:
case NETEVENT_IPV6_MPATH_HASH_UPDATE:
net = ptr;
return mlxsw_sp_router_schedule_work(ptr, nb,
mlxsw_sp_router_mp_hash_event_work);
if (!net_eq(net, &init_net))
return NOTIFY_DONE;
net_work = kzalloc(sizeof(*net_work), GFP_ATOMIC);
if (!net_work)
return NOTIFY_BAD;
router = container_of(nb, struct mlxsw_sp_router, netevent_nb);
INIT_WORK(&net_work->work, mlxsw_sp_router_mp_hash_event_work);
net_work->mlxsw_sp = router->mlxsw_sp;
mlxsw_core_schedule_work(&net_work->work);
break;
case NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE:
return mlxsw_sp_router_schedule_work(ptr, nb,
mlxsw_sp_router_update_priority_work);
}
return NOTIFY_DONE;
@ -7382,6 +7405,7 @@ static int mlxsw_sp_dscp_init(struct mlxsw_sp *mlxsw_sp)
static int __mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
{
bool usp = init_net.ipv4.sysctl_ip_fwd_update_priority;
char rgcr_pl[MLXSW_REG_RGCR_LEN];
u64 max_rifs;
int err;
@ -7392,7 +7416,7 @@ static int __mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
mlxsw_reg_rgcr_pack(rgcr_pl, true, true);
mlxsw_reg_rgcr_max_router_interfaces_set(rgcr_pl, max_rifs);
mlxsw_reg_rgcr_usp_set(rgcr_pl, true);
mlxsw_reg_rgcr_usp_set(rgcr_pl, usp);
err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rgcr), rgcr_pl);
if (err)
return err;

View File

@ -28,6 +28,7 @@ enum netevent_notif_type {
NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */
NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */
NETEVENT_IPV6_MPATH_HASH_UPDATE, /* arg is struct net ptr */
NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, /* arg is struct net ptr */
};
int register_netevent_notifier(struct notifier_block *nb);

View File

@ -98,6 +98,7 @@ struct netns_ipv4 {
int sysctl_ip_default_ttl;
int sysctl_ip_no_pmtu_disc;
int sysctl_ip_fwd_use_pmtu;
int sysctl_ip_fwd_update_priority;
int sysctl_ip_nonlocal_bind;
/* Shall we try to damage output packets if routing dev changes? */
int sysctl_ip_dynaddr;

View File

@ -1801,6 +1801,7 @@ static __net_init int inet_init_net(struct net *net)
* We set them here, in case sysctl is not compiled.
*/
net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
net->ipv4.sysctl_ip_fwd_update_priority = 1;
net->ipv4.sysctl_ip_dynaddr = 0;
net->ipv4.sysctl_ip_early_demux = 1;
net->ipv4.sysctl_udp_early_demux = 1;

View File

@ -143,7 +143,8 @@ int ip_forward(struct sk_buff *skb)
!skb_sec_path(skb))
ip_rt_send_redirect(skb);
skb->priority = rt_tos2priority(iph->tos);
if (net->ipv4.sysctl_ip_fwd_update_priority)
skb->priority = rt_tos2priority(iph->tos);
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
net, NULL, skb, skb->dev, rt->dst.dev,

View File

@ -201,6 +201,23 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
return ret;
}
/* proc handler for net.ipv4.ip_forward_update_priority.
 *
 * Delegates parsing and range checking to proc_dointvec_minmax(). After a
 * successful write, netevent listeners are told via
 * NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, with the owning struct net as
 * the argument. The old and new values are not compared here, so every
 * successful write raises the notification.
 *
 * Returns the status of proc_dointvec_minmax().
 */
static int ipv4_fwd_update_priority(struct ctl_table *table, int write,
				    void __user *buffer,
				    size_t *lenp, loff_t *ppos)
{
	/* table->data points at the sysctl field inside struct net. */
	struct net *net = container_of(table->data, struct net,
				       ipv4.sysctl_ip_fwd_update_priority);
	int err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (err || !write)
		return err;

	call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, net);
	return 0;
}
static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@ -663,6 +680,15 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "ip_forward_update_priority",
.data = &init_net.ipv4.sysctl_ip_fwd_update_priority,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = ipv4_fwd_update_priority,
.extra1 = &zero,
.extra2 = &one,
},
{
.procname = "ip_nonlocal_bind",
.data = &init_net.ipv4.sysctl_ip_nonlocal_bind,

View File

@ -34,36 +34,6 @@ lib_dir=$(dirname $0)/../../../net/forwarding
NUM_NETIFS=4
source $lib_dir/lib.sh
# Add or delete ICMP capture rules for eight consecutive DSCP values.
#   $1 - "add" or "del", handed through to __icmp_capture_add_del
#   $2 - device the rules are attached to
#   $3 - first DSCP value; rules cover $3 .. $3+7
# Each rule matches ip_tos equal to the DSCP value shifted left by two bits.
# The DSCP value is also passed as the second argument of
# __icmp_capture_add_del (presumably the filter preference -- confirm there).
__dscp_capture_add_del()
{
	local add_del=$1; shift
	local dev=$1; shift
	local base=$1; shift
	local dscp
	# Keep the loop variable out of the caller's scope.
	local prio

	for prio in {0..7}; do
		dscp=$((base + prio))
		__icmp_capture_add_del $add_del $dscp "" $dev \
			"ip_tos $((dscp << 2))"
	done
}
# Install the DSCP capture rules on device $1 for DSCP values $2 .. $2+7.
dscp_capture_install()
{
	local dev=$1
	local base=$2

	__dscp_capture_add_del add $dev $base
}
# Remove the DSCP capture rules from device $1 for DSCP values $2 .. $2+7.
dscp_capture_uninstall()
{
	local dev=$1
	local base=$2

	__dscp_capture_add_del del $dev $base
}
h1_create()
{
local dscp;
@ -103,16 +73,6 @@ dscp_map()
done
}
# Block until lldptool no longer reports any "pending" APP entry on device
# $1, polling every five seconds and printing a progress line per retry.
lldpad_wait()
{
	local dev=$1

	while true; do
		lldptool -t -i $dev -V APP -c app | grep -q pending || break
		echo "$dev: waiting for lldpad to push pending APP updates"
		sleep 5
	done
}
switch_create()
{
ip link add name br1 type bridge vlan_filtering 1
@ -124,22 +84,15 @@ switch_create()
lldptool -T -i $swp1 -V APP $(dscp_map 10) >/dev/null
lldptool -T -i $swp2 -V APP $(dscp_map 20) >/dev/null
lldpad_wait $swp1
lldpad_wait $swp2
lldpad_app_wait_set $swp1
lldpad_app_wait_set $swp2
}
switch_destroy()
{
lldptool -T -i $swp2 -V APP -d $(dscp_map 20) >/dev/null
lldptool -T -i $swp1 -V APP -d $(dscp_map 10) >/dev/null
# Give lldpad a chance to push down the changes. If the device is downed
# too soon, the updates will be left pending, but will have been struck
# off the lldpad's DB already, and we won't be able to tell. Then on
# next test iteration this would cause weirdness as newly-added APP
# rules conflict with the old ones, sometimes getting stuck in an
# "unknown" state.
sleep 5
lldpad_app_wait_del
ip link set dev $swp2 nomaster
ip link set dev $swp1 nomaster
@ -172,18 +125,6 @@ cleanup()
vrf_cleanup
}
# Print the capture counters of device $1 for DSCP values $2 .. $2+7.
# One "[<dscp>]=<count> " entry is printed per line, a form that can be
# spliced into an associative-array initializer. The count comes from
# tc_rule_stats_get, queried with the DSCP value as its second argument.
dscp_fetch_stats()
{
	local dev=$1; shift
	local base=$1; shift
	# Keep the loop variable out of the caller's scope.
	local prio

	for prio in {0..7}; do
		local dscp=$((base + prio))
		local t=$(tc_rule_stats_get $dev $dscp)
		echo "[$dscp]=$t "
	done
}
ping_ipv4()
{
ping_test $h1 192.0.2.2

View File

@ -0,0 +1,233 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Test for DSCP prioritization in the router.
#
# With ip_forward_update_priority disabled, the packets are expected to keep
# their DSCP (which in this test uses only values 0..7) intact as they are
# forwarded by the switch. That is verified at $h2. ICMP responses are formed
# with the same DSCP as the requests, and likewise pass through the switch
# intact, which is verified at $h1.
#
# With ip_forward_update_priority enabled, router reprioritizes the packets
# according to the table in reprioritize(). Thus, say, DSCP 7 maps to priority
# 4, which on egress maps back to DSCP 4. The response packet then gets
# reprioritized to 6, getting DSCP 6 on egress.
#
# +----------------------+ +----------------------+
# | H1 | | H2 |
# | + $h1 | | $h2 + |
# | | 192.0.2.1/28 | | 192.0.2.18/28 | |
# +----|-----------------+ +----------------|-----+
# | |
# +----|----------------------------------------------------------------|-----+
# | SW | | |
# | + $swp1 $swp2 + |
# | 192.0.2.2/28 192.0.2.17/28 |
# | APP=0,5,0 .. 7,5,7 APP=0,5,0 .. 7,5,7 |
# +---------------------------------------------------------------------------+
# Test functions defined in this file, in the order they are listed.
# NOTE(review): presumably consumed by tests_run from lib.sh -- confirm.
ALL_TESTS="
ping_ipv4
test_update
test_no_update
"
# Shared forwarding-selftest helpers live under net/forwarding.
lib_dir=$(dirname $0)/../../../net/forwarding
# setup_prepare reads four interfaces: NETIFS[p1] .. NETIFS[p4].
NUM_NETIFS=4
source $lib_dir/lib.sh
# Print the priority a packet ends up with after the router reprioritizes
# it, given its ingress priority $1 (0..7).
reprioritize()
{
	local ingress_prio=$1

	# Derived from rt_tos2priority in include/net/route.h. With a 1:1
	# mapping between priorities and TOS, entry N is the new priority of a
	# packet that arrived with priority N.
	local -a prio_after_fwd=(0 0 2 2 6 6 4 4)

	echo ${prio_after_fwd[$ingress_prio]}
}
# Bring up host interface $h1: assign 192.0.2.1/28, attach a clsact qdisc,
# install capture rules for DSCP 0..7 and route 192.0.2.16/28 via the
# switch address 192.0.2.2 in the VRF v$h1.
h1_create()
{
	simple_if_init $h1 192.0.2.1/28
	tc qdisc add dev $h1 clsact
	dscp_capture_install $h1 0
	ip route add vrf v$h1 192.0.2.16/28 via 192.0.2.2
}
# Tear down what h1_create set up, in the reverse order: route, DSCP capture
# rules, clsact qdisc, then the interface itself.
h1_destroy()
{
ip route del vrf v$h1 192.0.2.16/28 via 192.0.2.2
dscp_capture_uninstall $h1 0
tc qdisc del dev $h1 clsact
simple_if_fini $h1 192.0.2.1/28
}
# Bring up host interface $h2: assign 192.0.2.18/28, attach a clsact qdisc,
# install capture rules for DSCP 0..7 and route 192.0.2.0/28 via the switch
# address 192.0.2.17 in the VRF v$h2.
h2_create()
{
simple_if_init $h2 192.0.2.18/28
tc qdisc add dev $h2 clsact
dscp_capture_install $h2 0
ip route add vrf v$h2 192.0.2.0/28 via 192.0.2.17
}
# Tear down what h2_create set up, in the reverse order: route, DSCP capture
# rules, clsact qdisc, then the interface itself.
h2_destroy()
{
ip route del vrf v$h2 192.0.2.0/28 via 192.0.2.17
dscp_capture_uninstall $h2 0
tc qdisc del dev $h2 clsact
simple_if_fini $h2 192.0.2.18/28
}
# Print eight lldptool APP arguments, one per line, mapping priority N to
# the value $1+N for N in 0..7, in the form "app=<prio>,5,<value>".
# NOTE(review): the middle field 5 is presumably the APP selector for DSCP;
# confirm against the lldptool APP TLV documentation.
dscp_map()
{
	local base=$1; shift
	# Keep the loop variable out of the caller's scope.
	local prio

	for prio in {0..7}; do
		echo app=$prio,5,$((base + prio))
	done
}
# Configure the two switch ports as router interfaces and program identical
# APP maps (base 0, see dscp_map) on both.
switch_create()
{
simple_if_init $swp1 192.0.2.2/28
# NOTE(review): presumably enslaves $swp2 to the VRF v$swp1 created for
# $swp1 on the line above -- confirm against __simple_if_init in lib.sh.
__simple_if_init $swp2 v$swp1 192.0.2.17/28
lldptool -T -i $swp1 -V APP $(dscp_map 0) >/dev/null
lldptool -T -i $swp2 -V APP $(dscp_map 0) >/dev/null
# Do not proceed until lldpad stops reporting pending APP entries.
lldpad_app_wait_set $swp1
lldpad_app_wait_set $swp2
}
# Undo switch_create: delete the APP maps from both ports, wait for lldpad,
# then remove the router interfaces in reverse order of creation.
switch_destroy()
{
lldptool -T -i $swp2 -V APP -d $(dscp_map 0) >/dev/null
lldptool -T -i $swp1 -V APP -d $(dscp_map 0) >/dev/null
# Must come before the interfaces are taken down.
lldpad_app_wait_del
__simple_if_fini $swp2 192.0.2.17/28
simple_if_fini $swp1 192.0.2.2/28
}
# Bind the four test interfaces to their roles, set the sysctl to 1 and
# build the hosts and the switch.
setup_prepare()
{
h1=${NETIFS[p1]}
swp1=${NETIFS[p2]}
swp2=${NETIFS[p3]}
h2=${NETIFS[p4]}
vrf_prepare
# Paired with the sysctl_restore call in cleanup.
sysctl_set net.ipv4.ip_forward_update_priority 1
h1_create
h2_create
switch_create
}
# Tear the topology down in the reverse order of setup_prepare and restore
# the ip_forward_update_priority sysctl.
cleanup()
{
pre_cleanup
switch_destroy
h2_destroy
h1_destroy
sysctl_restore net.ipv4.ip_forward_update_priority
vrf_cleanup
}
# Basic connectivity check: ping $h2's address (192.0.2.18) from $h1.
ping_ipv4()
{
ping_test $h1 192.0.2.18
}
# Send ten pings carrying a given priority and verify, from the capture
# counters, which DSCP the requests and the responses arrive with.
#   $1 - VRF to run ping in
#   $2 - source address (the -I option is omitted when empty)
#   $3 - destination address
#   $4 - priority (0..7) the requests are sent with; ping gets -Q $4<<2
#   $5 - command that prints the priority a packet has after being routed,
#        given its priority before (e.g. reprioritize, or echo for identity)
#   $6 - device on which the ICMP responses are counted
#   $7 - device on which the ICMP requests are counted
# For every priority 0..7 the counter delta must be exactly 10 where the
# packets are expected to land and 0 everywhere else.
dscp_ping_test()
{
	local vrf_name=$1; shift
	local sip=$1; shift
	local dip=$1; shift
	local prio=$1; shift
	local reprio=$1; shift
	local dev1=$1; shift
	local dev2=$1; shift
	# Keep the loop index out of the caller's scope.
	local i

	local prio2=$($reprio $prio)   # ICMP Request egress prio
	local prio3=$($reprio $prio2)  # ICMP Response egress prio

	local dscp=$((prio << 2))     # ICMP Request ingress DSCP
	local dscp2=$((prio2 << 2))   # ICMP Request egress DSCP
	local dscp3=$((prio3 << 2))   # ICMP Response egress DSCP

	RET=0

	# Snapshot the counters before and after the ping run; the arrays are
	# keyed by the values dscp_fetch_stats prints (0..7 with base 0).
	eval "local -A dev1_t0s=($(dscp_fetch_stats $dev1 0))"
	eval "local -A dev2_t0s=($(dscp_fetch_stats $dev2 0))"

	ip vrf exec $vrf_name \
	   ${PING} -Q $dscp ${sip:+-I $sip} $dip \
		   -c 10 -i 0.1 -w 2 &> /dev/null

	eval "local -A dev1_t1s=($(dscp_fetch_stats $dev1 0))"
	eval "local -A dev2_t1s=($(dscp_fetch_stats $dev2 0))"

	for i in {0..7}; do
		local dscpi=$((i << 2))
		local expect2=0
		local expect3=0

		if ((i == prio2)); then
			expect2=10
		fi
		if ((i == prio3)); then
			expect3=10
		fi

		# Requests are counted at $dev2, responses at $dev1.
		local delta=$((dev2_t1s[$i] - dev2_t0s[$i]))
		((expect2 == delta))
		check_err $? "DSCP $dscpi@$dev2: Expected to capture $expect2 packets, got $delta."

		delta=$((dev1_t1s[$i] - dev1_t0s[$i]))
		((expect3 == delta))
		check_err $? "DSCP $dscpi@$dev1: Expected to capture $expect3 packets, got $delta."
	done

	log_test "DSCP rewrite: $dscp-(prio $prio2)-$dscp2-(prio $prio3)-$dscp3"
}
# Run dscp_ping_test for every priority 0..7 with the sysctl set to $1.
#   $1 - value for net.ipv4.ip_forward_update_priority
#   $2 - command modelling the expected reprioritization; handed through to
#        dscp_ping_test
__test_update()
{
	local update=$1; shift
	local reprio=$1; shift
	# Keep the loop variable out of the caller's scope.
	local prio

	# Drop the previously set value before setting the new one.
	# NOTE(review): presumably this keeps set/restore calls balanced with
	# setup_prepare and cleanup -- confirm against lib.sh.
	sysctl_restore net.ipv4.ip_forward_update_priority
	sysctl_set net.ipv4.ip_forward_update_priority $update

	for prio in {0..7}; do
		dscp_ping_test v$h1 192.0.2.1 192.0.2.18 $prio $reprio $h1 $h2
	done
}
# Sysctl enabled: priorities are expected to change as modelled by
# reprioritize.
test_update()
{
__test_update 1 reprioritize
}
# Sysctl disabled: "echo" acts as the identity mapping, i.e. priorities are
# expected to pass through unchanged.
test_no_update()
{
__test_update 0 echo
}
# Make sure the topology is torn down however the script exits.
trap cleanup EXIT
# Build the topology, then run the tests.
# NOTE(review): setup_wait and tests_run come from lib.sh; presumably
# tests_run executes the functions named in ALL_TESTS -- confirm there.
setup_prepare
setup_wait
tests_run
exit $EXIT_STATUS

View File

@ -247,6 +247,27 @@ setup_wait()
sleep $WAIT_TIME
}
# Block until lldptool no longer reports any "pending" APP entry on device
# $1, polling every five seconds and printing a progress line per retry.
lldpad_app_wait_set()
{
	local dev=$1

	while true; do
		lldptool -t -i $dev -V APP -c app | grep -q pending || break
		echo "$dev: waiting for lldpad to push pending APP updates"
		sleep 5
	done
}
# Pause after APP entries have been deleted so lldpad can push the change.
lldpad_app_wait_del()
{
	# A fixed delay is used because deletions cannot be polled for: once
	# requested, the entries are gone from lldpad's DB even while the
	# update is still pending. If the device goes down too early, the
	# update stays pending, and on the next test iteration newly-added
	# APP rules conflict with the stale ones, sometimes getting stuck in
	# an "unknown" state.
	sleep 5
}
pre_cleanup()
{
if [ "${PAUSE_ON_CLEANUP}" = "yes" ]; then
@ -632,6 +653,48 @@ vlan_capture_uninstall()
__vlan_capture_add_del del 100 "$@"
}
# Add or delete ICMP capture rules for eight consecutive DSCP values.
#   $1 - "add" or "del", handed through to __icmp_capture_add_del
#   $2 - device the rules are attached to
#   $3 - first DSCP value; rules cover $3 .. $3+7
# Each rule is skip_hw and matches ip_tos equal to the DSCP value shifted
# left by two bits. The second argument of __icmp_capture_add_del is the
# DSCP value plus 100; dscp_fetch_stats looks rules up with the same offset.
__dscp_capture_add_del()
{
	local add_del=$1; shift
	local dev=$1; shift
	local base=$1; shift
	local dscp
	# Keep the loop variable out of the caller's scope.
	local prio

	for prio in {0..7}; do
		dscp=$((base + prio))
		__icmp_capture_add_del $add_del $((dscp + 100)) "" $dev \
			"skip_hw ip_tos $((dscp << 2))"
	done
}
# Install the DSCP capture rules on device $1 for DSCP values $2 .. $2+7.
dscp_capture_install()
{
	local dev=$1
	local base=$2

	__dscp_capture_add_del add $dev $base
}
# Remove the DSCP capture rules from device $1 for DSCP values $2 .. $2+7.
dscp_capture_uninstall()
{
	local dev=$1
	local base=$2

	__dscp_capture_add_del del $dev $base
}
# Print the capture counters of device $1 for DSCP values $2 .. $2+7.
# One "[<dscp>]=<count> " entry is printed per line, a form that can be
# spliced into an associative-array initializer. Counters are read with
# tc_rule_stats_get using the DSCP value plus 100, the same offset that
# __dscp_capture_add_del applies when installing the rules.
dscp_fetch_stats()
{
	local dev=$1; shift
	local base=$1; shift
	# Keep the loop variable out of the caller's scope.
	local prio

	for prio in {0..7}; do
		local dscp=$((base + prio))
		local t=$(tc_rule_stats_get $dev $((dscp + 100)))
		echo "[$dscp]=$t "
	done
}
matchall_sink_create()
{
local dev=$1; shift