From e6546ef6d86d0fc38e0e84ccae80e641f3fc0087 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:39 -0700 Subject: [PATCH 1/5] bpf: add support for BPF_SOCK_OPS_BASE_RTT A congestion control algorithm can make a call to the BPF socket_ops program to request the base RTT. The base RTT can be congestion control dependent and is meant to represent a congestion threshold such that RTTs above it indicate congestion. This is especially useful for flows within a DC where the base RTT is easy to obtain. Being provided a base RTT solves a basic problem in RTT based congestion avoidance algorithms (such as Vegas, NV and BBR). Although it is easy to get the base RTT when the network is not congested, it is very diffcult to do when it is very congested. Newer connections get an inflated value of the base RTT leading to unfariness (newer flows with a larger base RTT get more bandwidth). As a result, RTT based congestion avoidance algorithms tend to update their base RTTs to improve fairness. In very congested networks this can lead to base RTT inflation, reducing the ability of these RTT based congestion control algorithms to prevent congestion. Note that in my experiments with TCP-NV, the base RTT provided can be much larger than the actual hardware RTT. For example, experimenting with hosts within a rack where the hardware RTT is 16-20us, I've used base RTTs up to 150us. The effect of using a larger base RTT is that the congestion avoidance algorithm will allow more queueing. When there are only a few flows the main effect is larger measured RTTs and RPC latencies due to the increased queueing. When there are a lot of flows, a larger base RTT can lead to more congestion and more packet drops. For this case, where the hardware RTT is 20us, a base RTT of 80us produces good results. This patch only introduces BPF_SOCK_OPS_BASE_RTT, a later patch in this set adds support for using it in TCP-NV. Further study and testing is needed before support can be added to other delay based congestion avoidance algorithms. Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d83f95ea6a1b..1aca744c220f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -955,6 +955,13 @@ enum { BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control * needs ECN */ + BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is + * based on the path and may be + * dependent on the congestion control + * algorithm. In general it indicates + * a congestion threshold. RTTs above + * this indicate congestion + */ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ From cd86d1fd21025fdd6daf23d1288da405e7ad0ec6 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:40 -0700 Subject: [PATCH 2/5] bpf: Adding helper function bpf_getsockops Adding support for helper function bpf_getsockops to socket_ops BPF programs. This patch only supports TCP_CONGESTION. Signed-off-by: Vlad Vysotsky Acked-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 19 ++++++++-- net/core/filter.c | 46 ++++++++++++++++++++++- tools/testing/selftests/bpf/bpf_helpers.h | 3 ++ 3 files changed, 63 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1aca744c220f..f650346aaa1a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -613,12 +613,22 @@ union bpf_attr { * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen) * Calls setsockopt. Not all opts are available, only those with * integer optvals plus TCP_CONGESTION. - * Supported levels: SOL_SOCKET and IPROTO_TCP + * Supported levels: SOL_SOCKET and IPPROTO_TCP * @bpf_socket: pointer to bpf_socket - * @level: SOL_SOCKET or IPROTO_TCP + * @level: SOL_SOCKET or IPPROTO_TCP * @optname: option name * @optval: pointer to option value - * @optlen: length of optval in byes + * @optlen: length of optval in bytes + * Return: 0 or negative error + * + * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen) + * Calls getsockopt. Not all opts are available. + * Supported levels: IPPROTO_TCP + * @bpf_socket: pointer to bpf_socket + * @level: IPPROTO_TCP + * @optname: option name + * @optval: pointer to option value + * @optlen: length of optval in bytes * Return: 0 or negative error * * int bpf_skb_adjust_room(skb, len_diff, mode, flags) @@ -721,7 +731,8 @@ union bpf_attr { FN(sock_map_update), \ FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ - FN(perf_prog_read_value), + FN(perf_prog_read_value), \ + FN(getsockopt), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 09e011f20291..ccf62f44140a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3273,7 +3273,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, static const struct bpf_func_proto bpf_setsockopt_proto = { .func = bpf_setsockopt, - .gpl_only = true, + .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, @@ -3282,6 +3282,48 @@ static const struct bpf_func_proto bpf_setsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + struct sock *sk = bpf_sock->sk; + int ret = 0; + + if (!sk_fullsock(sk)) + goto err_clear; + +#ifdef CONFIG_INET + if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { + if (optname == TCP_CONGESTION) { + struct inet_connection_sock *icsk = inet_csk(sk); + + if (!icsk->icsk_ca_ops || optlen <= 1) + goto err_clear; + strncpy(optval, icsk->icsk_ca_ops->name, optlen); + optval[optlen - 1] = 0; + } else { + goto err_clear; + } + } else { + goto err_clear; + } + return ret; +#endif +err_clear: + memset(optval, 0, optlen); + return -EINVAL; +} + +static const struct bpf_func_proto bpf_getsockopt_proto = { + .func = bpf_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3460,6 +3502,8 @@ static const struct bpf_func_proto * switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_setsockopt_proto; + case BPF_FUNC_getsockopt: + return &bpf_getsockopt_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; default: diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index e25dbf6038cf..609514f74482 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -67,6 +67,9 @@ static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) = static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, int optlen) = (void *) BPF_FUNC_setsockopt; +static int (*bpf_getsockopt)(void *ctx, int level, int optname, void *optval, + int optlen) = + (void *) BPF_FUNC_getsockopt; static int (*bpf_sk_redirect_map)(void *map, int key, int flags) = (void *) BPF_FUNC_sk_redirect_map; static int (*bpf_sock_map_update)(void *map, void *key, void *value, From 85cce215781685bbc94b6af2bd1aa40b71965a70 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:41 -0700 Subject: [PATCH 3/5] bpf: Add BPF_SOCKET_OPS_BASE_RTT support to tcp_nv TCP_NV will try to get the base RTT from a socket_ops BPF program if one is loaded. NV will then use the base RTT to bound its min RTT (its notion of the base RTT). It uses the base RTT as an upper bound and 80% of the base RTT as its lower bound. In other words, NV will consider filtered RTTs larger than base RTT as a sign of congestion. As a result, there is no minRTT inflation when there is a lot of congestion. For example, in a DC where the RTTs are less than 40us when there is no congestion, a base RTT value of 80us improves the performance of NV. The difference between the uncongested RTT and the base RTT provided represents how much queueing we are willing to have (in practice it can be higher). NV has been tunned to reduce congestion when there are many flows at the cost of one flow not achieving full bandwith utilization. When a reasonable base RTT is provided, one NV flow can now fully utilize the full bandwidth. In addition, the performance is also improved when there are many flows. In the following examples the NV results are using a kernel with this patch set (i.e. both NV results are using the new nv_loss_dec_factor). With one host sending to another host and only one flow the goodputs are: Cubic: 9.3 Gbps, NV: 5.5 Gbps, NV (baseRTT=80us): 9.2 Gbps With 2 hosts sending to one host (1 flow per host, the goodput per flow is: Cubic: 4.6 Gbps, NV: 4.5 Gbps, NV (baseRTT=80us)L 4.6 Gbps But the RTTs seen by a ping process in the sender is: Cubic: 3.3ms NV: 97us, NV (baseRTT=80us): 146us With a lot of flows things look even better for NV with baseRTT. Here we have 3 hosts sending to one host. Each sending host has 6 flows: 1 stream, 4x1MB RPC, 1x10KB RPC. Cubic, NV and NV with baseRTT all fully utilize the full available bandwidth. However, the distribution of bandwidth among the flows is very different. For the 10KB RPC flow: Cubic: 27Mbps, NV: 111Mbps, NV (baseRTT=80us): 222Mbps The 99% latencies for the 10KB flows are: Cubic: 26ms, NV: 1ms, NV (baseRTT=80us): 500us The RTT seen by a ping process at the senders: Cubic: 3.2ms NV: 720us, NV (baseRTT=80us): 330us Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/ipv4/tcp_nv.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 1ff73982e28c..a978e3fa71ec 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -39,7 +39,7 @@ * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected * nv_ssthresh_factor On congestion set ssthresh to this * / 8 * nv_rtt_factor RTT averaging factor - * nv_loss_dec_factor Decrease cwnd by this (50%) when losses occur + * nv_loss_dec_factor Decrease cwnd to this (80%) when losses occur * nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd * nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd * nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping @@ -61,7 +61,7 @@ static int nv_min_cwnd __read_mostly = 2; static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */ static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */ static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */ -static int nv_loss_dec_factor __read_mostly = 512; /* => 50% */ +static int nv_loss_dec_factor __read_mostly = 819; /* => 80% */ static int nv_cwnd_growth_rate_neg __read_mostly = 8; static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */ static int nv_dec_eval_min_calls __read_mostly = 60; @@ -101,6 +101,11 @@ struct tcpnv { u32 nv_last_rtt; /* last rtt */ u32 nv_min_rtt; /* active min rtt. Used to determine slope */ u32 nv_min_rtt_new; /* min rtt for future use */ + u32 nv_base_rtt; /* If non-zero it represents the threshold for + * congestion */ + u32 nv_lower_bound_rtt; /* Used in conjunction with nv_base_rtt. It is + * set to 80% of nv_base_rtt. It helps reduce + * unfairness between flows */ u32 nv_rtt_max_rate; /* max rate seen during current RTT */ u32 nv_rtt_start_seq; /* current RTT ends when packet arrives * acking beyond nv_rtt_start_seq */ @@ -132,9 +137,24 @@ static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) static void tcpnv_init(struct sock *sk) { struct tcpnv *ca = inet_csk_ca(sk); + int base_rtt; tcpnv_reset(ca, sk); + /* See if base_rtt is available from socket_ops bpf program. + * It is meant to be used in environments, such as communication + * within a datacenter, where we have reasonable estimates of + * RTTs + */ + base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT); + if (base_rtt > 0) { + ca->nv_base_rtt = base_rtt; + ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ + } else { + ca->nv_base_rtt = 0; + ca->nv_lower_bound_rtt = 0; + } + ca->nv_allow_cwnd_growth = 1; ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ; ca->nv_min_rtt = NV_INIT_RTT; @@ -144,6 +164,19 @@ static void tcpnv_init(struct sock *sk) ca->cwnd_growth_factor = 0; } +/* If provided, apply upper (base_rtt) and lower (lower_bound_rtt) + * bounds to RTT. + */ +inline u32 nv_get_bounded_rtt(struct tcpnv *ca, u32 val) +{ + if (ca->nv_lower_bound_rtt > 0 && val < ca->nv_lower_bound_rtt) + return ca->nv_lower_bound_rtt; + else if (ca->nv_base_rtt > 0 && val > ca->nv_base_rtt) + return ca->nv_base_rtt; + else + return val; +} + static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -265,6 +298,9 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) if (ca->nv_eval_call_cnt < 255) ca->nv_eval_call_cnt++; + /* Apply bounds to rtt. Only used to update min_rtt */ + avg_rtt = nv_get_bounded_rtt(ca, avg_rtt); + /* update min rtt if necessary */ if (avg_rtt < ca->nv_min_rtt) ca->nv_min_rtt = avg_rtt; From c890063e440456e75c2e70f6bcec3797f1771eb6 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:42 -0700 Subject: [PATCH 4/5] bpf: sample BPF_SOCKET_OPS_BASE_RTT program Sample socket_ops BPF program to test the BPF helper function bpf_getsocketops and the new socket_ops op BPF_SOCKET_OPS_BASE_RTT. The program provides a base RTT of 80us when the calling flow is within a DC (as determined by the IPV6 prefix) and the congestion algorithm is "nv". Signed-off-by: Lawrence Brakmo Acked-by: Daniel Borkmann Acked_by: Alexei Starovoitov Signed-off-by: David S. Miller --- samples/bpf/Makefile | 1 + samples/bpf/tcp_basertt_kern.c | 78 ++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 samples/bpf/tcp_basertt_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 3534ccfb5920..ea2b9e6135f3 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -129,6 +129,7 @@ always += tcp_bufs_kern.o always += tcp_cong_kern.o always += tcp_iw_kern.o always += tcp_clamp_kern.o +always += tcp_basertt_kern.o always += xdp_redirect_kern.o always += xdp_redirect_map_kern.o always += xdp_redirect_cpu_kern.o diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c new file mode 100644 index 000000000000..4bf4fc597db9 --- /dev/null +++ b/samples/bpf/tcp_basertt_kern.c @@ -0,0 +1,78 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set base_rtt to 80us when host is running TCP-NV and + * both hosts are in the same datacenter (as determined by IPv6 prefix). + * + * Use load_sock_ops to load this BPF program. + */ + +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define DEBUG 1 + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +SEC("sockops") +int bpf_basertt(struct bpf_sock_ops *skops) +{ + char cong[20]; + char nv[] = "nv"; + int rv = 0, n; + int op; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Check if both hosts are in the same datacenter. For this + * example they are if the 1st 5.5 bytes in the IPv6 address + * are the same. + */ + if (skops->family == AF_INET6 && + skops->local_ip6[0] == skops->remote_ip6[0] && + (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == + (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) { + switch (op) { + case BPF_SOCK_OPS_BASE_RTT: + n = bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION, + cong, sizeof(cong)); + if (!n && !__builtin_memcmp(cong, nv, sizeof(nv)+1)) { + /* Set base_rtt to 80us */ + rv = 80; + } else if (n) { + rv = n; + } else { + rv = -1; + } + break; + default: + rv = -1; + } + } else { + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; From bfdf75693875fd53d6f08d5fec9506a864f07372 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:43 -0700 Subject: [PATCH 5/5] bpf: create samples/bpf/tcp_bpf.readme Readme file explaining how to create a cgroupv2 and attach one of the tcp_*_kern.o socket_ops BPF program. Signed-off-by: Lawrence Brakmo Acked-by: Daniel Borkmann Acked_by: Alexei Starovoitov Signed-off-by: David S. Miller --- samples/bpf/tcp_bbf.readme | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 samples/bpf/tcp_bbf.readme diff --git a/samples/bpf/tcp_bbf.readme b/samples/bpf/tcp_bbf.readme new file mode 100644 index 000000000000..831fb601e3c9 --- /dev/null +++ b/samples/bpf/tcp_bbf.readme @@ -0,0 +1,26 @@ +This file describes how to run the tcp_*_kern.o tcp_bpf (or socket_ops) +programs. These programs attach to a cgroupv2. The following commands create +a cgroupv2 and attach a bash shell to the group. + + mkdir -p /tmp/cgroupv2 + mount -t cgroup2 none /tmp/cgroupv2 + mkdir -p /tmp/cgroupv2/foo + bash + echo $$ >> /tmp/cgroupv2/foo/cgroup.procs + +Anything that runs under this shell belongs to the foo cgroupv2 To load +(attach) one of the tcp_*_kern.o programs: + + ./load_sock_ops -l /tmp/cgroupv2/foo tcp_basertt_kern.o + +If the "-l" flag is used, the load_sock_ops program will continue to run +printing the BPF log buffer. The tcp_*_kern.o programs use special print +functions to print logging information (if enabled by the ifdef). + +If using netperf/netserver to create traffic, you need to run them under the +cgroupv2 to which the BPF programs are attached (i.e. under bash shell +attached to the cgroupv2). + +To remove (unattach) a socket_ops BPF program from a cgroupv2: + + ./load_sock_ops -r /tmp/cgroupv2/foo