net: extend sk_pacing_rate to unsigned long

sk_pacing_rate was introduced as a u32 field in 2013,
effectively limiting per-flow pacing to 34Gbit.
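
For reference, the old ceiling follows directly from the units: a u32
counts bytes per second, so the maximum is (2^32 - 1) * 8 bits/sec,
about 34.36 Gbit/s. A standalone userspace check of that arithmetic
(not kernel code):

/* Quick sanity check of the old u32 pacing ceiling. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t max_rate = UINT32_MAX;	/* old sk_pacing_rate max, bytes/sec */
	double gbits = (double)max_rate * 8.0 / 1e9;

	printf("u32 pacing ceiling: %.2f Gbit/s\n", gbits);	/* ~34.36 */
	return 0;
}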

We believe it is time to allow TCP to pace high-speed flows
on 64bit hosts, as we can now reach 100Gbit on a single TCP flow.

This patch adds no cost for 32bit kernels, where unsigned long
remains 32bit.

The tcpi_pacing_rate and tcpi_max_pacing_rate fields were already
exported as 64bit, so the iproute2 ss command requires no changes.
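
A minimal sketch of how a tool like ss reads these fields, assuming
the uapi <linux/tcp.h> definition of struct tcp_info (the glibc
<netinet/tcp.h> copy may lack the newer fields):

/* Sketch: read the 64bit pacing fields via TCP_INFO. */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>		/* struct tcp_info, TCP_INFO */

static void print_pacing_rate(int fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
		printf("pacing_rate %llu bytes/sec\n",
		       (unsigned long long)info.tcpi_pacing_rate); /* __u64 */
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	print_pacing_rate(fd);	/* mostly zeros on an unconnected socket */
	return 0;
}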

Unfortunately, the SO_MAX_PACING_RATE socket option will stay 32bit,
and we will need to add a new option to let applications
control high pacing rates.
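
Until such an option exists, applications keep using the 32bit option,
with values in bytes per second capped at ~34Gbit. A minimal sketch
(the fallback define matches include/uapi/asm-generic/socket.h):

/* Sketch of the existing 32bit SO_MAX_PACING_RATE usage. */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef SO_MAX_PACING_RATE
#define SO_MAX_PACING_RATE 47	/* include/uapi/asm-generic/socket.h */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	unsigned int rate = 1250000000;	/* 10 Gbit/s, in bytes/sec */

	/* ~0U keeps its historical meaning of "no limit"; the kernel
	 * now widens it to ~0UL internally (see the sock.c hunks below).
	 */
	if (setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
		       &rate, sizeof(rate)) < 0)
		perror("setsockopt(SO_MAX_PACING_RATE)");
	return 0;
}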

Sample ss output on a 64bit host after this patch, showing a
pacing_rate well above the old 34Gbit ceiling:

State      Recv-Q Send-Q Local Address:Port             Peer Address:Port
ESTAB      0      1787144  10.246.9.76:49992             10.246.9.77:36741
                 timer:(on,003ms,0) ino:91863 sk:2 <->
 skmem:(r0,rb540000,t66440,tb2363904,f605944,w1822984,o0,bl0,d0)
 ts sack bbr wscale:8,8 rto:201 rtt:0.057/0.006 mss:1448
 rcvmss:536 advmss:1448
 cwnd:138 ssthresh:178 bytes_acked:256699822585 segs_out:177279177
 segs_in:3916318 data_segs_out:177279175
 bbr:(bw:31276.8Mbps,mrtt:0,pacing_gain:1.25,cwnd_gain:2)
 send 28045.5Mbps lastrcv:73333
 pacing_rate 38705.0Mbps delivery_rate 22997.6Mbps
 busy:73333ms unacked:135 retrans:0/157 rcv_space:14480
 notsent:2085120 minrtt:0.013

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
commit 76a9ebe811 (parent 5f6188a800)
Author:    Eric Dumazet <edumazet@google.com>
Date:      2018-10-15 09:37:53 -07:00
Committer: David S. Miller

7 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -422,8 +422,8 @@ struct sock {
 	struct timer_list	sk_timer;
 	__u32			sk_priority;
 	__u32			sk_mark;
-	u32			sk_pacing_rate; /* bytes per second */
-	u32			sk_max_pacing_rate;
+	unsigned long		sk_pacing_rate; /* bytes per second */
+	unsigned long		sk_max_pacing_rate;
 	struct page_frag	sk_frag;
 	netdev_features_t	sk_route_caps;
 	netdev_features_t	sk_route_nocaps;

diff --git a/net/core/filter.c b/net/core/filter.c
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3927,8 +3927,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 			sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 			break;
-		case SO_MAX_PACING_RATE:
-			sk->sk_max_pacing_rate = val;
+		case SO_MAX_PACING_RATE: /* 32bit version */
+			sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
 			sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 						 sk->sk_max_pacing_rate);
 			break;

diff --git a/net/core/sock.c b/net/core/sock.c
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -998,7 +998,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 			cmpxchg(&sk->sk_pacing_status,
 				SK_PACING_NONE,
 				SK_PACING_NEEDED);
-		sk->sk_max_pacing_rate = val;
+		sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 					 sk->sk_max_pacing_rate);
 		break;
@@ -1336,7 +1336,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 #endif
 
 	case SO_MAX_PACING_RATE:
-		v.val = sk->sk_max_pacing_rate;
+		/* 32bit version */
+		v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
 		break;
 
 	case SO_INCOMING_CPU:
@@ -2810,8 +2811,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_ll_usec		=	sysctl_net_busy_read;
 #endif
 
-	sk->sk_max_pacing_rate = ~0U;
-	sk->sk_pacing_rate = ~0U;
+	sk->sk_max_pacing_rate = ~0UL;
+	sk->sk_pacing_rate = ~0UL;
 	sk->sk_pacing_shift = 10;
 	sk->sk_incoming_cpu = -1;
 
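
As a standalone illustration of the widen/clamp convention used in the
two hunks above (a sketch, not kernel code; min_t is redefined locally,
and a 64bit host is assumed so unsigned long is 64bit):

/* Sketch of the setsockopt widen / getsockopt clamp round-trip. */
#include <assert.h>
#include <stdio.h>

#define min_t(type, a, b) ((type)(a) < (type)(b) ? (type)(a) : (type)(b))

int main(void)
{
	unsigned int val = ~0U;			/* legacy 32bit "unlimited" */
	unsigned long max_rate = (val == ~0U) ? ~0UL : val; /* setsockopt widening */

	assert(max_rate == ~0UL);		/* still means "unlimited" */

	max_rate = 5000000000UL;		/* 40Gbit/s in bytes/sec, > u32 */
	unsigned int legacy = min_t(unsigned long, max_rate, ~0U); /* getsockopt clamp */

	printf("32bit getsockopt reports: %u\n", legacy); /* saturates at ~0U */
	return 0;
}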

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3111,10 +3111,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 {
 	const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
 	const struct inet_connection_sock *icsk = inet_csk(sk);
+	unsigned long rate;
 	u32 now;
 	u64 rate64;
 	bool slow;
-	u32 rate;
 
 	memset(info, 0, sizeof(*info));
 	if (sk->sk_type != SOCK_STREAM)
@@ -3124,11 +3124,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 
 	/* Report meaningful fields for all TCP states, including listeners */
 	rate = READ_ONCE(sk->sk_pacing_rate);
-	rate64 = rate != ~0U ? rate : ~0ULL;
+	rate64 = (rate != ~0UL) ? rate : ~0ULL;
 	info->tcpi_pacing_rate = rate64;
 
 	rate = READ_ONCE(sk->sk_max_pacing_rate);
-	rate64 = rate != ~0U ? rate : ~0ULL;
+	rate64 = (rate != ~0UL) ? rate : ~0ULL;
 	info->tcpi_max_pacing_rate = rate64;
 
 	info->tcpi_reordering = tp->reordering;
@@ -3254,8 +3254,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *stats;
 	struct tcp_info info;
+	unsigned long rate;
 	u64 rate64;
-	u32 rate;
 
 	stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
 	if (!stats)
@@ -3274,7 +3274,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 			  tp->total_retrans, TCP_NLA_PAD);
 
 	rate = READ_ONCE(sk->sk_pacing_rate);
-	rate64 = rate != ~0U ? rate : ~0ULL;
+	rate64 = (rate != ~0UL) ? rate : ~0ULL;
 	nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
 
 	rate64 = tcp_compute_delivery_rate(tp);

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -219,7 +219,7 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
 }
 
 /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
-static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
 {
 	u64 rate = bw;
 
@@ -258,7 +258,7 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
-	u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain);
+	unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
 
 	if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
 		bbr_init_pacing_rate_from_rtt(sk);
@@ -280,7 +280,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
 	/* Sort of tcp_tso_autosize() but ignoring
 	 * driver provided sk_gso_max_size.
 	 */
-	bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift,
+	bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift,
 		      GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
 
 	segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -991,14 +991,14 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb)
 	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
 	if (sk->sk_pacing_status != SK_PACING_NONE) {
-		u32 rate = sk->sk_pacing_rate;
+		unsigned long rate = sk->sk_pacing_rate;
 
 		/* Original sch_fq does not pace first 10 MSS
 		 * Note that tp->data_segs_out overflows after 2^32 packets,
 		 * this is a minor annoyance.
 		 */
-		if (rate != ~0U && rate && tp->data_segs_out >= 10) {
-			tp->tcp_wstamp_ns += div_u64((u64)skb->len * NSEC_PER_SEC, rate);
+		if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
+			tp->tcp_wstamp_ns += div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
 
 			tcp_internal_pacing(sk);
 		}
 	}
@@ -1704,8 +1704,9 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 {
 	u32 bytes, segs;
 
-	bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
-		    sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+	bytes = min_t(unsigned long,
+		      sk->sk_pacing_rate >> sk->sk_pacing_shift,
+		      sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
 
 	/* Goal is to send at least one packet per ms,
 	 * not one big TSO packet every 100 ms.
@@ -2198,10 +2199,12 @@ static bool tcp_pacing_check(const struct sock *sk)
 static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 				  unsigned int factor)
 {
-	unsigned int limit;
+	unsigned long limit;
 
-	limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
-	limit = min_t(u32, limit,
+	limit = max_t(unsigned long,
+		      2 * skb->truesize,
+		      sk->sk_pacing_rate >> sk->sk_pacing_shift);
+	limit = min_t(unsigned long, limit,
 		      sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
 	limit <<= factor;
 

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -92,8 +92,8 @@ struct fq_sched_data {
 	u32		quantum;
 	u32		initial_quantum;
 	u32		flow_refill_delay;
-	u32		flow_max_rate;	/* optional max rate per flow */
 	u32		flow_plimit;	/* max packets per flow */
+	unsigned long	flow_max_rate;	/* optional max rate per flow */
 	u32		orphan_mask;	/* mask for orphaned skb */
 	u32		low_rate_threshold;
 	struct rb_root	*fq_root;
@@ -416,7 +416,8 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 	struct fq_flow_head *head;
 	struct sk_buff *skb;
 	struct fq_flow *f;
-	u32 rate, plen;
+	unsigned long rate;
+	u32 plen;
 
 	skb = fq_dequeue_head(sch, &q->internal);
 	if (skb)
@@ -485,11 +486,11 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 		if (f->credit > 0)
 			goto out;
 	}
-	if (rate != ~0U) {
+	if (rate != ~0UL) {
 		u64 len = (u64)plen * NSEC_PER_SEC;
 
 		if (likely(rate))
-			do_div(len, rate);
+			len = div64_ul(len, rate);
 		/* Since socket rate can change later,
 		 * clamp the delay to 1 second.
 		 * Really, providers of too big packets should be fixed !
@@ -701,9 +702,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
 		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
 				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
 
-	if (tb[TCA_FQ_FLOW_MAX_RATE])
-		q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+	if (tb[TCA_FQ_FLOW_MAX_RATE]) {
+		u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
 
+		q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
+	}
 	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
 		q->low_rate_threshold =
 			nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
@@ -766,7 +769,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
 	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
 	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
 	q->flow_refill_delay	= msecs_to_jiffies(40);
-	q->flow_max_rate	= ~0U;
+	q->flow_max_rate	= ~0UL;
 	q->time_next_delayed_flow = ~0ULL;
 	q->rate_enable		= 1;
 	q->new_flows.first	= NULL;
@@ -802,7 +805,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
 	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
 	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
-	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
+			min_t(unsigned long, q->flow_max_rate, ~0U)) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
			jiffies_to_usecs(q->flow_refill_delay)) ||
 	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
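
For intuition on the fq_dequeue() pacing computation above
(len = plen * NSEC_PER_SEC / rate, clamped to one second), a standalone
sketch using the rate from the ss output; the packet length is a
made-up 45-segment TSO burst:

/* Sketch of sch_fq's per-packet pacing delay, userspace arithmetic only. */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	/* 38705.0 Mbit/s = 4838125000 bytes/sec: already above the old
	 * u32 ceiling of 4294967295, hence the unsigned long rate.
	 */
	uint64_t rate = 4838125000ULL;
	uint64_t plen = 45 * 1448;	/* hypothetical TSO burst, mss 1448 */
	uint64_t len = plen * NSEC_PER_SEC / rate;

	printf("next packet released in %llu ns\n",
	       (unsigned long long)len);	/* ~13468 ns between bursts */
	return 0;
}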