mirror of https://gitee.com/openkylin/linux.git
Merge branch 'dctcp'
Daniel Borkmann says:

====================
net: tcp: DCTCP congestion control algorithm

This patch series adds support for the DataCenter TCP (DCTCP) congestion
control algorithm. Please see individual patches for the details. The last
patch adds DCTCP as a congestion control module, and previous ones add
needed infrastructure to extend the congestion control framework.

Joint work between Florian Westphal, Daniel Borkmann and Glenn Judd.

v2 -> v3:
 - No changes anywhere, just a resend as requested by Dave
 - Added Stephen's ACK
v1 -> v2:
 - Rebased to latest net-next
 - Addressed Eric's feedback, thanks!
 - Update stale comment wrt. DCTCP ECN usage
 - Don't call INET_ECN_xmit for every packet
 - Add dctcp ss/inetdiag support to expose internal stats to userspace
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
commit a11238ec28
@@ -0,0 +1,43 @@
DCTCP (DataCenter TCP)
----------------------

DCTCP is an enhancement to the TCP congestion control algorithm for data
center networks and leverages Explicit Congestion Notification (ECN) in
the data center network to provide multi-bit feedback to the end hosts.

To enable it on end hosts:

  sysctl -w net.ipv4.tcp_congestion_control=dctcp

All switches in the data center network running DCTCP must support ECN
marking and be configured for marking when reaching defined switch buffer
thresholds. The default ECN marking threshold heuristic for DCTCP on
switches is 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps,
but might need further careful tweaking.

For more details, see the documents below.

Paper:

The algorithm is further described in detail in the following two
SIGCOMM/SIGMETRICS papers:

i) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
   Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
   "Data Center TCP (DCTCP)", Data Center Networks session
   Proc. ACM SIGCOMM, New Delhi, 2010.
   http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
   http://www.sigcomm.org/ccr/papers/2010/October/1851275.1851192

ii) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
    "Analysis of DCTCP: Stability, Convergence, and Fairness"
    Proc. ACM SIGMETRICS, San Jose, 2011.
    http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf

IETF informational draft:

http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00

DCTCP site:

http://simula.stanford.edu/~alizade/Site/DCTCP.html
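The sysctl above selects DCTCP globally. Alternatively, an application can opt in per socket through the long-standing TCP_CONGESTION socket option; a minimal sketch, assuming the tcp_dctcp module is loaded and that the caller either has CAP_NET_ADMIN or dctcp has been made the system default (which marks an algorithm non-restricted, cf. the TCP_CONG_NON_RESTRICTED comment further down):

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    /* Switch an (unconnected) TCP socket to DCTCP; returns 0 or -1/errno. */
    static int socket_use_dctcp(int fd)
    {
        static const char name[] = "dctcp";

        return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
                          name, sizeof(name) - 1);
    }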
@@ -733,23 +733,6 @@ struct tcp_skb_cb {

 #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))

-/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
- *
- * If we receive a SYN packet with these bits set, it means a network is
- * playing bad games with TOS bits. In order to avoid possible false congestion
- * notifications, we disable TCP ECN negotiation.
- */
-static inline void
-TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
-		       struct net *net)
-{
-	const struct tcphdr *th = tcp_hdr(skb);
-
-	if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr &&
-	    INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield))
-		inet_rsk(req)->ecn_ok = 1;
-}
-
 /* Due to TSO, an SKB can be composed of multiple actual
  * packets. To keep these tracked properly, we use this.
  */
@@ -780,8 +763,17 @@ enum tcp_ca_event {
 	CA_EVENT_CWND_RESTART,	/* congestion window restart */
 	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
 	CA_EVENT_LOSS,		/* loss timeout */
-	CA_EVENT_FAST_ACK,	/* in sequence ack */
-	CA_EVENT_SLOW_ACK,	/* other ack */
+	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
+	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
+	CA_EVENT_DELAYED_ACK,	/* Delayed ack is sent */
+	CA_EVENT_NON_DELAYED_ACK,
 };

+/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
+enum tcp_ca_ack_event_flags {
+	CA_ACK_SLOWPATH		= (1 << 0),	/* In slow path processing */
+	CA_ACK_WIN_UPDATE	= (1 << 1),	/* ACK updated window */
+	CA_ACK_ECE		= (1 << 2),	/* ECE bit is set on ack */
+};
+
 /*
@@ -791,7 +783,10 @@ enum tcp_ca_event {
 #define TCP_CA_MAX	128
 #define TCP_CA_BUF_MAX	(TCP_CA_NAME_MAX*TCP_CA_MAX)

+/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
 #define TCP_CONG_NON_RESTRICTED 0x1
+/* Requires ECN/ECT set on all packets */
+#define TCP_CONG_NEEDS_ECN	0x2

 struct tcp_congestion_ops {
 	struct list_head	list;
@@ -810,6 +805,8 @@ struct tcp_congestion_ops {
 	void (*set_state)(struct sock *sk, u8 new_state);
 	/* call when cwnd event occurs (optional) */
 	void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
+	/* call when ack arrives (optional) */
+	void (*in_ack_event)(struct sock *sk, u32 flags);
 	/* new value of cwnd after loss (optional) */
 	u32  (*undo_cwnd)(struct sock *sk);
 	/* hook for packet ack accounting (optional) */
@@ -824,6 +821,7 @@ struct tcp_congestion_ops {
 int tcp_register_congestion_control(struct tcp_congestion_ops *type);
 void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);

+void tcp_assign_congestion_control(struct sock *sk);
 void tcp_init_congestion_control(struct sock *sk);
 void tcp_cleanup_congestion_control(struct sock *sk);
 int tcp_set_default_congestion_control(const char *name);
@@ -835,11 +833,17 @@ int tcp_set_congestion_control(struct sock *sk, const char *name);
 int tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w);

-extern struct tcp_congestion_ops tcp_init_congestion_ops;
 u32 tcp_reno_ssthresh(struct sock *sk);
 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
 extern struct tcp_congestion_ops tcp_reno;

+static inline bool tcp_ca_needs_ecn(const struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
+}
+
 static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
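Taken together, the ops struct and the registration API above are the whole surface a congestion control module needs: only .ssthresh and .cong_avoid are mandatory, everything else is an optional hook. A hypothetical out-of-tree sketch against this API (module and name "mini" are illustrative, not part of the patch set; all symbols used are exported by tcp_cong.c):

    #include <linux/module.h>
    #include <net/tcp.h>

    /* Hypothetical module: plain Reno behaviour under a new name, showing
     * the minimum a tcp_congestion_ops implementation must provide.
     */
    static struct tcp_congestion_ops tcp_mini __read_mostly = {
        .ssthresh   = tcp_reno_ssthresh,
        .cong_avoid = tcp_reno_cong_avoid,
        .owner      = THIS_MODULE,
        .name       = "mini",
    };

    static int __init tcp_mini_register(void)
    {
        return tcp_register_congestion_control(&tcp_mini);
    }

    static void __exit tcp_mini_unregister(void)
    {
        tcp_unregister_congestion_control(&tcp_mini);
    }

    module_init(tcp_mini_register);
    module_exit(tcp_mini_unregister);
    MODULE_LICENSE("GPL");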
@@ -857,6 +861,40 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
 		icsk->icsk_ca_ops->cwnd_event(sk, event);
 }

+/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
+ *
+ * If we receive a SYN packet with these bits set, it means a
+ * network is playing bad games with TOS bits. In order to
+ * avoid possible false congestion notifications, we disable
+ * TCP ECN negotiation.
+ *
+ * Exception: tcp_ca wants ECN. This is required for DCTCP
+ * congestion control; it requires setting ECT on all packets,
+ * including SYN. We inverse the test in this case: If our
+ * local socket wants ECN, but peer only set ece/cwr (but not
+ * ECT in IP header) it's probably a non-DCTCP aware sender.
+ */
+static inline void
+TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
+		       const struct sock *listen_sk)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+	const struct net *net = sock_net(listen_sk);
+	bool th_ecn = th->ece && th->cwr;
+	bool ect, need_ecn;
+
+	if (!th_ecn)
+		return;
+
+	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
+	need_ecn = tcp_ca_needs_ecn(listen_sk);
+
+	if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
+		inet_rsk(req)->ecn_ok = 1;
+	else if (ect && need_ecn)
+		inet_rsk(req)->ecn_ok = 1;
+}
+
 /* These functions determine how the current flow behaves in respect of SACK
  * handling. SACK is negotiated with the peer, and therefore it can vary
  * between different flows.
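To summarize the inverse test in TCP_ECN_create_request above: given a SYN carrying both ECE and CWR (th_ecn), ecn_ok ends up as follows (derived directly from the two if branches):

    ect (ECT in IP hdr)   need_ecn (listener CC)   sysctl_tcp_ecn   ->  ecn_ok
    -------------------   ----------------------   --------------       ------
    no                    no                       enabled              1   classic RFC 3168 peer
    yes                   yes                      (ignored)            1   DCTCP-style peer
    yes                   no                       any                  0   ECT on SYN: bogus per RFC 3168
    no                    yes                      any                  0   peer cannot do DCTCP-style ECN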
@@ -110,10 +110,10 @@ enum {
 	INET_DIAG_TCLASS,
 	INET_DIAG_SKMEMINFO,
 	INET_DIAG_SHUTDOWN,
+	INET_DIAG_DCTCPINFO,
 };

-#define INET_DIAG_MAX INET_DIAG_SHUTDOWN
+#define INET_DIAG_MAX INET_DIAG_DCTCPINFO

 /* INET_DIAG_MEM */
@@ -133,5 +133,14 @@ struct tcpvegas_info {
 	__u32	tcpv_minrtt;
 };

+/* INET_DIAG_DCTCPINFO */
+struct tcp_dctcp_info {
+	__u16	dctcp_enabled;
+	__u16	dctcp_ce_state;
+	__u32	dctcp_alpha;
+	__u32	dctcp_ab_ecn;
+	__u32	dctcp_ab_tot;
+};
+
 #endif /* _UAPI_INET_DIAG_H_ */
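As the commit message notes, these fields let a patched ss/inetdiag expose DCTCP's internal state. A hypothetical userspace consumer that has already fetched the attribute could derive the CE-marked byte fraction, the F that alpha smooths over time, like this (struct mirrored locally so the sketch is self-contained):

    #include <stdint.h>

    /* Mirror of the UAPI struct above, for a self-contained example. */
    struct tcp_dctcp_info {
        uint16_t dctcp_enabled;
        uint16_t dctcp_ce_state;
        uint32_t dctcp_alpha;
        uint32_t dctcp_ab_ecn;
        uint32_t dctcp_ab_tot;
    };

    /* Fraction of acked bytes that carried ECN echo in the current
     * observation window.
     */
    static double dctcp_ecn_fraction(const struct tcp_dctcp_info *info)
    {
        if (!info->dctcp_enabled || info->dctcp_ab_tot == 0)
            return 0.0;
        return (double)info->dctcp_ab_ecn / (double)info->dctcp_ab_tot;
    }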
@@ -570,6 +570,27 @@ config TCP_CONG_ILLINOIS
 	For further details see:
 	  http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html

+config TCP_CONG_DCTCP
+	tristate "DataCenter TCP (DCTCP)"
+	default n
+	---help---
+	DCTCP leverages Explicit Congestion Notification (ECN) in the network to
+	provide multi-bit feedback to the end hosts. It is designed to provide:
+
+	- High burst tolerance (incast due to partition/aggregate),
+	- Low latency (short flows, queries),
+	- High throughput (continuous data updates, large file transfers) with
+	  commodity, shallow-buffered switches.
+
+	All switches in the data center network running DCTCP must support
+	ECN marking and be configured for marking when reaching defined switch
+	buffer thresholds. The default ECN marking threshold heuristic for
+	DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
+	(~100KB) at 10Gbps, but might need further careful tweaking.
+
+	For further details see:
+	  http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+
 choice
 	prompt "Default TCP congestion control"
 	default DEFAULT_CUBIC
@@ -598,9 +619,11 @@ choice
 	config DEFAULT_WESTWOOD
 		bool "Westwood" if TCP_CONG_WESTWOOD=y

+	config DEFAULT_DCTCP
+		bool "DCTCP" if TCP_CONG_DCTCP=y
+
 	config DEFAULT_RENO
 		bool "Reno"
 endchoice

 endif
@@ -620,6 +643,7 @@ config DEFAULT_TCP_CONG
 	default "westwood" if DEFAULT_WESTWOOD
 	default "veno" if DEFAULT_VENO
 	default "reno" if DEFAULT_RENO
+	default "dctcp" if DEFAULT_DCTCP
 	default "cubic"

 config TCP_MD5SIG
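So with a fragment like the following in the kernel config, DCTCP is built as a module while cubic stays the boot-time default (option names are exactly the ones from the Kconfig hunks above):

    CONFIG_TCP_CONG_DCTCP=m
    CONFIG_DEFAULT_TCP_CONG="cubic"

At runtime it can then be loaded with modprobe tcp_dctcp (module name per the Makefile hunk below) and selected with the sysctl shown in the documentation earlier.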
@@ -43,6 +43,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
 obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
+obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
@@ -405,7 +405,7 @@ void tcp_init_sock(struct sock *sk)

 	tp->reordering = sysctl_tcp_reordering;
 	tcp_enable_early_retrans(tp);
-	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+	tcp_assign_congestion_control(sk);

 	tp->tsoffset = 0;
@@ -3258,8 +3258,6 @@ void __init tcp_init(void)
 		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);

 	tcp_metrics_init();
-
-	tcp_register_congestion_control(&tcp_reno);
-
+	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
 	tcp_tasklet_init();
 }
@@ -74,24 +74,34 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
 EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);

 /* Assign choice of congestion control. */
-void tcp_init_congestion_control(struct sock *sk)
+void tcp_assign_congestion_control(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_congestion_ops *ca;

-	/* if no choice made yet assign the current value set as default */
-	if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) {
-		rcu_read_lock();
-		list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
-			if (try_module_get(ca->owner)) {
-				icsk->icsk_ca_ops = ca;
-				break;
-			}
-
-			/* fallback to next available */
+	rcu_read_lock();
+	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+		if (likely(try_module_get(ca->owner))) {
+			icsk->icsk_ca_ops = ca;
+			goto out;
 		}
-		rcu_read_unlock();
+		/* Fallback to next available. The last really
+		 * guaranteed fallback is Reno from this list.
+		 */
 	}
+out:
+	rcu_read_unlock();
+
+	/* Clear out private data before diag gets it and
+	 * the ca has not been initialized.
+	 */
+	if (ca->get_info)
+		memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+}
+
+void tcp_init_congestion_control(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);

 	if (icsk->icsk_ca_ops->init)
 		icsk->icsk_ca_ops->init(sk);
@@ -345,15 +355,3 @@ struct tcp_congestion_ops tcp_reno = {
 	.ssthresh	= tcp_reno_ssthresh,
 	.cong_avoid	= tcp_reno_cong_avoid,
 };
-
-/* Initial congestion control used (until SYN)
- * really reno under another name so we can tell difference
- * during tcp_set_default_congestion_control
- */
-struct tcp_congestion_ops tcp_init_congestion_ops = {
-	.name		= "",
-	.owner		= THIS_MODULE,
-	.ssthresh	= tcp_reno_ssthresh,
-	.cong_avoid	= tcp_reno_cong_avoid,
-};
-EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
@@ -0,0 +1,344 @@
/* DataCenter TCP (DCTCP) congestion control.
 *
 * http://simula.stanford.edu/~alizade/Site/DCTCP.html
 *
 * This is an implementation of DCTCP over Reno, an enhancement to the
 * TCP congestion control algorithm designed for data centers. DCTCP
 * leverages Explicit Congestion Notification (ECN) in the network to
 * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet
 * the following three data center transport requirements:
 *
 *  - High burst tolerance (incast due to partition/aggregate)
 *  - Low latency (short flows, queries)
 *  - High throughput (continuous data updates, large file transfers)
 *    with commodity shallow buffered switches
 *
 * The algorithm is described in detail in the following two papers:
 *
 * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
 *    Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
 *      "Data Center TCP (DCTCP)", Data Center Networks session
 *      Proc. ACM SIGCOMM, New Delhi, 2010.
 *    http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
 *
 * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
 *      "Analysis of DCTCP: Stability, Convergence, and Fairness"
 *      Proc. ACM SIGMETRICS, San Jose, 2011.
 *    http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
 *
 * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
 *
 * Authors:
 *
 *	Daniel Borkmann <dborkman@redhat.com>
 *	Florian Westphal <fw@strlen.de>
 *	Glenn Judd <glenn.judd@morganstanley.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 */
#include <linux/module.h>
#include <linux/mm.h>
#include <net/tcp.h>
#include <linux/inet_diag.h>

#define DCTCP_MAX_ALPHA	1024U

struct dctcp {
	u32 acked_bytes_ecn;
	u32 acked_bytes_total;
	u32 prior_snd_una;
	u32 prior_rcv_nxt;
	u32 dctcp_alpha;
	u32 next_seq;
	u32 ce_state;
	u32 delayed_ack_reserved;
};

static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
module_param(dctcp_shift_g, uint, 0644);
MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");

static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
module_param(dctcp_alpha_on_init, uint, 0644);
MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");

static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
		 "parameter for clamping alpha on loss");

static struct tcp_congestion_ops dctcp_reno;

static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
{
	ca->next_seq = tp->snd_nxt;

	ca->acked_bytes_ecn = 0;
	ca->acked_bytes_total = 0;
}
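Because all three knobs are declared with module_param(..., 0644), they should surface as writable sysfs attributes once the module is loaded, so the EWMA gain can be tuned without reloading. A usage sketch rather than anything the patch documents (the path follows from the module name tcp_dctcp):

    # gain g = 1/2^6 instead of the default 1/2^4
    echo 6 > /sys/module/tcp_dctcp/parameters/dctcp_shift_g
    cat /sys/module/tcp_dctcp/parameters/dctcp_alpha_on_init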
static void dctcp_init(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	if ((tp->ecn_flags & TCP_ECN_OK) ||
	    (sk->sk_state == TCP_LISTEN ||
	     sk->sk_state == TCP_CLOSE)) {
		struct dctcp *ca = inet_csk_ca(sk);

		ca->prior_snd_una = tp->snd_una;
		ca->prior_rcv_nxt = tp->rcv_nxt;

		ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);

		ca->delayed_ack_reserved = 0;
		ca->ce_state = 0;

		dctcp_reset(tp, ca);
		return;
	}

	/* No ECN support? Fall back to Reno. Also need to clear
	 * ECT from sk since it is set during 3WHS for DCTCP.
	 */
	inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
	INET_ECN_dontxmit(sk);
}
static u32 dctcp_ssthresh(struct sock *sk)
{
	const struct dctcp *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
}
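The >> 11U folds two divisions into one shift: alpha is kept in fixed point scaled by 2^10 (DCTCP_MAX_ALPHA), and DCTCP reduces cwnd by alpha/2, so cwnd * alpha / 2^10 / 2 = (cwnd * alpha) >> 11. A standalone sketch of the same arithmetic on sample values:

    #include <stdio.h>

    /* Same arithmetic as dctcp_ssthresh() above, on plain integers. */
    static unsigned int dctcp_ssthresh_demo(unsigned int cwnd, unsigned int alpha)
    {
        unsigned int reduced = cwnd - ((cwnd * alpha) >> 11U);

        return reduced > 2U ? reduced : 2U;
    }

    int main(void)
    {
        /* alpha == 1024 (every byte CE-marked): Reno-style halving. */
        printf("%u\n", dctcp_ssthresh_demo(100, 1024)); /* 50 */
        /* alpha == 102 (~10% marked): only a ~5% backoff. */
        printf("%u\n", dctcp_ssthresh_demo(100, 102));  /* 96 */
        /* alpha == 0 (no marks): window untouched. */
        printf("%u\n", dctcp_ssthresh_demo(100, 0));    /* 100 */
        return 0;
    }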
/* Minimal DCTCP CE state machine:
 *
 * S:	0 <- last pkt was non-CE
 *	1 <- last pkt was CE
 */

static void dctcp_ce_state_0_to_1(struct sock *sk)
{
	struct dctcp *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* State has changed from CE=0 to CE=1 and delayed
	 * ACK has not been sent yet.
	 */
	if (!ca->ce_state && ca->delayed_ack_reserved) {
		u32 tmp_rcv_nxt;

		/* Save current rcv_nxt. */
		tmp_rcv_nxt = tp->rcv_nxt;

		/* Generate previous ack with CE=0. */
		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
		tp->rcv_nxt = ca->prior_rcv_nxt;

		tcp_send_ack(sk);

		/* Recover current rcv_nxt. */
		tp->rcv_nxt = tmp_rcv_nxt;
	}

	ca->prior_rcv_nxt = tp->rcv_nxt;
	ca->ce_state = 1;

	tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}

static void dctcp_ce_state_1_to_0(struct sock *sk)
{
	struct dctcp *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* State has changed from CE=1 to CE=0 and delayed
	 * ACK has not been sent yet.
	 */
	if (ca->ce_state && ca->delayed_ack_reserved) {
		u32 tmp_rcv_nxt;

		/* Save current rcv_nxt. */
		tmp_rcv_nxt = tp->rcv_nxt;

		/* Generate previous ack with CE=1. */
		tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
		tp->rcv_nxt = ca->prior_rcv_nxt;

		tcp_send_ack(sk);

		/* Recover current rcv_nxt. */
		tp->rcv_nxt = tmp_rcv_nxt;
	}

	ca->prior_rcv_nxt = tp->rcv_nxt;
	ca->ce_state = 0;

	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
static void dctcp_update_alpha(struct sock *sk, u32 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct dctcp *ca = inet_csk_ca(sk);
	u32 acked_bytes = tp->snd_una - ca->prior_snd_una;

	/* If ack did not advance snd_una, count dupack as MSS size.
	 * If ack did update window, do not count it at all.
	 */
	if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE))
		acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
	if (acked_bytes) {
		ca->acked_bytes_total += acked_bytes;
		ca->prior_snd_una = tp->snd_una;

		if (flags & CA_ACK_ECE)
			ca->acked_bytes_ecn += acked_bytes;
	}

	/* Expired RTT */
	if (!before(tp->snd_una, ca->next_seq)) {
		/* Avoid a zero denominator. */
		if (ca->acked_bytes_total == 0)
			ca->acked_bytes_total = 1;

		/* alpha = (1 - g) * alpha + g * F */
		ca->dctcp_alpha = ca->dctcp_alpha -
				  (ca->dctcp_alpha >> dctcp_shift_g) +
				  (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) /
				  ca->acked_bytes_total;

		if (ca->dctcp_alpha > DCTCP_MAX_ALPHA)
			/* Clamp dctcp_alpha to max. */
			ca->dctcp_alpha = DCTCP_MAX_ALPHA;

		dctcp_reset(tp, ca);
	}
}
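In fixed point with alpha scaled by 2^10, alpha = (1 - g) * alpha + g * F becomes alpha - (alpha >> shift_g) + (bytes_ecn << (10 - shift_g)) / bytes_total, since g * F = (bytes_ecn / bytes_total) * 2^10 / 2^shift_g. A standalone iteration of the same arithmetic, assuming the default g = 1/16:

    #include <stdio.h>

    #define DCTCP_MAX_ALPHA 1024U

    /* One observation-window update, mirroring dctcp_update_alpha() above. */
    static unsigned int alpha_update(unsigned int alpha, unsigned int shift_g,
                                     unsigned int bytes_ecn,
                                     unsigned int bytes_total)
    {
        if (bytes_total == 0)
            bytes_total = 1;    /* avoid a zero denominator, as in the patch */

        alpha = alpha - (alpha >> shift_g) +
                (bytes_ecn << (10U - shift_g)) / bytes_total;

        return alpha > DCTCP_MAX_ALPHA ? DCTCP_MAX_ALPHA : alpha;
    }

    int main(void)
    {
        unsigned int alpha = DCTCP_MAX_ALPHA;   /* dctcp_alpha_on_init */
        int i;

        /* With a steady 25% of bytes CE-marked, alpha converges towards
         * 0.25 * 1024 = 256 at rate g = 1/16 (integer truncation leaves
         * it within a few units of that).
         */
        for (i = 0; i < 256; i++)
            alpha = alpha_update(alpha, 4, 25000, 100000);

        printf("alpha = %u (~256 expected)\n", alpha);
        return 0;
    }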
static void dctcp_state(struct sock *sk, u8 new_state)
{
	if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) {
		struct dctcp *ca = inet_csk_ca(sk);

		/* If this extension is enabled, we clamp dctcp_alpha to
		 * max on packet loss; the motivation is that dctcp_alpha
		 * is an indicator of the extent of congestion and packet
		 * loss is an indicator of extreme congestion; setting
		 * this in practice turned out to be beneficial, and
		 * effectively assumes total congestion which reduces the
		 * window by half.
		 */
		ca->dctcp_alpha = DCTCP_MAX_ALPHA;
	}
}
static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
{
	struct dctcp *ca = inet_csk_ca(sk);

	switch (ev) {
	case CA_EVENT_DELAYED_ACK:
		if (!ca->delayed_ack_reserved)
			ca->delayed_ack_reserved = 1;
		break;
	case CA_EVENT_NON_DELAYED_ACK:
		if (ca->delayed_ack_reserved)
			ca->delayed_ack_reserved = 0;
		break;
	default:
		/* Don't care for the rest. */
		break;
	}
}

static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
	switch (ev) {
	case CA_EVENT_ECN_IS_CE:
		dctcp_ce_state_0_to_1(sk);
		break;
	case CA_EVENT_ECN_NO_CE:
		dctcp_ce_state_1_to_0(sk);
		break;
	case CA_EVENT_DELAYED_ACK:
	case CA_EVENT_NON_DELAYED_ACK:
		dctcp_update_ack_reserved(sk, ev);
		break;
	default:
		/* Don't care for the rest. */
		break;
	}
}
static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
{
	const struct dctcp *ca = inet_csk_ca(sk);

	/* Fill it also in case of VEGASINFO due to req struct limits.
	 * We can still correctly retrieve it later.
	 */
	if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
	    ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
		struct tcp_dctcp_info info;

		memset(&info, 0, sizeof(info));
		if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
			info.dctcp_enabled = 1;
			info.dctcp_ce_state = (u16) ca->ce_state;
			info.dctcp_alpha = ca->dctcp_alpha;
			info.dctcp_ab_ecn = ca->acked_bytes_ecn;
			info.dctcp_ab_tot = ca->acked_bytes_total;
		}

		nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
	}
}
static struct tcp_congestion_ops dctcp __read_mostly = {
	.init		= dctcp_init,
	.in_ack_event	= dctcp_update_alpha,
	.cwnd_event	= dctcp_cwnd_event,
	.ssthresh	= dctcp_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.set_state	= dctcp_state,
	.get_info	= dctcp_get_info,
	.flags		= TCP_CONG_NEEDS_ECN,
	.owner		= THIS_MODULE,
	.name		= "dctcp",
};

static struct tcp_congestion_ops dctcp_reno __read_mostly = {
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.get_info	= dctcp_get_info,
	.owner		= THIS_MODULE,
	.name		= "dctcp-reno",
};

static int __init dctcp_register(void)
{
	BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
	return tcp_register_congestion_control(&dctcp);
}

static void __exit dctcp_unregister(void)
{
	tcp_unregister_congestion_control(&dctcp);
}

module_init(dctcp_register);
module_exit(dctcp_unregister);

MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");

MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
@@ -233,14 +233,21 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
 		tcp_enter_quickack_mode((struct sock *)tp);
 		break;
 	case INET_ECN_CE:
+		if (tcp_ca_needs_ecn((struct sock *)tp))
+			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 			/* Better not delay acks, sender can have a very low cwnd */
 			tcp_enter_quickack_mode((struct sock *)tp);
 			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 		}
-		/* fallinto */
+		tp->ecn_flags |= TCP_ECN_SEEN;
+		break;
 	default:
+		if (tcp_ca_needs_ecn((struct sock *)tp))
+			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
 		tp->ecn_flags |= TCP_ECN_SEEN;
 		break;
 	}
 }
@@ -3362,6 +3369,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 	}
 }

+static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ca_ops->in_ack_event)
+		icsk->icsk_ca_ops->in_ack_event(sk, flags);
+}
+
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 {
@@ -3421,10 +3436,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tp->snd_una = ack;
 		flag |= FLAG_WIN_UPDATE;

-		tcp_ca_event(sk, CA_EVENT_FAST_ACK);
+		tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);

 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
 	} else {
+		u32 ack_ev_flags = CA_ACK_SLOWPATH;
+
 		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
 			flag |= FLAG_DATA;
 		else
@@ -3436,10 +3453,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 							&sack_rtt_us);

-		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
+		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) {
 			flag |= FLAG_ECE;
+			ack_ev_flags |= CA_ACK_ECE;
+		}
+
+		if (flag & FLAG_WIN_UPDATE)
+			ack_ev_flags |= CA_ACK_WIN_UPDATE;

-		tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
+		tcp_in_ack_event(sk, ack_ev_flags);
 	}

 	/* We passed data and got it acked, remove any soft error
@@ -5944,7 +5966,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		goto drop_and_free;

 	if (!want_cookie || tmp_opt.tstamp_ok)
-		TCP_ECN_create_request(req, skb, sock_net(sk));
+		TCP_ECN_create_request(req, skb, sk);

 	if (want_cookie) {
 		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
@@ -451,9 +451,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->snd_cwnd = TCP_INIT_CWND;
 		newtp->snd_cwnd_cnt = 0;

-		if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops &&
-		    !try_module_get(newicsk->icsk_ca_ops->owner))
-			newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
+		if (!try_module_get(newicsk->icsk_ca_ops->owner))
+			tcp_assign_congestion_control(newsk);

 		tcp_set_ca_state(newsk, TCP_CA_Open);
 		tcp_init_xmit_timers(newsk);
@@ -318,11 +318,15 @@ static u16 tcp_select_window(struct sock *sk)
 }

 /* Packet ECN state for a SYN-ACK */
-static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)
+static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
+
 	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
 	if (!(tp->ecn_flags & TCP_ECN_OK))
 		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
+	else if (tcp_ca_needs_ecn(sk))
+		INET_ECN_xmit(sk);
 }

 /* Packet ECN state for a SYN. */
@@ -331,17 +335,24 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
 	struct tcp_sock *tp = tcp_sk(sk);

 	tp->ecn_flags = 0;
-	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) {
+
+	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+	    tcp_ca_needs_ecn(sk)) {
 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
 		tp->ecn_flags = TCP_ECN_OK;
+		if (tcp_ca_needs_ecn(sk))
+			INET_ECN_xmit(sk);
 	}
 }

 static __inline__ void
-TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)
+TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th,
+		    struct sock *sk)
 {
-	if (inet_rsk(req)->ecn_ok)
+	if (inet_rsk(req)->ecn_ok) {
 		th->ece = 1;
+		if (tcp_ca_needs_ecn(sk))
+			INET_ECN_xmit(sk);
+	}
 }

 /* Set up ECN state for a packet on an ESTABLISHED socket that is about to
@@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
 				tcp_hdr(skb)->cwr = 1;
 				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
 			}
-	} else {
+	} else if (!tcp_ca_needs_ecn(sk)) {
 		/* ACK or retransmitted segment: clear ECT|CE */
 		INET_ECN_dontxmit(sk);
 	}
@@ -2789,7 +2800,7 @@ int tcp_send_synack(struct sock *sk)
 		}

 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
-		TCP_ECN_send_synack(tcp_sk(sk), skb);
+		TCP_ECN_send_synack(sk, skb);
 	}
 	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 }
@@ -2848,7 +2859,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	memset(th, 0, sizeof(struct tcphdr));
 	th->syn = 1;
 	th->ack = 1;
-	TCP_ECN_make_synack(req, th);
+	TCP_ECN_make_synack(req, th, sk);
 	th->source = htons(ireq->ir_num);
 	th->dest = ireq->ir_rmt_port;
 	/* Setting of flags are superfluous here for callers (and ECE is
@@ -3119,6 +3130,8 @@ void tcp_send_delayed_ack(struct sock *sk)
 	int ato = icsk->icsk_ack.ato;
 	unsigned long timeout;

+	tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
+
 	if (ato > TCP_DELACK_MIN) {
 		const struct tcp_sock *tp = tcp_sk(sk);
 		int max_ato = HZ / 2;
@@ -3175,6 +3188,8 @@ void tcp_send_ack(struct sock *sk)
 	if (sk->sk_state == TCP_CLOSE)
 		return;

+	tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
+
 	/* We are not putting this on the write queue, so
 	 * tcp_transmit_skb() will set the ownership to this
 	 * sock.
@@ -3196,6 +3211,7 @@ void tcp_send_ack(struct sock *sk)
 	skb_mstamp_get(&buff->skb_mstamp);
 	tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
 }
+EXPORT_SYMBOL_GPL(tcp_send_ack);

 /* This routine sends a packet with an out of date sequence
  * number. It assumes the other end will try to ack it.
@@ -220,32 +220,35 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
 	return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
 }

+static void tcp_westwood_ack(struct sock *sk, u32 ack_flags)
+{
+	if (ack_flags & CA_ACK_SLOWPATH) {
+		struct westwood *w = inet_csk_ca(sk);
+
+		westwood_update_window(sk);
+		w->bk += westwood_acked_count(sk);
+
+		update_rtt_min(w);
+		return;
+	}
+
+	westwood_fast_bw(sk);
+}
+
 static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct westwood *w = inet_csk_ca(sk);

 	switch (event) {
-	case CA_EVENT_FAST_ACK:
-		westwood_fast_bw(sk);
-		break;
-
 	case CA_EVENT_COMPLETE_CWR:
 		tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
 		break;
-
 	case CA_EVENT_LOSS:
 		tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
 		/* Update RTT_min when next ack arrives */
 		w->reset_rtt_min = 1;
 		break;
-
-	case CA_EVENT_SLOW_ACK:
-		westwood_update_window(sk);
-		w->bk += westwood_acked_count(sk);
-		update_rtt_min(w);
-		break;
-
 	default:
 		/* don't care */
 		break;
@@ -274,6 +277,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = {
 	.ssthresh	= tcp_reno_ssthresh,
 	.cong_avoid	= tcp_reno_cong_avoid,
 	.cwnd_event	= tcp_westwood_event,
+	.in_ack_event	= tcp_westwood_ack,
 	.get_info	= tcp_westwood_info,
 	.pkts_acked	= tcp_westwood_pkts_acked,