From 55d8694fa82c9b5858ae5a78a210353961f908f9 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 26 Sep 2014 22:37:32 +0200
Subject: [PATCH 1/5] net: tcp: assign tcp cong_ops when tcp sk is created

Split assignment and initialization from one into two functions.

This is required by followup patches that add Datacenter TCP
(DCTCP) congestion control algorithm - we need to be able to
determine if the connection is moderated by DCTCP before the
3WHS has finished.

As we walk the available congestion control list during the
assignment, we are always guaranteed to have Reno present as
it's fixed compiled-in. Therefore, since we're doing the
early assignment, we don't have a real use for the Reno alias
tcp_init_congestion_ops anymore and can thus remove it.

Actual usage of the congestion control operations are being
made after the 3WHS has finished, in some cases however we
can access get_info() via diag if implemented, therefore we
need to zero out the private area for those modules.

Joint work with Daniel Borkmann and Glenn Judd.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Glenn Judd <glenn.judd@morganstanley.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h        |  2 +-
 net/ipv4/tcp.c           |  6 ++----
 net/ipv4/tcp_cong.c      | 46 +++++++++++++++++++---------------------
 net/ipv4/tcp_minisocks.c |  5 ++---
 4 files changed, 27 insertions(+), 32 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 02a9a2c366bf..f99b0c072ee5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -824,6 +824,7 @@ struct tcp_congestion_ops {
 int tcp_register_congestion_control(struct tcp_congestion_ops *type);
 void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
 
+void tcp_assign_congestion_control(struct sock *sk);
 void tcp_init_congestion_control(struct sock *sk);
 void tcp_cleanup_congestion_control(struct sock *sk);
 int tcp_set_default_congestion_control(const char *name);
@@ -835,7 +836,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name);
 int tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w);
 
-extern struct tcp_congestion_ops tcp_init_congestion_ops;
 u32 tcp_reno_ssthresh(struct sock *sk);
 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
 extern struct tcp_congestion_ops tcp_reno;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 87289e51be00..cf5e508e1ef5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -405,7 +405,7 @@ void tcp_init_sock(struct sock *sk)
 
 	tp->reordering = sysctl_tcp_reordering;
 	tcp_enable_early_retrans(tp);
-	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+	tcp_assign_congestion_control(sk);
 
 	tp->tsoffset = 0;
 
@@ -3258,8 +3258,6 @@ void __init tcp_init(void)
 		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
 
 	tcp_metrics_init();
-
-	tcp_register_congestion_control(&tcp_reno);
-
+	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
 	tcp_tasklet_init();
 }
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 80248f56c89f..a6c8a5775624 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -74,24 +74,34 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
 EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
 
 /* Assign choice of congestion control. */
-void tcp_init_congestion_control(struct sock *sk)
+void tcp_assign_congestion_control(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_congestion_ops *ca;
 
-	/* if no choice made yet assign the current value set as default */
-	if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) {
-		rcu_read_lock();
-		list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
-			if (try_module_get(ca->owner)) {
-				icsk->icsk_ca_ops = ca;
-				break;
-			}
-
-			/* fallback to next available */
+	rcu_read_lock();
+	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+		if (likely(try_module_get(ca->owner))) {
+			icsk->icsk_ca_ops = ca;
+			goto out;
 		}
-		rcu_read_unlock();
+		/* Fallback to next available. The last really
+		 * guaranteed fallback is Reno from this list.
+		 */
 	}
+out:
+	rcu_read_unlock();
+
+	/* Clear out private data before diag gets it and
+	 * the ca has not been initialized.
+	 */
+	if (ca->get_info)
+		memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+}
+
+void tcp_init_congestion_control(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 
 	if (icsk->icsk_ca_ops->init)
 		icsk->icsk_ca_ops->init(sk);
@@ -345,15 +355,3 @@ struct tcp_congestion_ops tcp_reno = {
 	.ssthresh	= tcp_reno_ssthresh,
 	.cong_avoid	= tcp_reno_cong_avoid,
 };
-
-/* Initial congestion control used (until SYN)
- * really reno under another name so we can tell difference
- * during tcp_set_default_congestion_control
- */
-struct tcp_congestion_ops tcp_init_congestion_ops  = {
-	.name		= "",
-	.owner		= THIS_MODULE,
-	.ssthresh	= tcp_reno_ssthresh,
-	.cong_avoid	= tcp_reno_cong_avoid,
-};
-EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a058f411d3a6..47b73506b77e 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -451,9 +451,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->snd_cwnd = TCP_INIT_CWND;
 		newtp->snd_cwnd_cnt = 0;
 
-		if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops &&
-		    !try_module_get(newicsk->icsk_ca_ops->owner))
-			newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
+		if (!try_module_get(newicsk->icsk_ca_ops->owner))
+			tcp_assign_congestion_control(newsk);
 
 		tcp_set_ca_state(newsk, TCP_CA_Open);
 		tcp_init_xmit_timers(newsk);

From 30e502a34b8b21fae2c789da102bd9f6e99fef83 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 26 Sep 2014 22:37:33 +0200
Subject: [PATCH 2/5] net: tcp: add flag for ca to indicate that ECN is
 required

This patch adds a flag to TCP congestion algorithms that allows
for requesting to mark IPv4/IPv6 sockets with transport as ECN
capable, that is, ECT(0), when required by a congestion algorithm.

It is currently used and needed in DataCenter TCP (DCTCP), as it
requires both peers to assert ECT on all IP packets sent - it
uses ECN feedback (i.e. CE, Congestion Encountered information)
from switches inside the data center to derive feedback to the
end hosts.

Therefore, simply add a new flag to icsk_ca_ops. Note that DCTCP's
algorithm/behaviour slightly diverges from RFC3168, therefore this
is only (!) enabled iff the assigned congestion control ops module
has requested this. By that, we can tightly couple this logic really
only to the provided congestion control ops.

Joint work with Florian Westphal and Glenn Judd.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Glenn Judd <glenn.judd@morganstanley.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 61 +++++++++++++++++++++++++++++++------------
 net/ipv4/tcp_input.c  |  2 +-
 net/ipv4/tcp_output.c | 25 +++++++++++++-----
 3 files changed, 63 insertions(+), 25 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f99b0c072ee5..a12f145cfbc3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -733,23 +733,6 @@ struct tcp_skb_cb {
 
 #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
 
-/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
- *
- * If we receive a SYN packet with these bits set, it means a network is
- * playing bad games with TOS bits. In order to avoid possible false congestion
- * notifications, we disable TCP ECN negociation.
- */
-static inline void
-TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
-		struct net *net)
-{
-	const struct tcphdr *th = tcp_hdr(skb);
-
-	if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr &&
-	    INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield))
-		inet_rsk(req)->ecn_ok = 1;
-}
-
 /* Due to TSO, an SKB can be composed of multiple actual
  * packets.  To keep these tracked properly, we use this.
  */
@@ -791,7 +774,10 @@ enum tcp_ca_event {
 #define TCP_CA_MAX	128
 #define TCP_CA_BUF_MAX	(TCP_CA_NAME_MAX*TCP_CA_MAX)
 
+/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
 #define TCP_CONG_NON_RESTRICTED 0x1
+/* Requires ECN/ECT set on all packets */
+#define TCP_CONG_NEEDS_ECN	0x2
 
 struct tcp_congestion_ops {
 	struct list_head	list;
@@ -840,6 +826,13 @@ u32 tcp_reno_ssthresh(struct sock *sk);
 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
 extern struct tcp_congestion_ops tcp_reno;
 
+static inline bool tcp_ca_needs_ecn(const struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
+}
+
 static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
@@ -857,6 +850,40 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
 		icsk->icsk_ca_ops->cwnd_event(sk, event);
 }
 
+/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
+ *
+ * If we receive a SYN packet with these bits set, it means a
+ * network is playing bad games with TOS bits. In order to
+ * avoid possible false congestion notifications, we disable
+ * TCP ECN negociation.
+ *
+ * Exception: tcp_ca wants ECN. This is required for DCTCP
+ * congestion control; it requires setting ECT on all packets,
+ * including SYN. We inverse the test in this case: If our
+ * local socket wants ECN, but peer only set ece/cwr (but not
+ * ECT in IP header) its probably a non-DCTCP aware sender.
+ */
+static inline void
+TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
+		       const struct sock *listen_sk)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+	const struct net *net = sock_net(listen_sk);
+	bool th_ecn = th->ece && th->cwr;
+	bool ect, need_ecn;
+
+	if (!th_ecn)
+		return;
+
+	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
+	need_ecn = tcp_ca_needs_ecn(listen_sk);
+
+	if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
+		inet_rsk(req)->ecn_ok = 1;
+	else if (ect && need_ecn)
+		inet_rsk(req)->ecn_ok = 1;
+}
+
 /* These functions determine how the current flow behaves in respect of SACK
  * handling. SACK is negotiated with the peer, and therefore it can vary
  * between different flows.
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5073eefa6fae..fb0fe97e1c54 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5944,7 +5944,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		goto drop_and_free;
 
 	if (!want_cookie || tmp_opt.tstamp_ok)
-		TCP_ECN_create_request(req, skb, sock_net(sk));
+		TCP_ECN_create_request(req, skb, sk);
 
 	if (want_cookie) {
 		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4d92703df4c6..20e73271d75c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -318,11 +318,15 @@ static u16 tcp_select_window(struct sock *sk)
 }
 
 /* Packet ECN state for a SYN-ACK */
-static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)
+static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
+
 	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
 	if (!(tp->ecn_flags & TCP_ECN_OK))
 		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
+	else if (tcp_ca_needs_ecn(sk))
+		INET_ECN_xmit(sk);
 }
 
 /* Packet ECN state for a SYN.  */
@@ -331,17 +335,24 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tp->ecn_flags = 0;
-	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) {
+	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+	    tcp_ca_needs_ecn(sk)) {
 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
 		tp->ecn_flags = TCP_ECN_OK;
+		if (tcp_ca_needs_ecn(sk))
+			INET_ECN_xmit(sk);
 	}
 }
 
 static __inline__ void
-TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)
+TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th,
+		    struct sock *sk)
 {
-	if (inet_rsk(req)->ecn_ok)
+	if (inet_rsk(req)->ecn_ok) {
 		th->ece = 1;
+		if (tcp_ca_needs_ecn(sk))
+			INET_ECN_xmit(sk);
+	}
 }
 
 /* Set up ECN state for a packet on a ESTABLISHED socket that is about to
@@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
 				tcp_hdr(skb)->cwr = 1;
 				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
 			}
-		} else {
+		} else if (!tcp_ca_needs_ecn(sk)) {
 			/* ACK or retransmitted segment: clear ECT|CE */
 			INET_ECN_dontxmit(sk);
 		}
@@ -2789,7 +2800,7 @@ int tcp_send_synack(struct sock *sk)
 		}
 
 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
-		TCP_ECN_send_synack(tcp_sk(sk), skb);
+		TCP_ECN_send_synack(sk, skb);
 	}
 	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 }
@@ -2848,7 +2859,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	memset(th, 0, sizeof(struct tcphdr));
 	th->syn = 1;
 	th->ack = 1;
-	TCP_ECN_make_synack(req, th);
+	TCP_ECN_make_synack(req, th, sk);
 	th->source = htons(ireq->ir_num);
 	th->dest = ireq->ir_rmt_port;
 	/* Setting of flags are superfluous here for callers (and ECE is

From 7354c8c389d18719dd71cc810da70b0921d66694 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 26 Sep 2014 22:37:34 +0200
Subject: [PATCH 3/5] net: tcp: split ack slow/fast events from cwnd_event

The congestion control ops "cwnd_event" currently supports
CA_EVENT_FAST_ACK and CA_EVENT_SLOW_ACK events (among others).
Both FAST and SLOW_ACK are only used by Westwood congestion
control algorithm.

This removes both flags from cwnd_event and adds a new
in_ack_event callback for this. The goal is to be able to
provide more detailed information about ACKs, such as whether
ECE flag was set, or whether the ACK resulted in a window
update.

It is required for DataCenter TCP (DCTCP) congestion control
algorithm as it makes a different choice depending on ECE being
set or not.

Joint work with Daniel Borkmann and Glenn Judd.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Glenn Judd <glenn.judd@morganstanley.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       |  8 ++++++--
 net/ipv4/tcp_input.c    | 12 ++++++++++--
 net/ipv4/tcp_westwood.c | 28 ++++++++++++++++------------
 3 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index a12f145cfbc3..7ec6a28305c0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -763,8 +763,10 @@ enum tcp_ca_event {
 	CA_EVENT_CWND_RESTART,	/* congestion window restart */
 	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
 	CA_EVENT_LOSS,		/* loss timeout */
-	CA_EVENT_FAST_ACK,	/* in sequence ack */
-	CA_EVENT_SLOW_ACK,	/* other ack */
+};
+
+enum tcp_ca_ack_event_flags {
+	CA_ACK_SLOWPATH = (1 << 0),
 };
 
 /*
@@ -796,6 +798,8 @@ struct tcp_congestion_ops {
 	void (*set_state)(struct sock *sk, u8 new_state);
 	/* call when cwnd event occurs (optional) */
 	void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
+	/* call when ack arrives (optional) */
+	void (*in_ack_event)(struct sock *sk, u32 flags);
 	/* new value of cwnd after loss (optional) */
 	u32  (*undo_cwnd)(struct sock *sk);
 	/* hook for packet ack accounting (optional) */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fb0fe97e1c54..8a38774cc66e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3362,6 +3362,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 	}
 }
 
+static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ca_ops->in_ack_event)
+		icsk->icsk_ca_ops->in_ack_event(sk, flags);
+}
+
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 {
@@ -3421,7 +3429,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tp->snd_una = ack;
 		flag |= FLAG_WIN_UPDATE;
 
-		tcp_ca_event(sk, CA_EVENT_FAST_ACK);
+		tcp_in_ack_event(sk, 0);
 
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
 	} else {
@@ -3439,7 +3447,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
 			flag |= FLAG_ECE;
 
-		tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
+		tcp_in_ack_event(sk, CA_ACK_SLOWPATH);
 	}
 
 	/* We passed data and got it acked, remove any soft error
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 81911a92356c..bb63fba47d47 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -220,32 +220,35 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
 	return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
 }
 
+static void tcp_westwood_ack(struct sock *sk, u32 ack_flags)
+{
+	if (ack_flags & CA_ACK_SLOWPATH) {
+		struct westwood *w = inet_csk_ca(sk);
+
+		westwood_update_window(sk);
+		w->bk += westwood_acked_count(sk);
+
+		update_rtt_min(w);
+		return;
+	}
+
+	westwood_fast_bw(sk);
+}
+
 static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct westwood *w = inet_csk_ca(sk);
 
 	switch (event) {
-	case CA_EVENT_FAST_ACK:
-		westwood_fast_bw(sk);
-		break;
-
 	case CA_EVENT_COMPLETE_CWR:
 		tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
 		break;
-
 	case CA_EVENT_LOSS:
 		tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
 		/* Update RTT_min when next ack arrives */
 		w->reset_rtt_min = 1;
 		break;
-
-	case CA_EVENT_SLOW_ACK:
-		westwood_update_window(sk);
-		w->bk += westwood_acked_count(sk);
-		update_rtt_min(w);
-		break;
-
 	default:
 		/* don't care */
 		break;
@@ -274,6 +277,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = {
 	.ssthresh	= tcp_reno_ssthresh,
 	.cong_avoid	= tcp_reno_cong_avoid,
 	.cwnd_event	= tcp_westwood_event,
+	.in_ack_event	= tcp_westwood_ack,
 	.get_info	= tcp_westwood_info,
 	.pkts_acked	= tcp_westwood_pkts_acked,
 

From 9890092e46b2996bb85f7f973e69424cb5c07bc0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 26 Sep 2014 22:37:35 +0200
Subject: [PATCH 4/5] net: tcp: more detailed ACK events and events for CE
 marked packets

DataCenter TCP (DCTCP) determines cwnd growth based on ECN information
and ACK properties, e.g. ACK that updates window is treated differently
than DUPACK.

Also DCTCP needs information whether ACK was delayed ACK. Furthermore,
DCTCP also implements a CE state machine that keeps track of CE markings
of incoming packets.

Therefore, extend the congestion control framework to provide these
event types, so that DCTCP can be properly implemented as a normal
congestion algorithm module outside of the core stack.

Joint work with Daniel Borkmann and Glenn Judd.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Glenn Judd <glenn.judd@morganstanley.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  9 ++++++++-
 net/ipv4/tcp_input.c  | 24 +++++++++++++++++++-----
 net/ipv4/tcp_output.c |  4 ++++
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7ec6a28305c0..1f57c5363492 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -763,10 +763,17 @@ enum tcp_ca_event {
 	CA_EVENT_CWND_RESTART,	/* congestion window restart */
 	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
 	CA_EVENT_LOSS,		/* loss timeout */
+	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
+	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
+	CA_EVENT_DELAYED_ACK,	/* Delayed ack is sent */
+	CA_EVENT_NON_DELAYED_ACK,
 };
 
+/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
 enum tcp_ca_ack_event_flags {
-	CA_ACK_SLOWPATH = (1 << 0),
+	CA_ACK_SLOWPATH		= (1 << 0),	/* In slow path processing */
+	CA_ACK_WIN_UPDATE	= (1 << 1),	/* ACK updated window */
+	CA_ACK_ECE		= (1 << 2),	/* ECE bit is set on ack */
 };
 
 /*
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8a38774cc66e..fc133178c787 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -233,14 +233,21 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
 			tcp_enter_quickack_mode((struct sock *)tp);
 		break;
 	case INET_ECN_CE:
+		if (tcp_ca_needs_ecn((struct sock *)tp))
+			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 			/* Better not delay acks, sender can have a very low cwnd */
 			tcp_enter_quickack_mode((struct sock *)tp);
 			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 		}
-		/* fallinto */
-	default:
 		tp->ecn_flags |= TCP_ECN_SEEN;
+		break;
+	default:
+		if (tcp_ca_needs_ecn((struct sock *)tp))
+			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
+		tp->ecn_flags |= TCP_ECN_SEEN;
+		break;
 	}
 }
 
@@ -3429,10 +3436,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tp->snd_una = ack;
 		flag |= FLAG_WIN_UPDATE;
 
-		tcp_in_ack_event(sk, 0);
+		tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
 
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
 	} else {
+		u32 ack_ev_flags = CA_ACK_SLOWPATH;
+
 		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
 			flag |= FLAG_DATA;
 		else
@@ -3444,10 +3453,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 							&sack_rtt_us);
 
-		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
+		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) {
 			flag |= FLAG_ECE;
+			ack_ev_flags |= CA_ACK_ECE;
+		}
 
-		tcp_in_ack_event(sk, CA_ACK_SLOWPATH);
+		if (flag & FLAG_WIN_UPDATE)
+			ack_ev_flags |= CA_ACK_WIN_UPDATE;
+
+		tcp_in_ack_event(sk, ack_ev_flags);
 	}
 
 	/* We passed data and got it acked, remove any soft error
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 20e73271d75c..124f9e4e4594 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3130,6 +3130,8 @@ void tcp_send_delayed_ack(struct sock *sk)
 	int ato = icsk->icsk_ack.ato;
 	unsigned long timeout;
 
+	tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
+
 	if (ato > TCP_DELACK_MIN) {
 		const struct tcp_sock *tp = tcp_sk(sk);
 		int max_ato = HZ / 2;
@@ -3186,6 +3188,8 @@ void tcp_send_ack(struct sock *sk)
 	if (sk->sk_state == TCP_CLOSE)
 		return;
 
+	tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
+
 	/* We are not putting this on the write queue, so
 	 * tcp_transmit_skb() will set the ownership to this
 	 * sock.

From e3118e8359bb7c59555aca60c725106e6d78c5ce Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 26 Sep 2014 22:37:36 +0200
Subject: [PATCH 5/5] net: tcp: add DCTCP congestion control algorithm

This work adds the DataCenter TCP (DCTCP) congestion control
algorithm [1], which has been first published at SIGCOMM 2010 [2],
resp. follow-up analysis at SIGMETRICS 2011 [3] (and also, more
recently as an informational IETF draft available at [4]).

DCTCP is an enhancement to the TCP congestion control algorithm for
data center networks. Typical data center workloads are i.e.
i) partition/aggregate (queries; bursty, delay sensitive), ii) short
messages e.g. 50KB-1MB (for coordination and control state; delay
sensitive), and iii) large flows e.g. 1MB-100MB (data update;
throughput sensitive). DCTCP has therefore been designed for such
environments to provide/achieve the following three requirements:

  * High burst tolerance (incast due to partition/aggregate)
  * Low latency (short flows, queries)
  * High throughput (continuous data updates, large file
    transfers) with commodity, shallow buffered switches

The basic idea of its design consists of two fundamentals: i) on the
switch side, packets are being marked when its internal queue
length > threshold K (K is chosen so that a large enough headroom
for marked traffic is still available in the switch queue); ii) the
sender/host side maintains a moving average of the fraction of marked
packets, so each RTT, F is being updated as follows:

 F := X / Y, where X is # of marked ACKs, Y is total # of ACKs
 alpha := (1 - g) * alpha + g * F, where g is a smoothing constant

The resulting alpha (iow: probability that switch queue is congested)
is then being used in order to adaptively decrease the congestion
window W:

 W := (1 - (alpha / 2)) * W

The means for receiving marked packets resp. marking them on switch
side in DCTCP is the use of ECN.

RFC3168 describes a mechanism for using Explicit Congestion Notification
from the switch for early detection of congestion, rather than waiting
for segment loss to occur.

However, this method only detects the presence of congestion, not
the *extent*. In the presence of mild congestion, it reduces the TCP
congestion window too aggressively and unnecessarily affects the
throughput of long flows [4].

DCTCP, as mentioned, enhances Explicit Congestion Notification (ECN)
processing to estimate the fraction of bytes that encounter congestion,
rather than simply detecting that some congestion has occurred. DCTCP
then scales the TCP congestion window based on this estimate [4],
thus it can derive multibit feedback from the information present in
the single-bit sequence of marks in its control law. And thus act in
*proportion* to the extent of congestion, not its *presence*.

Switches therefore set the Congestion Experienced (CE) codepoint in
packets when internal queue lengths exceed threshold K. Resulting,
DCTCP delivers the same or better throughput than normal TCP, while
using 90% less buffer space.

It was found in [2] that DCTCP enables the applications to handle 10x
the current background traffic, without impacting foreground traffic.
Moreover, a 10x increase in foreground traffic did not cause any
timeouts, and thus largely eliminates TCP incast collapse problems.

The algorithm itself has already seen deployments in large production
data centers since then.

We did a long-term stress-test and analysis in a data center, short
summary of our TCP incast tests with iperf compared to cubic:

This test measured DCTCP throughput and latency and compared it with
CUBIC throughput and latency for an incast scenario. In this test, 19
senders sent at maximum rate to a single receiver. The receiver simply
ran iperf -s.

The senders ran iperf -c <receiver> -t 30. All senders started
simultaneously (using local clocks synchronized by ntp).

This test was repeated multiple times. Below shows the results from a
single test. Other tests are similar. (DCTCP results were extremely
consistent, CUBIC results show some variance induced by the TCP timeouts
that CUBIC encountered.)

For this test, we report statistics on the number of TCP timeouts,
flow throughput, and traffic latency.

1) Timeouts (total over all flows, and per flow summaries):

            CUBIC            DCTCP
  Total     3227             25
  Mean       169.842          1.316
  Median     183              1
  Max        207              5
  Min        123              0
  Stddev      28.991          1.600

Timeout data is taken by measuring the net change in netstat -s
"other TCP timeouts" reported. As a result, the timeout measurements
above are not restricted to the test traffic, and we believe that it
is likely that all of the "DCTCP timeouts" are actually timeouts for
non-test traffic. We report them nevertheless. CUBIC will also include
some non-test timeouts, but they are drawfed by bona fide test traffic
timeouts for CUBIC. Clearly DCTCP does an excellent job of preventing
TCP timeouts. DCTCP reduces timeouts by at least two orders of
magnitude and may well have eliminated them in this scenario.

2) Throughput (per flow in Mbps):

            CUBIC            DCTCP
  Mean      521.684          521.895
  Median    464              523
  Max       776              527
  Min       403              519
  Stddev    105.891            2.601
  Fairness    0.962            0.999

Throughput data was simply the average throughput for each flow
reported by iperf. By avoiding TCP timeouts, DCTCP is able to
achieve much better per-flow results. In CUBIC, many flows
experience TCP timeouts which makes flow throughput unpredictable and
unfair. DCTCP, on the other hand, provides very clean predictable
throughput without incurring TCP timeouts. Thus, the standard deviation
of CUBIC throughput is dramatically higher than the standard deviation
of DCTCP throughput.

Mean throughput is nearly identical because even though cubic flows
suffer TCP timeouts, other flows will step in and fill the unused
bandwidth. Note that this test is something of a best case scenario
for incast under CUBIC: it allows other flows to fill in for flows
experiencing a timeout. Under situations where the receiver is issuing
requests and then waiting for all flows to complete, flows cannot fill
in for timed out flows and throughput will drop dramatically.

3) Latency (in ms):

            CUBIC            DCTCP
  Mean      4.0088           0.04219
  Median    4.055            0.0395
  Max       4.2              0.085
  Min       3.32             0.028
  Stddev    0.1666           0.01064

Latency for each protocol was computed by running "ping -i 0.2
<receiver>" from a single sender to the receiver during the incast
test. For DCTCP, "ping -Q 0x6 -i 0.2 <receiver>" was used to ensure
that traffic traversed the DCTCP queue and was not dropped when the
queue size was greater than the marking threshold. The summary
statistics above are over all ping metrics measured between the single
sender, receiver pair.

The latency results for this test show a dramatic difference between
CUBIC and DCTCP. CUBIC intentionally overflows the switch buffer
which incurs the maximum queue latency (more buffer memory will lead
to high latency.) DCTCP, on the other hand, deliberately attempts to
keep queue occupancy low. The result is a two orders of magnitude
reduction of latency with DCTCP - even with a switch with relatively
little RAM. Switches with larger amounts of RAM will incur increasing
amounts of latency for CUBIC, but not for DCTCP.

4) Convergence and stability test:

This test measured the time that DCTCP took to fairly redistribute
bandwidth when a new flow commences. It also measured DCTCP's ability
to remain stable at a fair bandwidth distribution. DCTCP is compared
with CUBIC for this test.

At the commencement of this test, a single flow is sending at maximum
rate (near 10 Gbps) to a single receiver. One second after that first
flow commences, a new flow from a distinct server begins sending to
the same receiver as the first flow. After the second flow has sent
data for 10 seconds, the second flow is terminated. The first flow
sends for an additional second. Ideally, the bandwidth would be evenly
shared as soon as the second flow starts, and recover as soon as it
stops.

The results of this test are shown below. Note that the flow bandwidth
for the two flows was measured near the same time, but not
simultaneously.

DCTCP performs nearly perfectly within the measurement limitations
of this test: bandwidth is quickly distributed fairly between the two
flows, remains stable throughout the duration of the test, and
recovers quickly. CUBIC, in contrast, is slow to divide the bandwidth
fairly, and has trouble remaining stable.

  CUBIC                      DCTCP

  Seconds  Flow 1  Flow 2    Seconds  Flow 1  Flow 2
   0       9.93    0          0       9.92    0
   0.5     9.87    0          0.5     9.86    0
   1       8.73    2.25       1       6.46    4.88
   1.5     7.29    2.8        1.5     4.9     4.99
   2       6.96    3.1        2       4.92    4.94
   2.5     6.67    3.34       2.5     4.93    5
   3       6.39    3.57       3       4.92    4.99
   3.5     6.24    3.75       3.5     4.94    4.74
   4       6       3.94       4       5.34    4.71
   4.5     5.88    4.09       4.5     4.99    4.97
   5       5.27    4.98       5       4.83    5.01
   5.5     4.93    5.04       5.5     4.89    4.99
   6       4.9     4.99       6       4.92    5.04
   6.5     4.93    5.1        6.5     4.91    4.97
   7       4.28    5.8        7       4.97    4.97
   7.5     4.62    4.91       7.5     4.99    4.82
   8       5.05    4.45       8       5.16    4.76
   8.5     5.93    4.09       8.5     4.94    4.98
   9       5.73    4.2        9       4.92    5.02
   9.5     5.62    4.32       9.5     4.87    5.03
  10       6.12    3.2       10       4.91    5.01
  10.5     6.91    3.11      10.5     4.87    5.04
  11       8.48    0         11       8.49    4.94
  11.5     9.87    0         11.5     9.9     0

SYN/ACK ECT test:

This test demonstrates the importance of ECT on SYN and SYN-ACK packets
by measuring the connection probability in the presence of competing
flows for a DCTCP connection attempt *without* ECT in the SYN packet.
The test was repeated five times for each number of competing flows.

              Competing Flows  1 |    2 |    4 |    8 |   16
                               ------------------------------
Mean Connection Probability    1 | 0.67 | 0.45 | 0.28 |    0
Median Connection Probability  1 | 0.65 | 0.45 | 0.25 |    0

As the number of competing flows moves beyond 1, the connection
probability drops rapidly.

Enabling DCTCP with this patch requires the following steps:

DCTCP must be running both on the sender and receiver side in your
data center, i.e.:

  sysctl -w net.ipv4.tcp_congestion_control=dctcp

Also, ECN functionality must be enabled on all switches in your
data center for DCTCP to work. The default ECN marking threshold (K)
heuristic on the switch for DCTCP is e.g., 20 packets (30KB) at
1Gbps, and 65 packets (~100KB) at 10Gbps (K > 1/7 * C * RTT, [4]).

In above tests, for each switch port, traffic was segregated into two
queues. For any packet with a DSCP of 0x01 - or equivalently a TOS of
0x04 - the packet was placed into the DCTCP queue. All other packets
were placed into the default drop-tail queue. For the DCTCP queue,
RED/ECN marking was enabled, here, with a marking threshold of 75 KB.
More details however, we refer you to the paper [2] under section 3).

There are no code changes required to applications running in user
space. DCTCP has been implemented in full *isolation* of the rest of
the TCP code as its own congestion control module, so that it can run
without a need to expose code to the core of the TCP stack, and thus
nothing changes for non-DCTCP users.

Changes in the CA framework code are minimal, and DCTCP algorithm
operates on mechanisms that are already available in most Silicon.
The gain (dctcp_shift_g) is currently a fixed constant (1/16) from
the paper, but we leave the option that it can be chosen carefully
to a different value by the user.

In case DCTCP is being used and ECN support on peer site is off,
DCTCP falls back after 3WHS to operate in normal TCP Reno mode.

ss {-4,-6} -t -i diag interface:

  ... dctcp wscale:7,7 rto:203 rtt:2.349/0.026 mss:1448 cwnd:2054
  ssthresh:1102 ce_state 0 alpha 15 ab_ecn 0 ab_tot 735584
  send 10129.2Mbps pacing_rate 20254.1Mbps unacked:1822 retrans:0/15
  reordering:101 rcv_space:29200

  ... dctcp-reno wscale:7,7 rto:201 rtt:0.711/1.327 ato:40 mss:1448
  cwnd:10 ssthresh:1102 fallback_mode send 162.9Mbps pacing_rate
  325.5Mbps rcv_rtt:1.5 rcv_space:29200

More information about DCTCP can be found in [1-4].

  [1] http://simula.stanford.edu/~alizade/Site/DCTCP.html
  [2] http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
  [3] http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
  [4] http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00

Joint work with Florian Westphal and Glenn Judd.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Glenn Judd <glenn.judd@morganstanley.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dctcp.txt |  43 ++++
 include/uapi/linux/inet_diag.h     |  13 +-
 net/ipv4/Kconfig                   |  26 ++-
 net/ipv4/Makefile                  |   1 +
 net/ipv4/tcp_dctcp.c               | 344 +++++++++++++++++++++++++++++
 net/ipv4/tcp_output.c              |   1 +
 6 files changed, 425 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/networking/dctcp.txt
 create mode 100644 net/ipv4/tcp_dctcp.c

diff --git a/Documentation/networking/dctcp.txt b/Documentation/networking/dctcp.txt
new file mode 100644
index 000000000000..0d5dfbc89ec9
--- /dev/null
+++ b/Documentation/networking/dctcp.txt
@@ -0,0 +1,43 @@
+DCTCP (DataCenter TCP)
+----------------------
+
+DCTCP is an enhancement to the TCP congestion control algorithm for data
+center networks and leverages Explicit Congestion Notification (ECN) in
+the data center network to provide multi-bit feedback to the end hosts.
+
+To enable it on end hosts:
+
+  sysctl -w net.ipv4.tcp_congestion_control=dctcp
+
+All switches in the data center network running DCTCP must support ECN
+marking and be configured for marking when reaching defined switch buffer
+thresholds. The default ECN marking threshold heuristic for DCTCP on
+switches is 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps,
+but might need further careful tweaking.
+
+For more details, see below documents:
+
+Paper:
+
+The algorithm is further described in detail in the following two
+SIGCOMM/SIGMETRICS papers:
+
+ i) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
+    Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
+      "Data Center TCP (DCTCP)", Data Center Networks session
+      Proc. ACM SIGCOMM, New Delhi, 2010.
+    http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+    http://www.sigcomm.org/ccr/papers/2010/October/1851275.1851192
+
+ii) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
+      "Analysis of DCTCP: Stability, Convergence, and Fairness"
+      Proc. ACM SIGMETRICS, San Jose, 2011.
+    http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
+
+IETF informational draft:
+
+  http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00
+
+DCTCP site:
+
+  http://simula.stanford.edu/~alizade/Site/DCTCP.html
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index bbde90fa5838..d65c0a09efd3 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -110,10 +110,10 @@ enum {
 	INET_DIAG_TCLASS,
 	INET_DIAG_SKMEMINFO,
 	INET_DIAG_SHUTDOWN,
+	INET_DIAG_DCTCPINFO,
 };
 
-#define INET_DIAG_MAX INET_DIAG_SHUTDOWN
-
+#define INET_DIAG_MAX INET_DIAG_DCTCPINFO
 
 /* INET_DIAG_MEM */
 
@@ -133,5 +133,14 @@ struct tcpvegas_info {
 	__u32	tcpv_minrtt;
 };
 
+/* INET_DIAG_DCTCPINFO */
+
+struct tcp_dctcp_info {
+	__u16	dctcp_enabled;
+	__u16	dctcp_ce_state;
+	__u32	dctcp_alpha;
+	__u32	dctcp_ab_ecn;
+	__u32	dctcp_ab_tot;
+};
 
 #endif /* _UAPI_INET_DIAG_H_ */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 84f710b7472a..69fb37854449 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -570,6 +570,27 @@ config TCP_CONG_ILLINOIS
 	For further details see:
 	  http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
 
+config TCP_CONG_DCTCP
+	tristate "DataCenter TCP (DCTCP)"
+	default n
+	---help---
+	DCTCP leverages Explicit Congestion Notification (ECN) in the network to
+	provide multi-bit feedback to the end hosts. It is designed to provide:
+
+	- High burst tolerance (incast due to partition/aggregate),
+	- Low latency (short flows, queries),
+	- High throughput (continuous data updates, large file transfers) with
+	  commodity, shallow-buffered switches.
+
+	All switches in the data center network running DCTCP must support
+	ECN marking and be configured for marking when reaching defined switch
+	buffer thresholds. The default ECN marking threshold heuristic for
+	DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
+	(~100KB) at 10Gbps, but might need further careful tweaking.
+
+	For further details see:
+	  http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+
 choice
 	prompt "Default TCP congestion control"
 	default DEFAULT_CUBIC
@@ -598,9 +619,11 @@ choice
 	config DEFAULT_WESTWOOD
 		bool "Westwood" if TCP_CONG_WESTWOOD=y
 
+	config DEFAULT_DCTCP
+		bool "DCTCP" if TCP_CONG_DCTCP=y
+
 	config DEFAULT_RENO
 		bool "Reno"
-
 endchoice
 
 endif
@@ -620,6 +643,7 @@ config DEFAULT_TCP_CONG
 	default "westwood" if DEFAULT_WESTWOOD
 	default "veno" if DEFAULT_VENO
 	default "reno" if DEFAULT_RENO
+	default "dctcp" if DEFAULT_DCTCP
 	default "cubic"
 
 config TCP_MD5SIG
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index d78d404c596f..d8105787c199 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
 obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
+obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
new file mode 100644
index 000000000000..b504371af742
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.c
@@ -0,0 +1,344 @@
+/* DataCenter TCP (DCTCP) congestion control.
+ *
+ * http://simula.stanford.edu/~alizade/Site/DCTCP.html
+ *
+ * This is an implementation of DCTCP over Reno, an enhancement to the
+ * TCP congestion control algorithm designed for data centers. DCTCP
+ * leverages Explicit Congestion Notification (ECN) in the network to
+ * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet
+ * the following three data center transport requirements:
+ *
+ *  - High burst tolerance (incast due to partition/aggregate)
+ *  - Low latency (short flows, queries)
+ *  - High throughput (continuous data updates, large file transfers)
+ *    with commodity shallow buffered switches
+ *
+ * The algorithm is described in detail in the following two papers:
+ *
+ * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
+ *    Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
+ *      "Data Center TCP (DCTCP)", Data Center Networks session
+ *      Proc. ACM SIGCOMM, New Delhi, 2010.
+ *   http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+ *
+ * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
+ *      "Analysis of DCTCP: Stability, Convergence, and Fairness"
+ *      Proc. ACM SIGMETRICS, San Jose, 2011.
+ *   http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
+ *
+ * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
+ *
+ * Authors:
+ *
+ *	Daniel Borkmann <dborkman@redhat.com>
+ *	Florian Westphal <fw@strlen.de>
+ *	Glenn Judd <glenn.judd@morganstanley.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+
+#define DCTCP_MAX_ALPHA	1024U
+
+struct dctcp {
+	u32 acked_bytes_ecn;
+	u32 acked_bytes_total;
+	u32 prior_snd_una;
+	u32 prior_rcv_nxt;
+	u32 dctcp_alpha;
+	u32 next_seq;
+	u32 ce_state;
+	u32 delayed_ack_reserved;
+};
+
+static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
+module_param(dctcp_shift_g, uint, 0644);
+MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");
+
+static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
+module_param(dctcp_alpha_on_init, uint, 0644);
+MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
+
+static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
+module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
+MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
+		 "parameter for clamping alpha on loss");
+
+static struct tcp_congestion_ops dctcp_reno;
+
+static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
+{
+	ca->next_seq = tp->snd_nxt;
+
+	ca->acked_bytes_ecn = 0;
+	ca->acked_bytes_total = 0;
+}
+
+static void dctcp_init(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	if ((tp->ecn_flags & TCP_ECN_OK) ||
+	    (sk->sk_state == TCP_LISTEN ||
+	     sk->sk_state == TCP_CLOSE)) {
+		struct dctcp *ca = inet_csk_ca(sk);
+
+		ca->prior_snd_una = tp->snd_una;
+		ca->prior_rcv_nxt = tp->rcv_nxt;
+
+		ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
+
+		ca->delayed_ack_reserved = 0;
+		ca->ce_state = 0;
+
+		dctcp_reset(tp, ca);
+		return;
+	}
+
+	/* No ECN support? Fall back to Reno. Also need to clear
+	 * ECT from sk since it is set during 3WHS for DCTCP.
+	 */
+	inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
+	INET_ECN_dontxmit(sk);
+}
+
+static u32 dctcp_ssthresh(struct sock *sk)
+{
+	const struct dctcp *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
+}
+
+/* Minimal DCTP CE state machine:
+ *
+ * S:	0 <- last pkt was non-CE
+ *	1 <- last pkt was CE
+ */
+
+static void dctcp_ce_state_0_to_1(struct sock *sk)
+{
+	struct dctcp *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* State has changed from CE=0 to CE=1 and delayed
+	 * ACK has not sent yet.
+	 */
+	if (!ca->ce_state && ca->delayed_ack_reserved) {
+		u32 tmp_rcv_nxt;
+
+		/* Save current rcv_nxt. */
+		tmp_rcv_nxt = tp->rcv_nxt;
+
+		/* Generate previous ack with CE=0. */
+		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+		tp->rcv_nxt = ca->prior_rcv_nxt;
+
+		tcp_send_ack(sk);
+
+		/* Recover current rcv_nxt. */
+		tp->rcv_nxt = tmp_rcv_nxt;
+	}
+
+	ca->prior_rcv_nxt = tp->rcv_nxt;
+	ca->ce_state = 1;
+
+	tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+}
+
+static void dctcp_ce_state_1_to_0(struct sock *sk)
+{
+	struct dctcp *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* State has changed from CE=1 to CE=0 and delayed
+	 * ACK has not sent yet.
+	 */
+	if (ca->ce_state && ca->delayed_ack_reserved) {
+		u32 tmp_rcv_nxt;
+
+		/* Save current rcv_nxt. */
+		tmp_rcv_nxt = tp->rcv_nxt;
+
+		/* Generate previous ack with CE=1. */
+		tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+		tp->rcv_nxt = ca->prior_rcv_nxt;
+
+		tcp_send_ack(sk);
+
+		/* Recover current rcv_nxt. */
+		tp->rcv_nxt = tmp_rcv_nxt;
+	}
+
+	ca->prior_rcv_nxt = tp->rcv_nxt;
+	ca->ce_state = 0;
+
+	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+static void dctcp_update_alpha(struct sock *sk, u32 flags)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct dctcp *ca = inet_csk_ca(sk);
+	u32 acked_bytes = tp->snd_una - ca->prior_snd_una;
+
+	/* If ack did not advance snd_una, count dupack as MSS size.
+	 * If ack did update window, do not count it at all.
+	 */
+	if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE))
+		acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
+	if (acked_bytes) {
+		ca->acked_bytes_total += acked_bytes;
+		ca->prior_snd_una = tp->snd_una;
+
+		if (flags & CA_ACK_ECE)
+			ca->acked_bytes_ecn += acked_bytes;
+	}
+
+	/* Expired RTT */
+	if (!before(tp->snd_una, ca->next_seq)) {
+		/* For avoiding denominator == 1. */
+		if (ca->acked_bytes_total == 0)
+			ca->acked_bytes_total = 1;
+
+		/* alpha = (1 - g) * alpha + g * F */
+		ca->dctcp_alpha = ca->dctcp_alpha -
+				  (ca->dctcp_alpha >> dctcp_shift_g) +
+				  (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) /
+				  ca->acked_bytes_total;
+
+		if (ca->dctcp_alpha > DCTCP_MAX_ALPHA)
+			/* Clamp dctcp_alpha to max. */
+			ca->dctcp_alpha = DCTCP_MAX_ALPHA;
+
+		dctcp_reset(tp, ca);
+	}
+}
+
+static void dctcp_state(struct sock *sk, u8 new_state)
+{
+	if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) {
+		struct dctcp *ca = inet_csk_ca(sk);
+
+		/* If this extension is enabled, we clamp dctcp_alpha to
+		 * max on packet loss; the motivation is that dctcp_alpha
+		 * is an indicator to the extend of congestion and packet
+		 * loss is an indicator of extreme congestion; setting
+		 * this in practice turned out to be beneficial, and
+		 * effectively assumes total congestion which reduces the
+		 * window by half.
+		 */
+		ca->dctcp_alpha = DCTCP_MAX_ALPHA;
+	}
+}
+
+static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
+{
+	struct dctcp *ca = inet_csk_ca(sk);
+
+	switch (ev) {
+	case CA_EVENT_DELAYED_ACK:
+		if (!ca->delayed_ack_reserved)
+			ca->delayed_ack_reserved = 1;
+		break;
+	case CA_EVENT_NON_DELAYED_ACK:
+		if (ca->delayed_ack_reserved)
+			ca->delayed_ack_reserved = 0;
+		break;
+	default:
+		/* Don't care for the rest. */
+		break;
+	}
+}
+
+static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+{
+	switch (ev) {
+	case CA_EVENT_ECN_IS_CE:
+		dctcp_ce_state_0_to_1(sk);
+		break;
+	case CA_EVENT_ECN_NO_CE:
+		dctcp_ce_state_1_to_0(sk);
+		break;
+	case CA_EVENT_DELAYED_ACK:
+	case CA_EVENT_NON_DELAYED_ACK:
+		dctcp_update_ack_reserved(sk, ev);
+		break;
+	default:
+		/* Don't care for the rest. */
+		break;
+	}
+}
+
+static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+{
+	const struct dctcp *ca = inet_csk_ca(sk);
+
+	/* Fill it also in case of VEGASINFO due to req struct limits.
+	 * We can still correctly retrieve it later.
+	 */
+	if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
+	    ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+		struct tcp_dctcp_info info;
+
+		memset(&info, 0, sizeof(info));
+		if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
+			info.dctcp_enabled = 1;
+			info.dctcp_ce_state = (u16) ca->ce_state;
+			info.dctcp_alpha = ca->dctcp_alpha;
+			info.dctcp_ab_ecn = ca->acked_bytes_ecn;
+			info.dctcp_ab_tot = ca->acked_bytes_total;
+		}
+
+		nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
+	}
+}
+
+static struct tcp_congestion_ops dctcp __read_mostly = {
+	.init		= dctcp_init,
+	.in_ack_event   = dctcp_update_alpha,
+	.cwnd_event	= dctcp_cwnd_event,
+	.ssthresh	= dctcp_ssthresh,
+	.cong_avoid	= tcp_reno_cong_avoid,
+	.set_state	= dctcp_state,
+	.get_info	= dctcp_get_info,
+	.flags		= TCP_CONG_NEEDS_ECN,
+	.owner		= THIS_MODULE,
+	.name		= "dctcp",
+};
+
+static struct tcp_congestion_ops dctcp_reno __read_mostly = {
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_reno_cong_avoid,
+	.get_info	= dctcp_get_info,
+	.owner		= THIS_MODULE,
+	.name		= "dctcp-reno",
+};
+
+static int __init dctcp_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&dctcp);
+}
+
+static void __exit dctcp_unregister(void)
+{
+	tcp_unregister_congestion_control(&dctcp);
+}
+
+module_init(dctcp_register);
+module_exit(dctcp_unregister);
+
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 124f9e4e4594..86a0216fcaa1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3211,6 +3211,7 @@ void tcp_send_ack(struct sock *sk)
 	skb_mstamp_get(&buff->skb_mstamp);
 	tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
 }
+EXPORT_SYMBOL_GPL(tcp_send_ack);
 
 /* This routine sends a packet with an out of date sequence
  * number. It assumes the other end will try to ack it.