From f711a6ae062caeee46067b2f2f12ffda319ae73c Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Tue, 5 May 2015 15:20:51 -0400 Subject: [PATCH 1/2] net/rds: RDS-TCP: Always create a new rds_sock for an incoming connection. When running RDS over TCP, the active (client) side connects to the listening ("passive") side at the RDS_TCP_PORT. After the connection is established, if the client side reboots (potentially without even sending a FIN) the server still has a TCP socket in the esablished state. If the server-side now gets a new SYN comes from the client with a different client port, TCP will create a new socket-pair, but the RDS layer will incorrectly pull up the old rds_connection (which is still associated with the stale t_sock and RDS socket state). This patch corrects this behavior by having rds_tcp_accept_one() always create a new connection for an incoming TCP SYN. The rds and tcp state associated with the old socket-pair is cleaned up via the rds_tcp_state_change() callback which would typically be invoked in most cases when the client-TCP sends a FIN on TCP restart, triggering a transition to CLOSE_WAIT state. In the rarer event of client death without a FIN, TCP_KEEPALIVE probes on the socket will detect the stale socket, and the TCP transition to CLOSE state will trigger the RDS state cleanup. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/connection.c | 4 ++++ net/rds/tcp_connect.c | 1 + net/rds/tcp_listen.c | 46 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/net/rds/connection.c b/net/rds/connection.c index 14f041398ca1..60f0cd6ed15f 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -126,7 +126,10 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, struct rds_transport *loop_trans; unsigned long flags; int ret; + struct rds_transport *otrans = trans; + if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) + goto new_conn; rcu_read_lock(); conn = rds_conn_lookup(head, laddr, faddr, trans); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && @@ -142,6 +145,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, if (conn) goto out; +new_conn: conn = kmem_cache_zalloc(rds_conn_slab, gfp); if (!conn) { conn = ERR_PTR(-ENOMEM); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index f9f564a6c960..973109c7b8e8 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -62,6 +62,7 @@ void rds_tcp_state_change(struct sock *sk) case TCP_ESTABLISHED: rds_connect_complete(conn); break; + case TCP_CLOSE_WAIT: case TCP_CLOSE: rds_conn_drop(conn); default: diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 23ab4dcd1d9f..0da49e34495f 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -45,12 +45,45 @@ static void rds_tcp_accept_worker(struct work_struct *work); static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); static struct socket *rds_tcp_listen_sock; +static int rds_tcp_keepalive(struct socket *sock) +{ + /* values below based on xs_udp_default_timeout */ + int keepidle = 5; /* send a probe 'keepidle' secs after last data */ + int keepcnt = 5; /* number of unack'ed probes before declaring dead */ + int keepalive = 1; + int ret = 0; + + ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&keepalive, sizeof(keepalive)); + if (ret < 0) + goto bail; + + ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, + (char *)&keepcnt, sizeof(keepcnt)); + if (ret < 0) + goto bail; + + ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, + (char *)&keepidle, sizeof(keepidle)); + if (ret < 0) + goto bail; + + /* KEEPINTVL is the interval between successive probes. We follow + * the model in xs_tcp_finish_connecting() and re-use keepidle. + */ + ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, + (char *)&keepidle, sizeof(keepidle)); +bail: + return ret; +} + static int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; struct rds_connection *conn; int ret; struct inet_sock *inet; + struct rds_tcp_connection *rs_tcp; ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, sock->sk->sk_protocol, &new_sock); @@ -63,6 +96,10 @@ static int rds_tcp_accept_one(struct socket *sock) if (ret < 0) goto out; + ret = rds_tcp_keepalive(new_sock); + if (ret < 0) + goto out; + rds_tcp_tune(new_sock); inet = inet_sk(new_sock->sk); @@ -77,6 +114,15 @@ static int rds_tcp_accept_one(struct socket *sock) ret = PTR_ERR(conn); goto out; } + /* An incoming SYN request came in, and TCP just accepted it. + * We always create a new conn for listen side of TCP, and do not + * add it to the c_hash_list. + * + * If the client reboots, this conn will need to be cleaned up. + * rds_tcp_state_change() will do that cleanup + */ + rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; + WARN_ON(!rs_tcp || rs_tcp->t_sock); /* * see the comment above rds_queue_delayed_reconnect() From c82ac7e69efe6dbe370d6ba84e2666d7692ef1c2 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Tue, 5 May 2015 15:20:52 -0400 Subject: [PATCH 2/2] net/rds: RDS-TCP: only initiate reconnect attempt on outgoing TCP socket. When the peer of an RDS-TCP connection restarts, a reconnect attempt should only be made from the active side of the TCP connection, i.e. the side that has a transient TCP port number. Do not add the passive side of the TCP connection to the c_hash_node and thus avoid triggering rds_queue_reconnect() for passive rds connections. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/connection.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/rds/connection.c b/net/rds/connection.c index 60f0cd6ed15f..da6da57e5f36 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -234,13 +234,22 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, /* Creating normal conn */ struct rds_connection *found; - found = rds_conn_lookup(head, laddr, faddr, trans); + if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) + found = NULL; + else + found = rds_conn_lookup(head, laddr, faddr, trans); if (found) { trans->conn_free(conn->c_transport_data); kmem_cache_free(rds_conn_slab, conn); conn = found; } else { - hlist_add_head_rcu(&conn->c_hash_node, head); + if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) || + (otrans->t_type != RDS_TRANS_TCP)) { + /* Only the active side should be added to + * reconnect list for TCP. + */ + hlist_add_head_rcu(&conn->c_hash_node, head); + } rds_cong_add_conn(conn); rds_conn_count++; }