From 1e2b44e78eead7bcadfbf96f70d95773191541c9 Mon Sep 17 00:00:00 2001 From: Ka-Cheong Poon Date: Mon, 23 Jul 2018 20:51:22 -0700 Subject: [PATCH] rds: Enable RDS IPv6 support This patch enables RDS to use IPv6 addresses. For RDS/TCP, the listener is now an IPv6 endpoint which accepts both IPv4 and IPv6 connection requests. RDS/RDMA/IB uses a private data (struct rds_ib_connect_private) exchange between endpoints at RDS connection establishment time to support RDMA. This private data exchange uses a 32 bit integer to represent an IP address. This needs to be changed in order to support IPv6. A new private data struct rds6_ib_connect_private is introduced to handle this. To ensure backward compatibility, an IPv6 capable RDS stack uses another RDMA listener port (RDS_CM_PORT) to accept IPv6 connection. And it continues to use the original RDS_PORT for IPv4 RDS connections. When it needs to communicate with an IPv6 peer, it uses the RDS_CM_PORT to send the connection set up request. v5: Fixed syntax problem (David Miller). v4: Changed port history comments in rds.h (Sowmini Varadhan). v3: Added support to set up IPv4 connection using mapped address (David Miller). Added support to set up connection between link local and non-link addresses. Various review comments from Santosh Shilimkar and Sowmini Varadhan. v2: Fixed bound and peer address scope mismatched issue. Added back rds_connect() IPv6 changes. Signed-off-by: Ka-Cheong Poon Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/af_rds.c | 91 +++++++++++++++++++++++++++++++++------- net/rds/bind.c | 59 ++++++++++++++++++++++---- net/rds/connection.c | 54 +++++++++++++++++------- net/rds/ib.c | 55 ++++++++++++++++++++---- net/rds/ib_cm.c | 20 ++++++--- net/rds/rdma_transport.c | 30 ++++++++++++- net/rds/rdma_transport.h | 5 +++ net/rds/rds.h | 22 ++++++---- net/rds/recv.c | 2 +- net/rds/send.c | 61 +++++++++++++++++++++++---- net/rds/tcp.c | 54 +++++++++++++++--------- net/rds/tcp.h | 2 +- net/rds/tcp_connect.c | 54 ++++++++++++++++++------ net/rds/tcp_listen.c | 64 ++++++++++++++++++++++------ 14 files changed, 459 insertions(+), 114 deletions(-) diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index fc1a5c63b783..fc5c48b248fe 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -142,15 +142,32 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr, uaddr_len = sizeof(*sin6); } } else { - /* If socket is not yet bound, set the return address family - * to be AF_UNSPEC (value 0) and the address size to be that - * of an IPv4 address. + /* If socket is not yet bound and the socket is connected, + * set the return address family to be the same as the + * connected address, but with 0 address value. If it is not + * connected, set the family to be AF_UNSPEC (value 0) and + * the address size to be that of an IPv4 address. */ if (ipv6_addr_any(&rs->rs_bound_addr)) { - sin = (struct sockaddr_in *)uaddr; - memset(sin, 0, sizeof(*sin)); - sin->sin_family = AF_UNSPEC; - return sizeof(*sin); + if (ipv6_addr_any(&rs->rs_conn_addr)) { + sin = (struct sockaddr_in *)uaddr; + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_UNSPEC; + return sizeof(*sin); + } + + if (ipv6_addr_type(&rs->rs_conn_addr) & + IPV6_ADDR_MAPPED) { + sin = (struct sockaddr_in *)uaddr; + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_INET; + return sizeof(*sin); + } + + sin6 = (struct sockaddr_in6 *)uaddr; + memset(sin6, 0, sizeof(*sin6)); + sin6->sin6_family = AF_INET6; + return sizeof(*sin6); } if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) { sin = (struct sockaddr_in *)uaddr; @@ -484,16 +501,18 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, { struct sock *sk = sock->sk; struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; struct rds_sock *rs = rds_sk_to_rs(sk); + int addr_type; int ret = 0; lock_sock(sk); - switch (addr_len) { - case sizeof(struct sockaddr_in): + switch (uaddr->sa_family) { + case AF_INET: sin = (struct sockaddr_in *)uaddr; - if (sin->sin_family != AF_INET) { - ret = -EAFNOSUPPORT; + if (addr_len < sizeof(struct sockaddr_in)) { + ret = -EINVAL; break; } if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { @@ -509,12 +528,56 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, rs->rs_conn_port = sin->sin_port; break; - case sizeof(struct sockaddr_in6): - ret = -EPROTONOSUPPORT; + case AF_INET6: + sin6 = (struct sockaddr_in6 *)uaddr; + if (addr_len < sizeof(struct sockaddr_in6)) { + ret = -EINVAL; + break; + } + addr_type = ipv6_addr_type(&sin6->sin6_addr); + if (!(addr_type & IPV6_ADDR_UNICAST)) { + __be32 addr4; + + if (!(addr_type & IPV6_ADDR_MAPPED)) { + ret = -EPROTOTYPE; + break; + } + + /* It is a mapped address. Need to do some sanity + * checks. + */ + addr4 = sin6->sin6_addr.s6_addr32[3]; + if (addr4 == htonl(INADDR_ANY) || + addr4 == htonl(INADDR_BROADCAST) || + IN_MULTICAST(ntohl(addr4))) { + ret = -EPROTOTYPE; + break; + } + } + + if (addr_type & IPV6_ADDR_LINKLOCAL) { + /* If socket is arleady bound to a link local address, + * the peer address must be on the same link. + */ + if (sin6->sin6_scope_id == 0 || + (!ipv6_addr_any(&rs->rs_bound_addr) && + rs->rs_bound_scope_id && + sin6->sin6_scope_id != rs->rs_bound_scope_id)) { + ret = -EINVAL; + break; + } + /* Remember the connected address scope ID. It will + * be checked against the binding local address when + * the socket is bound. + */ + rs->rs_bound_scope_id = sin6->sin6_scope_id; + } + rs->rs_conn_addr = sin6->sin6_addr; + rs->rs_conn_port = sin6->sin6_port; break; default: - ret = -EINVAL; + ret = -EAFNOSUPPORT; break; } diff --git a/net/rds/bind.c b/net/rds/bind.c index c401776ad938..ba778760cbc2 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -127,9 +127,10 @@ static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr, if (!rhashtable_insert_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms)) { *port = rs->rs_bound_port; + rs->rs_bound_scope_id = scope_id; ret = 0; - rdsdebug("rs %p binding to %pI4:%d\n", - rs, &addr, (int)ntohs(*port)); + rdsdebug("rs %p binding to %pI6c:%d\n", + rs, addr, (int)ntohs(*port)); break; } else { rs->rs_bound_addr = in6addr_any; @@ -164,23 +165,53 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct in6_addr v6addr, *binding_addr; struct rds_transport *trans; __u32 scope_id = 0; + int addr_type; int ret = 0; __be16 port; - /* We only allow an RDS socket to be bound to an IPv4 address. IPv6 - * address support will be added later. + /* We allow an RDS socket to be bound to either IPv4 or IPv6 + * address. */ - if (addr_len == sizeof(struct sockaddr_in)) { + if (uaddr->sa_family == AF_INET) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; - if (sin->sin_family != AF_INET || - sin->sin_addr.s_addr == htonl(INADDR_ANY)) + if (addr_len < sizeof(struct sockaddr_in) || + sin->sin_addr.s_addr == htonl(INADDR_ANY) || + sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || + IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return -EINVAL; ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr); binding_addr = &v6addr; port = sin->sin_port; - } else if (addr_len == sizeof(struct sockaddr_in6)) { - return -EPROTONOSUPPORT; + } else if (uaddr->sa_family == AF_INET6) { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr; + + if (addr_len < sizeof(struct sockaddr_in6)) + return -EINVAL; + addr_type = ipv6_addr_type(&sin6->sin6_addr); + if (!(addr_type & IPV6_ADDR_UNICAST)) { + __be32 addr4; + + if (!(addr_type & IPV6_ADDR_MAPPED)) + return -EINVAL; + + /* It is a mapped address. Need to do some sanity + * checks. + */ + addr4 = sin6->sin6_addr.s6_addr32[3]; + if (addr4 == htonl(INADDR_ANY) || + addr4 == htonl(INADDR_BROADCAST) || + IN_MULTICAST(ntohl(addr4))) + return -EINVAL; + } + /* The scope ID must be specified for link local address. */ + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (sin6->sin6_scope_id == 0) + return -EINVAL; + scope_id = sin6->sin6_scope_id; + } + binding_addr = &sin6->sin6_addr; + port = sin6->sin6_port; } else { return -EINVAL; } @@ -191,6 +222,16 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ret = -EINVAL; goto out; } + /* Socket is connected. The binding address should have the same + * scope ID as the connected address, except the case when one is + * non-link local address (scope_id is 0). + */ + if (!ipv6_addr_any(&rs->rs_conn_addr) && scope_id && + rs->rs_bound_scope_id && + scope_id != rs->rs_bound_scope_id) { + ret = -EINVAL; + goto out; + } ret = rds_add_bound(rs, binding_addr, &port, scope_id); if (ret) diff --git a/net/rds/connection.c b/net/rds/connection.c index 3176ead0ab4d..5c9ceed55dae 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -36,6 +36,7 @@ #include #include #include +#include #include "rds.h" #include "loop.h" @@ -200,6 +201,15 @@ static struct rds_connection *__rds_conn_create(struct net *net, conn->c_isv6 = !ipv6_addr_v4mapped(laddr); conn->c_faddr = *faddr; conn->c_dev_if = dev_if; + /* If the local address is link local, set c_bound_if to be the + * index used for this connection. Otherwise, set it to 0 as + * the socket is not bound to an interface. c_bound_if is used + * to look up a socket when a packet is received + */ + if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL) + conn->c_bound_if = dev_if; + else + conn->c_bound_if = 0; rds_conn_net_set(conn, net); @@ -486,10 +496,18 @@ void rds_conn_destroy(struct rds_connection *conn) } EXPORT_SYMBOL_GPL(rds_conn_destroy); -static void rds_conn_message_info(struct socket *sock, unsigned int len, - struct rds_info_iterator *iter, - struct rds_info_lengths *lens, - int want_send) +static void __rds_inc_msg_cp(struct rds_incoming *inc, + struct rds_info_iterator *iter, + void *saddr, void *daddr, int flip) +{ + rds_inc_info_copy(inc, iter, *(__be32 *)saddr, + *(__be32 *)daddr, flip); +} + +static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) { struct hlist_head *head; struct list_head *list; @@ -524,18 +542,13 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, /* XXX too lazy to maintain counts.. */ list_for_each_entry(rm, list, m_conn_item) { - __be32 laddr; - __be32 faddr; - total++; - laddr = conn->c_laddr.s6_addr32[3]; - faddr = conn->c_faddr.s6_addr32[3]; if (total <= len) - rds_inc_info_copy(&rm->m_inc, - iter, - laddr, - faddr, - 0); + __rds_inc_msg_cp(&rm->m_inc, + iter, + &conn->c_laddr, + &conn->c_faddr, + 0); } spin_unlock_irqrestore(&cp->cp_lock, flags); @@ -548,6 +561,14 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, lens->each = sizeof(struct rds_info_message); } +static void rds_conn_message_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) +{ + rds_conn_message_info_cmn(sock, len, iter, lens, want_send); +} + static void rds_conn_message_info_send(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) @@ -655,6 +676,9 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) struct rds_info_connection *cinfo = buffer; struct rds_connection *conn = cp->cp_conn; + if (conn->c_isv6) + return 0; + cinfo->next_tx_seq = cp->cp_next_tx_seq; cinfo->next_rx_seq = cp->cp_next_rx_seq; cinfo->laddr = conn->c_laddr.s6_addr32[3]; diff --git a/net/rds/ib.c b/net/rds/ib.c index c712a848957d..756225c5540f 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -39,6 +39,7 @@ #include #include #include +#include #include "rds_single_path.h" #include "rds.h" @@ -295,6 +296,8 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, /* We will only ever look at IB transports */ if (conn->c_trans != &rds_ib_transport) return 0; + if (conn->c_isv6) + return 0; iinfo->src_addr = conn->c_laddr.s6_addr32[3]; iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; @@ -330,7 +333,6 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, sizeof(struct rds_info_rdma_connection)); } - /* * Early RDS/IB was built to only bind to an address if there is an IPoIB * device with that address set. @@ -346,8 +348,12 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, { int ret; struct rdma_cm_id *cm_id; + struct sockaddr_in6 sin6; struct sockaddr_in sin; + struct sockaddr *sa; + bool isv4; + isv4 = ipv6_addr_v4mapped(addr); /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ @@ -356,20 +362,53 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, if (IS_ERR(cm_id)) return PTR_ERR(cm_id); - memset(&sin, 0, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = addr->s6_addr32[3]; + if (isv4) { + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr->s6_addr32[3]; + sa = (struct sockaddr *)&sin; + } else { + memset(&sin6, 0, sizeof(sin6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = *addr; + sin6.sin6_scope_id = scope_id; + sa = (struct sockaddr *)&sin6; + + /* XXX Do a special IPv6 link local address check here. The + * reason is that rdma_bind_addr() always succeeds with IPv6 + * link local address regardless it is indeed configured in a + * system. + */ + if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) { + struct net_device *dev; + + if (scope_id == 0) + return -EADDRNOTAVAIL; + + /* Use init_net for now as RDS is not network + * name space aware. + */ + dev = dev_get_by_index(&init_net, scope_id); + if (!dev) + return -EADDRNOTAVAIL; + if (!ipv6_chk_addr(&init_net, addr, dev, 1)) { + dev_put(dev); + return -EADDRNOTAVAIL; + } + dev_put(dev); + } + } /* rdma_bind_addr will only succeed for IB & iWARP devices */ - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + ret = rdma_bind_addr(cm_id, sa); /* due to this, we will claim to support iWARP devices unless we check node_type. */ if (ret || !cm_id->device || cm_id->device->node_type != RDMA_NODE_IB_CA) ret = -EADDRNOTAVAIL; - rdsdebug("addr %pI6c ret %d node type %d\n", - addr, ret, + rdsdebug("addr %pI6c%%%u ret %d node type %d\n", + addr, scope_id, ret, cm_id->device ? cm_id->device->node_type : -1); rdma_destroy_id(cm_id); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index dd8a867e5a9c..a33b82dc0804 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -678,7 +678,7 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6) return version; } -/* Given an IPv6 address, find the IB net_device which hosts that address and +/* Given an IPv6 address, find the net_device which hosts that address and * return its index. This is used by the rds_ib_cm_handle_connect() code to * find the interface index of where an incoming request comes from when * the request is using a link local address. @@ -695,8 +695,7 @@ static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr) rcu_read_lock(); for_each_netdev_rcu(net, dev) { - if (dev->type == ARPHRD_INFINIBAND && - ipv6_chk_addr(net, addr, dev, 0)) { + if (ipv6_chk_addr(net, addr, dev, 1)) { idx = dev->ifindex; break; } @@ -736,7 +735,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, dp_cmn = &dp->ricp_v6.dp_cmn; saddr6 = &dp->ricp_v6.dp_saddr; daddr6 = &dp->ricp_v6.dp_daddr; - /* If the local address is link local, need to find the + /* If either address is link local, need to find the * interface index in order to create a proper RDS * connection. */ @@ -748,6 +747,14 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, err = -EOPNOTSUPP; goto out; } + } else if (ipv6_addr_type(saddr6) & IPV6_ADDR_LINKLOCAL) { + /* Use our address to find the correct index. */ + ifindex = __rds_find_ifindex(&init_net, daddr6); + /* No index found... Need to bail out. */ + if (ifindex == 0) { + err = -EOPNOTSUPP; + goto out; + } } } else { dp_cmn = &dp->ricp_v4.dp_cmn; @@ -886,7 +893,10 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ - handler = rds_rdma_cm_event_handler; + if (conn->c_isv6) + handler = rds6_rdma_cm_event_handler; + else + handler = rds_rdma_cm_event_handler; ic->i_cm_id = rdma_create_id(&init_net, handler, conn, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index f49abef69550..bd67e55354f4 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -37,7 +37,9 @@ #include "rdma_transport.h" #include "ib.h" +/* Global IPv4 and IPv6 RDS RDMA listener cm_id */ static struct rdma_cm_id *rds_rdma_listen_id; +static struct rdma_cm_id *rds6_rdma_listen_id; static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, struct rdma_cm_event *event, @@ -153,6 +155,12 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, return rds_rdma_cm_event_handler_cmn(cm_id, event, false); } +int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + return rds_rdma_cm_event_handler_cmn(cm_id, event, true); +} + static int rds_rdma_listen_init_common(rdma_cm_event_handler handler, struct sockaddr *sa, struct rdma_cm_id **ret_cm_id) @@ -206,6 +214,7 @@ static int rds_rdma_listen_init_common(rdma_cm_event_handler handler, static int rds_rdma_listen_init(void) { int ret; + struct sockaddr_in6 sin6; struct sockaddr_in sin; sin.sin_family = PF_INET; @@ -214,7 +223,21 @@ static int rds_rdma_listen_init(void) ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler, (struct sockaddr *)&sin, &rds_rdma_listen_id); - return ret; + if (ret != 0) + return ret; + + sin6.sin6_family = PF_INET6; + sin6.sin6_addr = in6addr_any; + sin6.sin6_port = htons(RDS_CM_PORT); + sin6.sin6_scope_id = 0; + sin6.sin6_flowinfo = 0; + ret = rds_rdma_listen_init_common(rds6_rdma_cm_event_handler, + (struct sockaddr *)&sin6, + &rds6_rdma_listen_id); + /* Keep going even when IPv6 is not enabled in the system. */ + if (ret != 0) + rdsdebug("Cannot set up IPv6 RDMA listener\n"); + return 0; } static void rds_rdma_listen_stop(void) @@ -224,6 +247,11 @@ static void rds_rdma_listen_stop(void) rdma_destroy_id(rds_rdma_listen_id); rds_rdma_listen_id = NULL; } + if (rds6_rdma_listen_id) { + rdsdebug("cm %p\n", rds6_rdma_listen_id); + rdma_destroy_id(rds6_rdma_listen_id); + rds6_rdma_listen_id = NULL; + } } static int rds_rdma_init(void) diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index d309c4430124..200d3134aaae 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -6,11 +6,16 @@ #include #include "rds.h" +/* RDMA_CM also uses 16385 as the listener port. */ +#define RDS_CM_PORT 16385 + #define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 int rds_rdma_conn_connect(struct rds_connection *conn); int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); +int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); /* from ib.c */ extern struct rds_transport rds_ib_transport; diff --git a/net/rds/rds.h b/net/rds/rds.h index 1bff26988a5e..ff537bb11411 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -24,14 +24,15 @@ #define RDS_PROTOCOL_MINOR(v) ((v) & 255) #define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) -/* - * XXX randomly chosen, but at least seems to be unused: - * # 18464-18768 Unassigned - * We should do better. We want a reserved port to discourage unpriv'ed - * userspace from listening. +/* The following ports, 16385, 18634, 18635, are registered with IANA as + * the ports to be used for RDS over TCP and UDP. Currently, only RDS over + * TCP and RDS over IB/RDMA are implemented. 18634 is the historical value + * used for the RDMA_CM listener port. RDS/TCP uses port 16385. After + * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept + * to ensure compatibility with older RDS modules. Those ports are defined + * in each transport's header file. */ #define RDS_PORT 18634 -#define RDS_CM_PORT 16385 #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 @@ -140,7 +141,8 @@ struct rds_connection { struct hlist_node c_hash_node; struct in6_addr c_laddr; struct in6_addr c_faddr; - int c_dev_if; /* c_laddrs's interface index */ + int c_dev_if; /* ifindex used for this conn */ + int c_bound_if; /* ifindex of c_laddr */ unsigned int c_loopback:1, c_isv6:1, c_ping_triggered:1, @@ -736,7 +738,7 @@ void rds_cong_remove_socket(struct rds_sock *); void rds_cong_exit(void); struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); -/* conn.c */ +/* connection.c */ extern u32 rds_gen_num; int rds_conn_init(void); void rds_conn_exit(void); @@ -874,6 +876,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg); void rds_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, __be32 saddr, __be32 daddr, int flip); +void rds6_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + struct in6_addr *saddr, struct in6_addr *daddr, + int flip); /* send.c */ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); diff --git a/net/rds/recv.c b/net/rds/recv.c index 4217961fd130..1402c21210b1 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -364,7 +364,7 @@ void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, goto out; } - rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_dev_if); + rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if); if (!rs) { rds_stats_inc(s_recv_drop_no_sock); goto out; diff --git a/net/rds/send.c b/net/rds/send.c index 6ed2e925c36a..9604e1faa564 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1091,10 +1091,9 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) ret = -EINVAL; goto out; } - switch (namelen) { - case sizeof(*usin): - if (usin->sin_family != AF_INET || - usin->sin_addr.s_addr == htonl(INADDR_ANY) || + switch (usin->sin_family) { + case AF_INET: + if (usin->sin_addr.s_addr == htonl(INADDR_ANY) || usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) { ret = -EINVAL; @@ -1104,9 +1103,44 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) dport = usin->sin_port; break; - case sizeof(*sin6): { - ret = -EPROTONOSUPPORT; - goto out; + case AF_INET6: { + int addr_type; + + if (namelen < sizeof(*sin6)) { + ret = -EINVAL; + goto out; + } + addr_type = ipv6_addr_type(&sin6->sin6_addr); + if (!(addr_type & IPV6_ADDR_UNICAST)) { + __be32 addr4; + + if (!(addr_type & IPV6_ADDR_MAPPED)) { + ret = -EINVAL; + goto out; + } + + /* It is a mapped address. Need to do some + * sanity checks. + */ + addr4 = sin6->sin6_addr.s6_addr32[3]; + if (addr4 == htonl(INADDR_ANY) || + addr4 == htonl(INADDR_BROADCAST) || + IN_MULTICAST(ntohl(addr4))) { + return -EINVAL; + goto out; + } + } + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (sin6->sin6_scope_id == 0) { + ret = -EINVAL; + goto out; + } + scope_id = sin6->sin6_scope_id; + } + + daddr = sin6->sin6_addr; + dport = sin6->sin6_port; + break; } default: @@ -1138,6 +1172,19 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) ret = -EOPNOTSUPP; goto out; } + /* If the socket is already bound to a link local address, + * it can only send to peers on the same link. But allow + * communicating beween link local and non-link local address. + */ + if (scope_id != rs->rs_bound_scope_id) { + if (!scope_id) { + scope_id = rs->rs_bound_scope_id; + } else if (rs->rs_bound_scope_id) { + release_sock(sk); + ret = -EINVAL; + goto out; + } + } } release_sock(sk); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index dadb33790333..890d0e1d8908 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -46,7 +46,12 @@ /* only for info exporting */ static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); static LIST_HEAD(rds_tcp_tc_list); + +/* rds_tcp_tc_count counts only IPv4 connections. + * rds6_tcp_tc_count counts both IPv4 and IPv6 connections. + */ static unsigned int rds_tcp_tc_count; +static unsigned int rds6_tcp_tc_count; /* Track rds_tcp_connection structs so they can be cleaned up */ static DEFINE_SPINLOCK(rds_tcp_conn_lock); @@ -113,7 +118,9 @@ void rds_tcp_restore_callbacks(struct socket *sock, /* done under the callback_lock to serialize with write_space */ spin_lock(&rds_tcp_tc_list_lock); list_del_init(&tc->t_list_item); - rds_tcp_tc_count--; + rds6_tcp_tc_count--; + if (!tc->t_cpath->cp_conn->c_isv6) + rds_tcp_tc_count--; spin_unlock(&rds_tcp_tc_list_lock); tc->t_sock = NULL; @@ -200,7 +207,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) /* done under the callback_lock to serialize with write_space */ spin_lock(&rds_tcp_tc_list_lock); list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); - rds_tcp_tc_count++; + rds6_tcp_tc_count++; + if (!tc->t_cpath->cp_conn->c_isv6) + rds_tcp_tc_count++; spin_unlock(&rds_tcp_tc_list_lock); /* accepted sockets need our listen data ready undone */ @@ -221,6 +230,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) write_unlock_bh(&sock->sk->sk_callback_lock); } +/* Handle RDS_INFO_TCP_SOCKETS socket option. It only returns IPv4 + * connections for backward compatibility. + */ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) @@ -228,8 +240,6 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, struct rds_info_tcp_socket tsinfo; struct rds_tcp_connection *tc; unsigned long flags; - struct sockaddr_in sin; - struct socket *sock; spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); @@ -237,16 +247,15 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, goto out; list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { + struct inet_sock *inet = inet_sk(tc->t_sock->sk); - sock = tc->t_sock; - if (sock) { - sock->ops->getname(sock, (struct sockaddr *)&sin, 0); - tsinfo.local_addr = sin.sin_addr.s_addr; - tsinfo.local_port = sin.sin_port; - sock->ops->getname(sock, (struct sockaddr *)&sin, 1); - tsinfo.peer_addr = sin.sin_addr.s_addr; - tsinfo.peer_port = sin.sin_port; - } + if (tc->t_cpath->cp_conn->c_isv6) + continue; + + tsinfo.local_addr = inet->inet_saddr; + tsinfo.local_port = inet->inet_sport; + tsinfo.peer_addr = inet->inet_daddr; + tsinfo.peer_port = inet->inet_dport; tsinfo.hdr_rem = tc->t_tinc_hdr_rem; tsinfo.data_rem = tc->t_tinc_data_rem; @@ -494,13 +503,18 @@ static __net_init int rds_tcp_init_net(struct net *net) err = -ENOMEM; goto fail; } - rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net); + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true); if (!rtn->rds_tcp_listen_sock) { - pr_warn("could not set up listen sock\n"); - unregister_net_sysctl_table(rtn->rds_tcp_sysctl); - rtn->rds_tcp_sysctl = NULL; - err = -EAFNOSUPPORT; - goto fail; + pr_warn("could not set up IPv6 listen sock\n"); + + /* Try IPv4 as some systems disable IPv6 */ + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false); + if (!rtn->rds_tcp_listen_sock) { + unregister_net_sysctl_table(rtn->rds_tcp_sysctl); + rtn->rds_tcp_sysctl = NULL; + err = -EAFNOSUPPORT; + goto fail; + } } INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker); return 0; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index c6fa080e9b6d..3c69361d21c7 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -67,7 +67,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn); void rds_tcp_state_change(struct sock *sk); /* tcp_listen.c */ -struct socket *rds_tcp_listen_init(struct net *); +struct socket *rds_tcp_listen_init(struct net *net, bool isv6); void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 231ae927858e..008f50fb25dd 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -89,9 +89,11 @@ void rds_tcp_state_change(struct sock *sk) int rds_tcp_conn_path_connect(struct rds_conn_path *cp) { struct socket *sock = NULL; + struct sockaddr_in6 sin6; struct sockaddr_in sin; struct sockaddr *addr; int addrlen; + bool isv6; int ret; struct rds_connection *conn = cp->cp_conn; struct rds_tcp_connection *tc = cp->cp_transport_data; @@ -108,18 +110,36 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) mutex_unlock(&tc->t_conn_path_lock); return 0; } - ret = sock_create_kern(rds_conn_net(conn), PF_INET, - SOCK_STREAM, IPPROTO_TCP, &sock); + if (ipv6_addr_v4mapped(&conn->c_laddr)) { + ret = sock_create_kern(rds_conn_net(conn), PF_INET, + SOCK_STREAM, IPPROTO_TCP, &sock); + isv6 = false; + } else { + ret = sock_create_kern(rds_conn_net(conn), PF_INET6, + SOCK_STREAM, IPPROTO_TCP, &sock); + isv6 = true; + } + if (ret < 0) goto out; rds_tcp_tune(sock); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; - sin.sin_port = 0; - addr = (struct sockaddr *)&sin; - addrlen = sizeof(sin); + if (isv6) { + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = conn->c_laddr; + sin6.sin6_port = 0; + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = conn->c_dev_if; + addr = (struct sockaddr *)&sin6; + addrlen = sizeof(sin6); + } else { + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; + sin.sin_port = 0; + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); + } ret = sock->ops->bind(sock, addr, addrlen); if (ret) { @@ -128,11 +148,21 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) goto out; } - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; - sin.sin_port = htons(RDS_TCP_PORT); - addr = (struct sockaddr *)&sin; - addrlen = sizeof(sin); + if (isv6) { + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = conn->c_faddr; + sin6.sin6_port = htons(RDS_TCP_PORT); + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = conn->c_dev_if; + addr = (struct sockaddr *)&sin6; + addrlen = sizeof(sin6); + } else { + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; + sin.sin_port = htons(RDS_TCP_PORT); + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); + } /* * once we call connect() we can start getting callbacks and they diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 4fdf5b3a47df..0cf0147117d8 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -131,6 +131,8 @@ int rds_tcp_accept_one(struct socket *sock) struct rds_tcp_connection *rs_tcp = NULL; int conn_state; struct rds_conn_path *cp; + struct in6_addr *my_addr, *peer_addr; + int dev_if; if (!sock) /* module unload or netns delete in progress */ return -ENETUNREACH; @@ -163,15 +165,29 @@ int rds_tcp_accept_one(struct socket *sock) inet = inet_sk(new_sock->sk); + my_addr = &new_sock->sk->sk_v6_rcv_saddr; + peer_addr = &new_sock->sk->sk_v6_daddr; rdsdebug("accepted tcp %pI6c:%u -> %pI6c:%u\n", - &new_sock->sk->sk_v6_rcv_saddr, ntohs(inet->inet_sport), - &new_sock->sk->sk_v6_daddr, ntohs(inet->inet_dport)); + my_addr, ntohs(inet->inet_sport), + peer_addr, ntohs(inet->inet_dport)); + /* sk_bound_dev_if is not set if the peer address is not link local + * address. In this case, it happens that mcast_oif is set. So + * just use it. + */ + if ((ipv6_addr_type(my_addr) & IPV6_ADDR_LINKLOCAL) && + !(ipv6_addr_type(peer_addr) & IPV6_ADDR_LINKLOCAL)) { + struct ipv6_pinfo *inet6; + + inet6 = inet6_sk(new_sock->sk); + dev_if = inet6->mcast_oif; + } else { + dev_if = new_sock->sk->sk_bound_dev_if; + } conn = rds_conn_create(sock_net(sock->sk), &new_sock->sk->sk_v6_rcv_saddr, &new_sock->sk->sk_v6_daddr, - &rds_tcp_transport, GFP_KERNEL, - new_sock->sk->sk_bound_dev_if); + &rds_tcp_transport, GFP_KERNEL, dev_if); if (IS_ERR(conn)) { ret = PTR_ERR(conn); @@ -256,15 +272,22 @@ void rds_tcp_listen_data_ready(struct sock *sk) ready(sk); } -struct socket *rds_tcp_listen_init(struct net *net) +struct socket *rds_tcp_listen_init(struct net *net, bool isv6) { - struct sockaddr_in sin; struct socket *sock = NULL; + struct sockaddr_storage ss; + struct sockaddr_in6 *sin6; + struct sockaddr_in *sin; + int addr_len; int ret; - ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (ret < 0) + ret = sock_create_kern(net, isv6 ? PF_INET6 : PF_INET, SOCK_STREAM, + IPPROTO_TCP, &sock); + if (ret < 0) { + rdsdebug("could not create %s listener socket: %d\n", + isv6 ? "IPv6" : "IPv4", ret); goto out; + } sock->sk->sk_reuse = SK_CAN_REUSE; rds_tcp_nonagle(sock); @@ -274,13 +297,28 @@ struct socket *rds_tcp_listen_init(struct net *net) sock->sk->sk_data_ready = rds_tcp_listen_data_ready; write_unlock_bh(&sock->sk->sk_callback_lock); - sin.sin_family = PF_INET; - sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); - sin.sin_port = (__force u16)htons(RDS_TCP_PORT); + if (isv6) { + sin6 = (struct sockaddr_in6 *)&ss; + sin6->sin6_family = PF_INET6; + sin6->sin6_addr = in6addr_any; + sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT); + sin6->sin6_scope_id = 0; + sin6->sin6_flowinfo = 0; + addr_len = sizeof(*sin6); + } else { + sin = (struct sockaddr_in *)&ss; + sin->sin_family = PF_INET; + sin->sin_addr.s_addr = INADDR_ANY; + sin->sin_port = (__force u16)htons(RDS_TCP_PORT); + addr_len = sizeof(*sin); + } - ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); - if (ret < 0) + ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len); + if (ret < 0) { + rdsdebug("could not bind %s listener socket: %d\n", + isv6 ? "IPv6" : "IPv4", ret); goto out; + } ret = sock->ops->listen(sock, 64); if (ret < 0)