Merge branch 'for_net-next-5.1/rds-tos-v4' of git://git.kernel.org/pub/scm/linux/kernel/git/ssantosh/linux

Santosh Shilimkar says:

====================
rds: add tos support

RDS applications make use of tos to classify database traffic.
This feature has been used in shipping products since 2.6.32-based
kernels. It is tied to the RDS v4.1 protocol version, and compatibility
is negotiated as part of connection setup.

The patchset keeps full backward compatibility by using the existing connection
negotiation scheme. Currently the feature is used by the RDMA
transport; for the TCP transport, all user tos values are mapped to the
same default class (0).

For RDMA transports, the RDMA CM service type API is used to
set up different SLs (service levels), and the IB fabric is configured
for tos mapping using the Subnet Manager (SL to VL mappings).
Similarly, for a RoCE fabric, the user priority is mapped to different
DSCP code points, which are associated with different switch queues
in the fabric.

The original code was developed by Bang Nguyen in the downstream kernel back in
the 2.6.32 kernel days, and it has evolved significantly over time.

Thanks to Yanjun for testing various host combinations, such as
v3.1<->v4.1, v4.1<->v3.1, upstream v4.1 to shipping v4.1, etc.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2019-02-06 17:00:15 -08:00
commit 042a41977b
16 changed files with 166 additions and 52 deletions

View File

@ -69,6 +69,12 @@
#define RDS_TRANS_COUNT 3 #define RDS_TRANS_COUNT 3
#define RDS_TRANS_NONE (~0) #define RDS_TRANS_NONE (~0)
/* IOCTL commands for SOL_RDS */
#define SIOCRDSSETTOS (SIOCPROTOPRIVATE)
#define SIOCRDSGETTOS (SIOCPROTOPRIVATE + 1)
typedef __u8 rds_tos_t;
/* /*
* Control message types for SOL_RDS. * Control message types for SOL_RDS.
* *
@ -149,6 +155,7 @@ struct rds_info_connection {
__be32 faddr; __be32 faddr;
__u8 transport[TRANSNAMSIZ]; /* null term ascii */ __u8 transport[TRANSNAMSIZ]; /* null term ascii */
__u8 flags; __u8 flags;
__u8 tos;
} __attribute__((packed)); } __attribute__((packed));
struct rds6_info_connection { struct rds6_info_connection {
@ -171,6 +178,7 @@ struct rds_info_message {
__be16 lport; __be16 lport;
__be16 fport; __be16 fport;
__u8 flags; __u8 flags;
__u8 tos;
} __attribute__((packed)); } __attribute__((packed));
struct rds6_info_message { struct rds6_info_message {
@ -214,6 +222,7 @@ struct rds_info_tcp_socket {
__u32 last_sent_nxt; __u32 last_sent_nxt;
__u32 last_expected_una; __u32 last_expected_una;
__u32 last_seen_una; __u32 last_seen_una;
__u8 tos;
} __attribute__((packed)); } __attribute__((packed));
struct rds6_info_tcp_socket { struct rds6_info_tcp_socket {
@ -240,6 +249,7 @@ struct rds_info_rdma_connection {
__u32 max_send_sge; __u32 max_send_sge;
__u32 rdma_mr_max; __u32 rdma_mr_max;
__u32 rdma_mr_size; __u32 rdma_mr_size;
__u8 tos;
}; };
struct rds6_info_rdma_connection { struct rds6_info_rdma_connection {
@ -253,6 +263,7 @@ struct rds6_info_rdma_connection {
__u32 max_send_sge; __u32 max_send_sge;
__u32 rdma_mr_max; __u32 rdma_mr_max;
__u32 rdma_mr_size; __u32 rdma_mr_size;
__u8 tos;
}; };
/* RDS message Receive Path Latency points */ /* RDS message Receive Path Latency points */

View File

@ -254,7 +254,40 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,
static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{ {
struct rds_sock *rs = rds_sk_to_rs(sock->sk);
rds_tos_t utos, tos = 0;
switch (cmd) {
case SIOCRDSSETTOS:
if (get_user(utos, (rds_tos_t __user *)arg))
return -EFAULT;
if (rs->rs_transport &&
rs->rs_transport->get_tos_map)
tos = rs->rs_transport->get_tos_map(utos);
else
return -ENOIOCTLCMD; return -ENOIOCTLCMD;
spin_lock_bh(&rds_sock_lock);
if (rs->rs_tos || rs->rs_conn) {
spin_unlock_bh(&rds_sock_lock);
return -EINVAL;
}
rs->rs_tos = tos;
spin_unlock_bh(&rds_sock_lock);
break;
case SIOCRDSGETTOS:
spin_lock_bh(&rds_sock_lock);
tos = rs->rs_tos;
spin_unlock_bh(&rds_sock_lock);
if (put_user(tos, (rds_tos_t __user *)arg))
return -EFAULT;
break;
default:
return -ENOIOCTLCMD;
}
return 0;
} }
static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
@ -650,6 +683,8 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
spin_lock_init(&rs->rs_rdma_lock); spin_lock_init(&rs->rs_rdma_lock);
rs->rs_rdma_keys = RB_ROOT; rs->rs_rdma_keys = RB_ROOT;
rs->rs_rx_traces = 0; rs->rs_rx_traces = 0;
rs->rs_tos = 0;
rs->rs_conn = NULL;
spin_lock_bh(&rds_sock_lock); spin_lock_bh(&rds_sock_lock);
list_add_tail(&rs->rs_item, &rds_sock_list); list_add_tail(&rs->rs_item, &rds_sock_list);

View File

@ -84,7 +84,7 @@ static struct rds_connection *rds_conn_lookup(struct net *net,
const struct in6_addr *laddr, const struct in6_addr *laddr,
const struct in6_addr *faddr, const struct in6_addr *faddr,
struct rds_transport *trans, struct rds_transport *trans,
int dev_if) u8 tos, int dev_if)
{ {
struct rds_connection *conn, *ret = NULL; struct rds_connection *conn, *ret = NULL;
@ -92,6 +92,7 @@ static struct rds_connection *rds_conn_lookup(struct net *net,
if (ipv6_addr_equal(&conn->c_faddr, faddr) && if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
ipv6_addr_equal(&conn->c_laddr, laddr) && ipv6_addr_equal(&conn->c_laddr, laddr) &&
conn->c_trans == trans && conn->c_trans == trans &&
conn->c_tos == tos &&
net == rds_conn_net(conn) && net == rds_conn_net(conn) &&
conn->c_dev_if == dev_if) { conn->c_dev_if == dev_if) {
ret = conn; ret = conn;
@ -139,6 +140,7 @@ static void __rds_conn_path_init(struct rds_connection *conn,
atomic_set(&cp->cp_state, RDS_CONN_DOWN); atomic_set(&cp->cp_state, RDS_CONN_DOWN);
cp->cp_send_gen = 0; cp->cp_send_gen = 0;
cp->cp_reconnect_jiffies = 0; cp->cp_reconnect_jiffies = 0;
cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION;
INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker); INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker);
INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker); INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker);
INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker); INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker);
@ -159,7 +161,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
const struct in6_addr *laddr, const struct in6_addr *laddr,
const struct in6_addr *faddr, const struct in6_addr *faddr,
struct rds_transport *trans, struct rds_transport *trans,
gfp_t gfp, gfp_t gfp, u8 tos,
int is_outgoing, int is_outgoing,
int dev_if) int dev_if)
{ {
@ -171,7 +173,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
rcu_read_lock(); rcu_read_lock();
conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if); conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if);
if (conn && if (conn &&
conn->c_loopback && conn->c_loopback &&
conn->c_trans != &rds_loop_transport && conn->c_trans != &rds_loop_transport &&
@ -205,6 +207,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
conn->c_isv6 = !ipv6_addr_v4mapped(laddr); conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
conn->c_faddr = *faddr; conn->c_faddr = *faddr;
conn->c_dev_if = dev_if; conn->c_dev_if = dev_if;
conn->c_tos = tos;
#if IS_ENABLED(CONFIG_IPV6) #if IS_ENABLED(CONFIG_IPV6)
/* If the local address is link local, set c_bound_if to be the /* If the local address is link local, set c_bound_if to be the
@ -297,7 +300,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
struct rds_connection *found; struct rds_connection *found;
found = rds_conn_lookup(net, head, laddr, faddr, trans, found = rds_conn_lookup(net, head, laddr, faddr, trans,
dev_if); tos, dev_if);
if (found) { if (found) {
struct rds_conn_path *cp; struct rds_conn_path *cp;
int i; int i;
@ -332,10 +335,10 @@ static struct rds_connection *__rds_conn_create(struct net *net,
struct rds_connection *rds_conn_create(struct net *net, struct rds_connection *rds_conn_create(struct net *net,
const struct in6_addr *laddr, const struct in6_addr *laddr,
const struct in6_addr *faddr, const struct in6_addr *faddr,
struct rds_transport *trans, gfp_t gfp, struct rds_transport *trans, u8 tos,
int dev_if) gfp_t gfp, int dev_if)
{ {
return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if); return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if);
} }
EXPORT_SYMBOL_GPL(rds_conn_create); EXPORT_SYMBOL_GPL(rds_conn_create);
@ -343,9 +346,9 @@ struct rds_connection *rds_conn_create_outgoing(struct net *net,
const struct in6_addr *laddr, const struct in6_addr *laddr,
const struct in6_addr *faddr, const struct in6_addr *faddr,
struct rds_transport *trans, struct rds_transport *trans,
gfp_t gfp, int dev_if) u8 tos, gfp_t gfp, int dev_if)
{ {
return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if); return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if);
} }
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);

View File

@ -301,6 +301,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
iinfo->src_addr = conn->c_laddr.s6_addr32[3]; iinfo->src_addr = conn->c_laddr.s6_addr32[3];
iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
iinfo->tos = conn->c_tos;
memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
@ -514,6 +515,15 @@ void rds_ib_exit(void)
rds_ib_mr_exit(); rds_ib_mr_exit();
} }
/* Translate a user-supplied tos value into the tos used by the RDMA
 * transport. The mapping is currently 1:1; should a custom map be
 * wanted in the future, this hook can export a user-configurable one.
 */
static u8 rds_ib_get_tos_map(u8 tos)
{
	return tos;
}
struct rds_transport rds_ib_transport = { struct rds_transport rds_ib_transport = {
.laddr_check = rds_ib_laddr_check, .laddr_check = rds_ib_laddr_check,
.xmit_path_complete = rds_ib_xmit_path_complete, .xmit_path_complete = rds_ib_xmit_path_complete,
@ -536,6 +546,7 @@ struct rds_transport rds_ib_transport = {
.sync_mr = rds_ib_sync_mr, .sync_mr = rds_ib_sync_mr,
.free_mr = rds_ib_free_mr, .free_mr = rds_ib_free_mr,
.flush_mrs = rds_ib_flush_mrs, .flush_mrs = rds_ib_flush_mrs,
.get_tos_map = rds_ib_get_tos_map,
.t_owner = THIS_MODULE, .t_owner = THIS_MODULE,
.t_name = "infiniband", .t_name = "infiniband",
.t_unloading = rds_ib_is_unloading, .t_unloading = rds_ib_is_unloading,

View File

@ -67,7 +67,9 @@ struct rds_ib_conn_priv_cmn {
u8 ricpc_protocol_major; u8 ricpc_protocol_major;
u8 ricpc_protocol_minor; u8 ricpc_protocol_minor;
__be16 ricpc_protocol_minor_mask; /* bitmask */ __be16 ricpc_protocol_minor_mask; /* bitmask */
__be32 ricpc_reserved1; u8 ricpc_dp_toss;
u8 ripc_reserved1;
__be16 ripc_reserved2;
__be64 ricpc_ack_seq; __be64 ricpc_ack_seq;
__be32 ricpc_credit; /* non-zero enables flow ctl */ __be32 ricpc_credit; /* non-zero enables flow ctl */
}; };

View File

@ -133,22 +133,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
rds_ib_set_flow_control(conn, be32_to_cpu(credit)); rds_ib_set_flow_control(conn, be32_to_cpu(credit));
} }
if (conn->c_version < RDS_PROTOCOL(3, 1)) { if (conn->c_version < RDS_PROTOCOL_VERSION) {
if (conn->c_version != RDS_PROTOCOL_COMPAT_VERSION) {
pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n", pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
&conn->c_laddr, &conn->c_faddr, &conn->c_laddr, &conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version), RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version)); RDS_PROTOCOL_MINOR(conn->c_version));
set_bit(RDS_DESTROY_PENDING, &conn->c_path[0].cp_flags);
rds_conn_destroy(conn); rds_conn_destroy(conn);
return; return;
} else { }
pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n", }
pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c,%d> version %u.%u%s\n",
ic->i_active_side ? "Active" : "Passive", ic->i_active_side ? "Active" : "Passive",
&conn->c_laddr, &conn->c_faddr, &conn->c_laddr, &conn->c_faddr, conn->c_tos,
RDS_PROTOCOL_MAJOR(conn->c_version), RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version), RDS_PROTOCOL_MINOR(conn->c_version),
ic->i_flowctl ? ", flow control" : ""); ic->i_flowctl ? ", flow control" : "");
}
atomic_set(&ic->i_cq_quiesce, 0); atomic_set(&ic->i_cq_quiesce, 0);
@ -184,6 +185,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
NULL); NULL);
} }
conn->c_proposed_version = conn->c_version;
rds_connect_complete(conn); rds_connect_complete(conn);
} }
@ -220,6 +222,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
dp->ricp_v6.dp_ack_seq = dp->ricp_v6.dp_ack_seq =
cpu_to_be64(rds_ib_piggyb_ack(ic)); cpu_to_be64(rds_ib_piggyb_ack(ic));
dp->ricp_v6.dp_cmn.ricpc_dp_toss = conn->c_tos;
conn_param->private_data = &dp->ricp_v6; conn_param->private_data = &dp->ricp_v6;
conn_param->private_data_len = sizeof(dp->ricp_v6); conn_param->private_data_len = sizeof(dp->ricp_v6);
@ -234,6 +237,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
dp->ricp_v4.dp_ack_seq = dp->ricp_v4.dp_ack_seq =
cpu_to_be64(rds_ib_piggyb_ack(ic)); cpu_to_be64(rds_ib_piggyb_ack(ic));
dp->ricp_v4.dp_cmn.ricpc_dp_toss = conn->c_tos;
conn_param->private_data = &dp->ricp_v4; conn_param->private_data = &dp->ricp_v4;
conn_param->private_data_len = sizeof(dp->ricp_v4); conn_param->private_data_len = sizeof(dp->ricp_v4);
@ -389,8 +393,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
break; break;
default: default:
rdsdebug("Fatal QP Event %u (%s) " rdsdebug("Fatal QP Event %u (%s) - connection %pI6c->%pI6c, reconnecting\n",
"- connection %pI6c->%pI6c, reconnecting\n",
event->event, ib_event_msg(event->event), event->event, ib_event_msg(event->event),
&conn->c_laddr, &conn->c_faddr); &conn->c_laddr, &conn->c_faddr);
rds_conn_drop(conn); rds_conn_drop(conn);
@ -660,13 +663,16 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
/* Even if len is crap *now* I still want to check it. -ASG */ /* Even if len is crap *now* I still want to check it. -ASG */
if (event->param.conn.private_data_len < data_len || major == 0) if (event->param.conn.private_data_len < data_len || major == 0)
return RDS_PROTOCOL_3_0; return RDS_PROTOCOL_4_0;
common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS; common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS;
if (major == 3 && common) { if (major == 4 && common) {
version = RDS_PROTOCOL_3_0; version = RDS_PROTOCOL_4_0;
while ((common >>= 1) != 0) while ((common >>= 1) != 0)
version++; version++;
} else if (RDS_PROTOCOL_COMPAT_VERSION ==
RDS_PROTOCOL(major, minor)) {
version = RDS_PROTOCOL_COMPAT_VERSION;
} else { } else {
if (isv6) if (isv6)
printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n", printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n",
@ -729,8 +735,10 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
/* Check whether the remote protocol version matches ours. */ /* Check whether the remote protocol version matches ours. */
version = rds_ib_protocol_compatible(event, isv6); version = rds_ib_protocol_compatible(event, isv6);
if (!version) if (!version) {
err = RDS_RDMA_REJ_INCOMPAT;
goto out; goto out;
}
dp = event->param.conn.private_data; dp = event->param.conn.private_data;
if (isv6) { if (isv6) {
@ -771,15 +779,16 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
daddr6 = &d_mapped_addr; daddr6 = &d_mapped_addr;
} }
rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid " rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid 0x%llx, tos:%d\n",
"0x%llx\n", saddr6, daddr6, saddr6, daddr6, RDS_PROTOCOL_MAJOR(version),
RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), RDS_PROTOCOL_MINOR(version),
(unsigned long long)be64_to_cpu(lguid), (unsigned long long)be64_to_cpu(lguid),
(unsigned long long)be64_to_cpu(fguid)); (unsigned long long)be64_to_cpu(fguid), dp_cmn->ricpc_dp_toss);
/* RDS/IB is not currently netns aware, thus init_net */ /* RDS/IB is not currently netns aware, thus init_net */
conn = rds_conn_create(&init_net, daddr6, saddr6, conn = rds_conn_create(&init_net, daddr6, saddr6,
&rds_ib_transport, GFP_KERNEL, ifindex); &rds_ib_transport, dp_cmn->ricpc_dp_toss,
GFP_KERNEL, ifindex);
if (IS_ERR(conn)) { if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
conn = NULL; conn = NULL;
@ -846,7 +855,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
if (conn) if (conn)
mutex_unlock(&conn->c_cm_lock); mutex_unlock(&conn->c_cm_lock);
if (err) if (err)
rdma_reject(cm_id, NULL, 0); rdma_reject(cm_id, &err, sizeof(int));
return destroy; return destroy;
} }
@ -861,7 +870,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
/* If the peer doesn't do protocol negotiation, we must /* If the peer doesn't do protocol negotiation, we must
* default to RDSv3.0 */ * default to RDSv3.0 */
rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0); rds_ib_set_protocol(conn, RDS_PROTOCOL_4_1);
ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */ ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */
ret = rds_ib_setup_qp(conn); ret = rds_ib_setup_qp(conn);
@ -870,7 +879,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
goto out; goto out;
} }
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
conn->c_proposed_version,
UINT_MAX, UINT_MAX, isv6); UINT_MAX, UINT_MAX, isv6);
ret = rdma_connect(cm_id, &conn_param); ret = rdma_connect(cm_id, &conn_param);
if (ret) if (ret)

View File

@ -986,9 +986,9 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
} else { } else {
/* We expect errors as the qp is drained during shutdown */ /* We expect errors as the qp is drained during shutdown */
if (rds_conn_up(conn) || rds_conn_connecting(conn)) if (rds_conn_up(conn) || rds_conn_connecting(conn))
rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n", rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), disconnecting and reconnecting\n",
&conn->c_laddr, &conn->c_faddr, &conn->c_laddr, &conn->c_faddr,
wc->status, conn->c_tos, wc->status,
ib_wc_status_msg(wc->status)); ib_wc_status_msg(wc->status));
} }

View File

@ -305,8 +305,9 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
/* We expect errors as the qp is drained during shutdown */ /* We expect errors as the qp is drained during shutdown */
if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n", rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), disconnecting and reconnecting\n",
&conn->c_laddr, &conn->c_faddr, wc->status, &conn->c_laddr, &conn->c_faddr,
conn->c_tos, wc->status,
ib_wc_status_msg(wc->status)); ib_wc_status_msg(wc->status));
} }
} }

View File

@ -51,6 +51,8 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
struct rds_connection *conn = cm_id->context; struct rds_connection *conn = cm_id->context;
struct rds_transport *trans; struct rds_transport *trans;
int ret = 0; int ret = 0;
int *err;
u8 len;
rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
event->event, rdma_event_msg(event->event)); event->event, rdma_event_msg(event->event));
@ -81,6 +83,7 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
break; break;
case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ADDR_RESOLVED:
rdma_set_service_type(cm_id, conn->c_tos);
/* XXX do we need to clean up if this fails? */ /* XXX do we need to clean up if this fails? */
ret = rdma_resolve_route(cm_id, ret = rdma_resolve_route(cm_id,
RDS_RDMA_RESOLVE_TIMEOUT_MS); RDS_RDMA_RESOLVE_TIMEOUT_MS);
@ -106,8 +109,19 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
break; break;
case RDMA_CM_EVENT_REJECTED: case RDMA_CM_EVENT_REJECTED:
if (!conn)
break;
err = (int *)rdma_consumer_reject_data(cm_id, event, &len);
if (!err || (err && ((*err) == RDS_RDMA_REJ_INCOMPAT))) {
pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n",
&conn->c_laddr, &conn->c_faddr);
conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION;
conn->c_tos = 0;
rds_conn_drop(conn);
}
rdsdebug("Connection rejected: %s\n", rdsdebug("Connection rejected: %s\n",
rdma_reject_msg(cm_id, event->status)); rdma_reject_msg(cm_id, event->status));
break;
/* FALLTHROUGH */ /* FALLTHROUGH */
case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ADDR_ERROR:
case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR:

View File

@ -11,6 +11,12 @@
#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 #define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000
/* The reject reason below exists for legacy interoperability with non-Linux
* RDS endpoints, where an older-version incompatibility is conveyed via value 1.
* For future version(s), a properly encoded reject reason should be used.
*/
#define RDS_RDMA_REJ_INCOMPAT 1
int rds_rdma_conn_connect(struct rds_connection *conn); int rds_rdma_conn_connect(struct rds_connection *conn);
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event); struct rdma_cm_event *event);

View File

@ -19,10 +19,13 @@
*/ */
#define RDS_PROTOCOL_3_0 0x0300 #define RDS_PROTOCOL_3_0 0x0300
#define RDS_PROTOCOL_3_1 0x0301 #define RDS_PROTOCOL_3_1 0x0301
#define RDS_PROTOCOL_4_0 0x0400
#define RDS_PROTOCOL_4_1 0x0401
#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1 #define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1
#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8) #define RDS_PROTOCOL_MAJOR(v) ((v) >> 8)
#define RDS_PROTOCOL_MINOR(v) ((v) & 255) #define RDS_PROTOCOL_MINOR(v) ((v) & 255)
#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) #define RDS_PROTOCOL(maj, min) (((maj) << 8) | min)
#define RDS_PROTOCOL_COMPAT_VERSION RDS_PROTOCOL_3_1
/* The following ports, 16385, 18634, 18635, are registered with IANA as /* The following ports, 16385, 18634, 18635, are registered with IANA as
* the ports to be used for RDS over TCP and UDP. Currently, only RDS over * the ports to be used for RDS over TCP and UDP. Currently, only RDS over
@ -151,9 +154,13 @@ struct rds_connection {
struct rds_cong_map *c_fcong; struct rds_cong_map *c_fcong;
/* Protocol version */ /* Protocol version */
unsigned int c_proposed_version;
unsigned int c_version; unsigned int c_version;
possible_net_t c_net; possible_net_t c_net;
/* TOS */
u8 c_tos;
struct list_head c_map_item; struct list_head c_map_item;
unsigned long c_map_queued; unsigned long c_map_queued;
@ -567,6 +574,7 @@ struct rds_transport {
void (*free_mr)(void *trans_private, int invalidate); void (*free_mr)(void *trans_private, int invalidate);
void (*flush_mrs)(void); void (*flush_mrs)(void);
bool (*t_unloading)(struct rds_connection *conn); bool (*t_unloading)(struct rds_connection *conn);
u8 (*get_tos_map)(u8 tos);
}; };
/* Bind hash table key length. It is the sum of the size of a struct /* Bind hash table key length. It is the sum of the size of a struct
@ -648,6 +656,7 @@ struct rds_sock {
u8 rs_rx_traces; u8 rs_rx_traces;
u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
struct rds_msg_zcopy_queue rs_zcookie_queue; struct rds_msg_zcopy_queue rs_zcookie_queue;
u8 rs_tos;
}; };
static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
@ -756,13 +765,14 @@ void rds_conn_exit(void);
struct rds_connection *rds_conn_create(struct net *net, struct rds_connection *rds_conn_create(struct net *net,
const struct in6_addr *laddr, const struct in6_addr *laddr,
const struct in6_addr *faddr, const struct in6_addr *faddr,
struct rds_transport *trans, gfp_t gfp, struct rds_transport *trans,
u8 tos, gfp_t gfp,
int dev_if); int dev_if);
struct rds_connection *rds_conn_create_outgoing(struct net *net, struct rds_connection *rds_conn_create_outgoing(struct net *net,
const struct in6_addr *laddr, const struct in6_addr *laddr,
const struct in6_addr *faddr, const struct in6_addr *faddr,
struct rds_transport *trans, struct rds_transport *trans,
gfp_t gfp, int dev_if); u8 tos, gfp_t gfp, int dev_if);
void rds_conn_shutdown(struct rds_conn_path *cpath); void rds_conn_shutdown(struct rds_conn_path *cpath);
void rds_conn_destroy(struct rds_connection *conn); void rds_conn_destroy(struct rds_connection *conn);
void rds_conn_drop(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn);

View File

@ -782,6 +782,7 @@ void rds_inc_info_copy(struct rds_incoming *inc,
minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence); minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
minfo.len = be32_to_cpu(inc->i_hdr.h_len); minfo.len = be32_to_cpu(inc->i_hdr.h_len);
minfo.tos = inc->i_conn->c_tos;
if (flip) { if (flip) {
minfo.laddr = daddr; minfo.laddr = daddr;

View File

@ -1277,12 +1277,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
/* rds_conn_create has a spinlock that runs with IRQ off. /* rds_conn_create has a spinlock that runs with IRQ off.
* Caching the conn in the socket helps a lot. */ * Caching the conn in the socket helps a lot. */
if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr)) if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr) &&
rs->rs_tos == rs->rs_conn->c_tos) {
conn = rs->rs_conn; conn = rs->rs_conn;
else { } else {
conn = rds_conn_create_outgoing(sock_net(sock->sk), conn = rds_conn_create_outgoing(sock_net(sock->sk),
&rs->rs_bound_addr, &daddr, &rs->rs_bound_addr, &daddr,
rs->rs_transport, rs->rs_transport, rs->rs_tos,
sock->sk->sk_allocation, sock->sk->sk_allocation,
scope_id); scope_id);
if (IS_ERR(conn)) { if (IS_ERR(conn)) {

View File

@ -267,6 +267,7 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
tsinfo.last_sent_nxt = tc->t_last_sent_nxt; tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
tsinfo.last_expected_una = tc->t_last_expected_una; tsinfo.last_expected_una = tc->t_last_expected_una;
tsinfo.last_seen_una = tc->t_last_seen_una; tsinfo.last_seen_una = tc->t_last_seen_una;
tsinfo.tos = tc->t_cpath->cp_conn->c_tos;
rds_info_copy(iter, &tsinfo, sizeof(tsinfo)); rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
} }
@ -452,6 +453,12 @@ static void rds_tcp_destroy_conns(void)
static void rds_tcp_exit(void); static void rds_tcp_exit(void);
/* The TCP transport does not differentiate traffic by tos: whatever
 * value the user supplies, the default class 0 is returned.
 */
static u8 rds_tcp_get_tos_map(u8 tos)
{
	return 0;
}
struct rds_transport rds_tcp_transport = { struct rds_transport rds_tcp_transport = {
.laddr_check = rds_tcp_laddr_check, .laddr_check = rds_tcp_laddr_check,
.xmit_path_prepare = rds_tcp_xmit_path_prepare, .xmit_path_prepare = rds_tcp_xmit_path_prepare,
@ -466,6 +473,7 @@ struct rds_transport rds_tcp_transport = {
.inc_free = rds_tcp_inc_free, .inc_free = rds_tcp_inc_free,
.stats_info_copy = rds_tcp_stats_info_copy, .stats_info_copy = rds_tcp_stats_info_copy,
.exit = rds_tcp_exit, .exit = rds_tcp_exit,
.get_tos_map = rds_tcp_get_tos_map,
.t_owner = THIS_MODULE, .t_owner = THIS_MODULE,
.t_name = "tcp", .t_name = "tcp",
.t_type = RDS_TRANS_TCP, .t_type = RDS_TRANS_TCP,

View File

@ -200,7 +200,7 @@ int rds_tcp_accept_one(struct socket *sock)
conn = rds_conn_create(sock_net(sock->sk), conn = rds_conn_create(sock_net(sock->sk),
my_addr, peer_addr, my_addr, peer_addr,
&rds_tcp_transport, GFP_KERNEL, dev_if); &rds_tcp_transport, 0, GFP_KERNEL, dev_if);
if (IS_ERR(conn)) { if (IS_ERR(conn)) {
ret = PTR_ERR(conn); ret = PTR_ERR(conn);

View File

@ -93,6 +93,7 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
} }
rcu_read_unlock(); rcu_read_unlock();
cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION;
} }
EXPORT_SYMBOL_GPL(rds_connect_path_complete); EXPORT_SYMBOL_GPL(rds_connect_path_complete);