Merge branch 'RDS-optimized-notification-for-zerocopy-completion'
Sowmini Varadhan says: ==================== RDS: optimized notification for zerocopy completion Resending with acked-by additions: previous attempt does not show up in Patchwork. This time with a new mail Message-Id. RDS applications use predominantly request-response, transaction-based IPC, so that ingress and egress traffic are well-balanced, and it is possible/desirable to reduce system-call overhead by piggybacking the notifications for zerocopy completion response with data. Moreover, it has been pointed out that socket functions block if sk_err is non-zero, thus if the RDS code does not plan/need to use the sk_error_queue path for completion notification, it is preferable to remove the sk_error_queue-related paths in RDS. Both of these goals are implemented in this series. v2: removed sk_error_queue support v3: incorporated additional code review comments (details in each patch) ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit
f4155eff1f
|
@ -20,13 +20,11 @@ struct sock_extended_err {
|
|||
#define SO_EE_ORIGIN_ICMP6 3
|
||||
#define SO_EE_ORIGIN_TXSTATUS 4
|
||||
#define SO_EE_ORIGIN_ZEROCOPY 5
|
||||
#define SO_EE_ORIGIN_ZCOOKIE 6
|
||||
#define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS
|
||||
|
||||
#define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1))
|
||||
|
||||
#define SO_EE_CODE_ZEROCOPY_COPIED 1
|
||||
#define SO_EE_ORIGIN_MAX_ZCOOKIES 8
|
||||
|
||||
/**
|
||||
* struct scm_timestamping - timestamps exposed through cmsg
|
||||
|
|
|
@ -104,6 +104,7 @@
|
|||
#define RDS_CMSG_MASKED_ATOMIC_CSWP 9
|
||||
#define RDS_CMSG_RXPATH_LATENCY 11
|
||||
#define RDS_CMSG_ZCOPY_COOKIE 12
|
||||
#define RDS_CMSG_ZCOPY_COMPLETION 13
|
||||
|
||||
#define RDS_INFO_FIRST 10000
|
||||
#define RDS_INFO_COUNTERS 10000
|
||||
|
@ -317,6 +318,12 @@ struct rds_rdma_notify {
|
|||
#define RDS_RDMA_DROPPED 3
|
||||
#define RDS_RDMA_OTHER_ERROR 4
|
||||
|
||||
#define RDS_MAX_ZCOOKIES 8
|
||||
struct rds_zcopy_cookies {
|
||||
__u32 num;
|
||||
__u32 cookies[RDS_MAX_ZCOOKIES];
|
||||
};
|
||||
|
||||
/*
|
||||
* Common set of flags for all RDMA related structs
|
||||
*/
|
||||
|
|
|
@ -77,6 +77,7 @@ static int rds_release(struct socket *sock)
|
|||
rds_send_drop_to(rs, NULL);
|
||||
rds_rdma_drop_keys(rs);
|
||||
rds_notify_queue_get(rs, NULL);
|
||||
__skb_queue_purge(&rs->rs_zcookie_queue);
|
||||
|
||||
spin_lock_bh(&rds_sock_lock);
|
||||
list_del_init(&rs->rs_item);
|
||||
|
@ -144,7 +145,7 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
|
|||
* - to signal that a previously congested destination may have become
|
||||
* uncongested
|
||||
* - A notification has been queued to the socket (this can be a congestion
|
||||
* update, or a RDMA completion).
|
||||
* update, or a RDMA completion, or a MSG_ZEROCOPY completion).
|
||||
*
|
||||
* EPOLLOUT is asserted if there is room on the send queue. This does not mean
|
||||
* however, that the next sendmsg() call will succeed. If the application tries
|
||||
|
@ -178,7 +179,8 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,
|
|||
spin_unlock(&rs->rs_lock);
|
||||
}
|
||||
if (!list_empty(&rs->rs_recv_queue) ||
|
||||
!list_empty(&rs->rs_notify_queue))
|
||||
!list_empty(&rs->rs_notify_queue) ||
|
||||
!skb_queue_empty(&rs->rs_zcookie_queue))
|
||||
mask |= (EPOLLIN | EPOLLRDNORM);
|
||||
if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
|
||||
mask |= (EPOLLOUT | EPOLLWRNORM);
|
||||
|
@ -513,6 +515,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
|
|||
INIT_LIST_HEAD(&rs->rs_recv_queue);
|
||||
INIT_LIST_HEAD(&rs->rs_notify_queue);
|
||||
INIT_LIST_HEAD(&rs->rs_cong_list);
|
||||
skb_queue_head_init(&rs->rs_zcookie_queue);
|
||||
spin_lock_init(&rs->rs_rdma_lock);
|
||||
rs->rs_rdma_keys = RB_ROOT;
|
||||
rs->rs_rx_traces = 0;
|
||||
|
|
|
@ -58,32 +58,26 @@ EXPORT_SYMBOL_GPL(rds_message_addref);
|
|||
|
||||
static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie)
|
||||
{
|
||||
struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
|
||||
int ncookies;
|
||||
u32 *ptr;
|
||||
struct rds_zcopy_cookies *ck = (struct rds_zcopy_cookies *)skb->cb;
|
||||
int ncookies = ck->num;
|
||||
|
||||
if (serr->ee.ee_origin != SO_EE_ORIGIN_ZCOOKIE)
|
||||
if (ncookies == RDS_MAX_ZCOOKIES)
|
||||
return false;
|
||||
ncookies = serr->ee.ee_data;
|
||||
if (ncookies == SO_EE_ORIGIN_MAX_ZCOOKIES)
|
||||
return false;
|
||||
ptr = skb_put(skb, sizeof(u32));
|
||||
*ptr = cookie;
|
||||
serr->ee.ee_data = ++ncookies;
|
||||
ck->cookies[ncookies] = cookie;
|
||||
ck->num = ++ncookies;
|
||||
return true;
|
||||
}
|
||||
|
||||
static void rds_rm_zerocopy_callback(struct rds_sock *rs,
|
||||
struct rds_znotifier *znotif)
|
||||
{
|
||||
struct sock *sk = rds_rs_to_sk(rs);
|
||||
struct sk_buff *skb, *tail;
|
||||
struct sock_exterr_skb *serr;
|
||||
unsigned long flags;
|
||||
struct sk_buff_head *q;
|
||||
u32 cookie = znotif->z_cookie;
|
||||
struct rds_zcopy_cookies *ck;
|
||||
|
||||
q = &sk->sk_error_queue;
|
||||
q = &rs->rs_zcookie_queue;
|
||||
spin_lock_irqsave(&q->lock, flags);
|
||||
tail = skb_peek_tail(q);
|
||||
|
||||
|
@ -91,22 +85,19 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs,
|
|||
spin_unlock_irqrestore(&q->lock, flags);
|
||||
mm_unaccount_pinned_pages(&znotif->z_mmp);
|
||||
consume_skb(rds_skb_from_znotifier(znotif));
|
||||
sk->sk_error_report(sk);
|
||||
/* caller invokes rds_wake_sk_sleep() */
|
||||
return;
|
||||
}
|
||||
|
||||
skb = rds_skb_from_znotifier(znotif);
|
||||
serr = SKB_EXT_ERR(skb);
|
||||
memset(&serr->ee, 0, sizeof(serr->ee));
|
||||
serr->ee.ee_errno = 0;
|
||||
serr->ee.ee_origin = SO_EE_ORIGIN_ZCOOKIE;
|
||||
serr->ee.ee_info = 0;
|
||||
ck = (struct rds_zcopy_cookies *)skb->cb;
|
||||
memset(ck, 0, sizeof(*ck));
|
||||
WARN_ON(!skb_zcookie_add(skb, cookie));
|
||||
|
||||
__skb_queue_tail(q, skb);
|
||||
|
||||
spin_unlock_irqrestore(&q->lock, flags);
|
||||
sk->sk_error_report(sk);
|
||||
/* caller invokes rds_wake_sk_sleep() */
|
||||
|
||||
mm_unaccount_pinned_pages(&znotif->z_mmp);
|
||||
}
|
||||
|
@ -129,6 +120,7 @@ static void rds_message_purge(struct rds_message *rm)
|
|||
if (rm->data.op_mmp_znotifier) {
|
||||
zcopy = true;
|
||||
rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier);
|
||||
rds_wake_sk_sleep(rs);
|
||||
rm->data.op_mmp_znotifier = NULL;
|
||||
}
|
||||
sock_put(rds_rs_to_sk(rs));
|
||||
|
@ -362,10 +354,12 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
|
|||
int total_copied = 0;
|
||||
struct sk_buff *skb;
|
||||
|
||||
skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32),
|
||||
GFP_KERNEL);
|
||||
skb = alloc_skb(0, GFP_KERNEL);
|
||||
if (!skb)
|
||||
return -ENOMEM;
|
||||
BUILD_BUG_ON(sizeof(skb->cb) <
|
||||
max_t(int, sizeof(struct rds_znotifier),
|
||||
sizeof(struct rds_zcopy_cookies)));
|
||||
rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb);
|
||||
if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
|
||||
length)) {
|
||||
|
|
|
@ -603,6 +603,8 @@ struct rds_sock {
|
|||
/* Socket receive path trace points*/
|
||||
u8 rs_rx_traces;
|
||||
u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
|
||||
|
||||
struct sk_buff_head rs_zcookie_queue;
|
||||
};
|
||||
|
||||
static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
|
||||
|
|
|
@ -577,6 +577,32 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
|
|||
return ret;
|
||||
}
|
||||
|
||||
static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg)
|
||||
{
|
||||
struct sk_buff *skb;
|
||||
struct sk_buff_head *q = &rs->rs_zcookie_queue;
|
||||
struct rds_zcopy_cookies *done;
|
||||
|
||||
if (!msg->msg_control)
|
||||
return false;
|
||||
|
||||
if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) ||
|
||||
msg->msg_controllen < CMSG_SPACE(sizeof(*done)))
|
||||
return false;
|
||||
|
||||
skb = skb_dequeue(q);
|
||||
if (!skb)
|
||||
return false;
|
||||
done = (struct rds_zcopy_cookies *)skb->cb;
|
||||
if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done),
|
||||
done)) {
|
||||
skb_queue_head(q, skb);
|
||||
return false;
|
||||
}
|
||||
consume_skb(skb);
|
||||
return true;
|
||||
}
|
||||
|
||||
int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
|
||||
int msg_flags)
|
||||
{
|
||||
|
@ -611,7 +637,9 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
|
|||
|
||||
if (!rds_next_incoming(rs, &inc)) {
|
||||
if (nonblock) {
|
||||
ret = -EAGAIN;
|
||||
bool reaped = rds_recvmsg_zcookie(rs, msg);
|
||||
|
||||
ret = reaped ? 0 : -EAGAIN;
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -660,6 +688,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
|
|||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
rds_recvmsg_zcookie(rs, msg);
|
||||
|
||||
rds_stats_inc(s_recv_delivered);
|
||||
|
||||
|
|
|
@ -344,27 +344,53 @@ static int do_setup_tx(int domain, int type, int protocol)
|
|||
return fd;
|
||||
}
|
||||
|
||||
static int do_process_zerocopy_cookies(struct sock_extended_err *serr,
|
||||
uint32_t *ckbuf, size_t nbytes)
|
||||
static uint32_t do_process_zerocopy_cookies(struct rds_zcopy_cookies *ck)
|
||||
{
|
||||
int ncookies, i;
|
||||
int i;
|
||||
|
||||
if (serr->ee_errno != 0)
|
||||
error(1, 0, "serr: wrong error code: %u", serr->ee_errno);
|
||||
ncookies = serr->ee_data;
|
||||
if (ncookies > SO_EE_ORIGIN_MAX_ZCOOKIES)
|
||||
if (ck->num > RDS_MAX_ZCOOKIES)
|
||||
error(1, 0, "Returned %d cookies, max expected %d\n",
|
||||
ncookies, SO_EE_ORIGIN_MAX_ZCOOKIES);
|
||||
if (nbytes != ncookies * sizeof(uint32_t))
|
||||
error(1, 0, "Expected %d cookies, got %ld\n",
|
||||
ncookies, nbytes/sizeof(uint32_t));
|
||||
for (i = 0; i < ncookies; i++)
|
||||
ck->num, RDS_MAX_ZCOOKIES);
|
||||
for (i = 0; i < ck->num; i++)
|
||||
if (cfg_verbose >= 2)
|
||||
fprintf(stderr, "%d\n", ckbuf[i]);
|
||||
return ncookies;
|
||||
fprintf(stderr, "%d\n", ck->cookies[i]);
|
||||
return ck->num;
|
||||
}
|
||||
|
||||
static bool do_recv_completion(int fd)
|
||||
static bool do_recvmsg_completion(int fd)
|
||||
{
|
||||
char cmsgbuf[CMSG_SPACE(sizeof(struct rds_zcopy_cookies))];
|
||||
struct rds_zcopy_cookies *ck;
|
||||
struct cmsghdr *cmsg;
|
||||
struct msghdr msg;
|
||||
bool ret = false;
|
||||
|
||||
memset(&msg, 0, sizeof(msg));
|
||||
msg.msg_control = cmsgbuf;
|
||||
msg.msg_controllen = sizeof(cmsgbuf);
|
||||
|
||||
if (recvmsg(fd, &msg, MSG_DONTWAIT))
|
||||
return ret;
|
||||
|
||||
if (msg.msg_flags & MSG_CTRUNC)
|
||||
error(1, errno, "recvmsg notification: truncated");
|
||||
|
||||
for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
|
||||
if (cmsg->cmsg_level == SOL_RDS &&
|
||||
cmsg->cmsg_type == RDS_CMSG_ZCOPY_COMPLETION) {
|
||||
|
||||
ck = (struct rds_zcopy_cookies *)CMSG_DATA(cmsg);
|
||||
completions += do_process_zerocopy_cookies(ck);
|
||||
ret = true;
|
||||
break;
|
||||
}
|
||||
error(0, 0, "ignoring cmsg at level %d type %d\n",
|
||||
cmsg->cmsg_level, cmsg->cmsg_type);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool do_recv_completion(int fd, int domain)
|
||||
{
|
||||
struct sock_extended_err *serr;
|
||||
struct msghdr msg = {};
|
||||
|
@ -372,17 +398,13 @@ static bool do_recv_completion(int fd)
|
|||
uint32_t hi, lo, range;
|
||||
int ret, zerocopy;
|
||||
char control[100];
|
||||
uint32_t ckbuf[SO_EE_ORIGIN_MAX_ZCOOKIES];
|
||||
struct iovec iov;
|
||||
|
||||
if (domain == PF_RDS)
|
||||
return do_recvmsg_completion(fd);
|
||||
|
||||
msg.msg_control = control;
|
||||
msg.msg_controllen = sizeof(control);
|
||||
|
||||
iov.iov_base = ckbuf;
|
||||
iov.iov_len = (SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(ckbuf[0]));
|
||||
msg.msg_iov = &iov;
|
||||
msg.msg_iovlen = 1;
|
||||
|
||||
ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
|
||||
if (ret == -1 && errno == EAGAIN)
|
||||
return false;
|
||||
|
@ -402,10 +424,6 @@ static bool do_recv_completion(int fd)
|
|||
|
||||
serr = (void *) CMSG_DATA(cm);
|
||||
|
||||
if (serr->ee_origin == SO_EE_ORIGIN_ZCOOKIE) {
|
||||
completions += do_process_zerocopy_cookies(serr, ckbuf, ret);
|
||||
return true;
|
||||
}
|
||||
if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
|
||||
error(1, 0, "serr: wrong origin: %u", serr->ee_origin);
|
||||
if (serr->ee_errno != 0)
|
||||
|
@ -440,20 +458,20 @@ static bool do_recv_completion(int fd)
|
|||
}
|
||||
|
||||
/* Read all outstanding messages on the errqueue */
|
||||
static void do_recv_completions(int fd)
|
||||
static void do_recv_completions(int fd, int domain)
|
||||
{
|
||||
while (do_recv_completion(fd)) {}
|
||||
while (do_recv_completion(fd, domain)) {}
|
||||
}
|
||||
|
||||
/* Wait for all remaining completions on the errqueue */
|
||||
static void do_recv_remaining_completions(int fd)
|
||||
static void do_recv_remaining_completions(int fd, int domain)
|
||||
{
|
||||
int64_t tstop = gettimeofday_ms() + cfg_waittime_ms;
|
||||
|
||||
while (completions < expected_completions &&
|
||||
gettimeofday_ms() < tstop) {
|
||||
if (do_poll(fd, POLLERR))
|
||||
do_recv_completions(fd);
|
||||
if (do_poll(fd, domain == PF_RDS ? POLLIN : POLLERR))
|
||||
do_recv_completions(fd, domain);
|
||||
}
|
||||
|
||||
if (completions < expected_completions)
|
||||
|
@ -534,13 +552,13 @@ static void do_tx(int domain, int type, int protocol)
|
|||
|
||||
while (!do_poll(fd, POLLOUT)) {
|
||||
if (cfg_zerocopy)
|
||||
do_recv_completions(fd);
|
||||
do_recv_completions(fd, domain);
|
||||
}
|
||||
|
||||
} while (gettimeofday_ms() < tstop);
|
||||
|
||||
if (cfg_zerocopy)
|
||||
do_recv_remaining_completions(fd);
|
||||
do_recv_remaining_completions(fd, domain);
|
||||
|
||||
if (close(fd))
|
||||
error(1, errno, "close");
|
||||
|
@ -631,40 +649,6 @@ static void do_flush_datagram(int fd, int type)
|
|||
bytes += cfg_payload_len;
|
||||
}
|
||||
|
||||
|
||||
static void do_recvmsg(int fd)
|
||||
{
|
||||
int ret, off = 0;
|
||||
char *buf;
|
||||
struct iovec iov;
|
||||
struct msghdr msg;
|
||||
struct sockaddr_storage din;
|
||||
|
||||
buf = calloc(cfg_payload_len, sizeof(char));
|
||||
iov.iov_base = buf;
|
||||
iov.iov_len = cfg_payload_len;
|
||||
|
||||
memset(&msg, 0, sizeof(msg));
|
||||
msg.msg_name = &din;
|
||||
msg.msg_namelen = sizeof(din);
|
||||
msg.msg_iov = &iov;
|
||||
msg.msg_iovlen = 1;
|
||||
|
||||
ret = recvmsg(fd, &msg, MSG_TRUNC);
|
||||
|
||||
if (ret == -1)
|
||||
error(1, errno, "recv");
|
||||
if (ret != cfg_payload_len)
|
||||
error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len);
|
||||
|
||||
if (memcmp(buf + off, payload, ret))
|
||||
error(1, 0, "recv: data mismatch");
|
||||
|
||||
free(buf);
|
||||
packets++;
|
||||
bytes += cfg_payload_len;
|
||||
}
|
||||
|
||||
static void do_rx(int domain, int type, int protocol)
|
||||
{
|
||||
uint64_t tstop;
|
||||
|
@ -676,8 +660,6 @@ static void do_rx(int domain, int type, int protocol)
|
|||
do {
|
||||
if (type == SOCK_STREAM)
|
||||
do_flush_tcp(fd);
|
||||
else if (domain == PF_RDS)
|
||||
do_recvmsg(fd);
|
||||
else
|
||||
do_flush_datagram(fd, type);
|
||||
|
||||
|
|
Loading…
Reference in New Issue