From ae67bd3821bb0a54d97e7883d211196637d487a9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:44 -0400 Subject: [PATCH 01/69] SUNRPC: Fix up task signalling The RPC_TASK_KILLED flag should really not be set from another context because it can clobber data in the struct task when task->tk_flags is changed non-atomically. Let's therefore swap out RPC_TASK_KILLED with an atomic flag, and add a function to set that flag and safely wake up the task. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/lockd/clntproc.c | 4 ++-- fs/nfsd/nfs4callback.c | 4 ++-- include/linux/sunrpc/sched.h | 6 ++++-- include/trace/events/sunrpc.h | 6 +++--- net/sunrpc/clnt.c | 14 ++------------ net/sunrpc/sched.c | 28 +++++++++++++++++++++++----- net/sunrpc/xprt.c | 4 ++++ 7 files changed, 40 insertions(+), 26 deletions(-) diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index e8a004097d18..d9c32d1a20c0 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -715,7 +715,7 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) struct nlm_rqst *req = data; u32 status = ntohl(req->a_res.status); - if (RPC_ASSASSINATED(task)) + if (RPC_SIGNALLED(task)) goto die; if (task->tk_status < 0) { @@ -783,7 +783,7 @@ static void nlmclnt_cancel_callback(struct rpc_task *task, void *data) struct nlm_rqst *req = data; u32 status = ntohl(req->a_res.status); - if (RPC_ASSASSINATED(task)) + if (RPC_SIGNALLED(task)) goto die; if (task->tk_status < 0) { diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index d219159b98af..f7494be8dbe2 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1032,7 +1032,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback * the submission code will error out, so we don't need to * handle that case here. */ - if (task->tk_flags & RPC_TASK_KILLED) + if (RPC_SIGNALLED(task)) goto need_restart; return true; @@ -1081,7 +1081,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback dprintk("%s: freed slot, new seqid=%d\n", __func__, clp->cl_cb_session->se_cb_seq_nr); - if (task->tk_flags & RPC_TASK_KILLED) + if (RPC_SIGNALLED(task)) goto need_restart; out: return ret; diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 52d41d0c1ae1..90e06c67f455 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -125,7 +125,6 @@ struct rpc_task_setup { #define RPC_CALL_MAJORSEEN 0x0020 /* major timeout seen */ #define RPC_TASK_ROOTCREDS 0x0040 /* force root creds */ #define RPC_TASK_DYNAMIC 0x0080 /* task was kmalloc'ed */ -#define RPC_TASK_KILLED 0x0100 /* task was killed */ #define RPC_TASK_SOFT 0x0200 /* Use soft timeouts */ #define RPC_TASK_SOFTCONN 0x0400 /* Fail if can't connect */ #define RPC_TASK_SENT 0x0800 /* message was sent */ @@ -135,7 +134,6 @@ struct rpc_task_setup { #define RPC_IS_ASYNC(t) ((t)->tk_flags & RPC_TASK_ASYNC) #define RPC_IS_SWAPPER(t) ((t)->tk_flags & RPC_TASK_SWAPPER) -#define RPC_ASSASSINATED(t) ((t)->tk_flags & RPC_TASK_KILLED) #define RPC_IS_SOFT(t) ((t)->tk_flags & (RPC_TASK_SOFT|RPC_TASK_TIMEOUT)) #define RPC_IS_SOFTCONN(t) ((t)->tk_flags & RPC_TASK_SOFTCONN) #define RPC_WAS_SENT(t) ((t)->tk_flags & RPC_TASK_SENT) @@ -146,6 +144,7 @@ struct rpc_task_setup { #define RPC_TASK_NEED_XMIT 3 #define RPC_TASK_NEED_RECV 4 #define RPC_TASK_MSG_PIN_WAIT 5 +#define RPC_TASK_SIGNALLED 6 #define RPC_IS_RUNNING(t) test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) #define rpc_set_running(t) set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) @@ -169,6 +168,8 @@ struct rpc_task_setup { #define RPC_IS_ACTIVATED(t) test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate) +#define RPC_SIGNALLED(t) test_bit(RPC_TASK_SIGNALLED, &(t)->tk_runstate) + /* * Task priorities. * Note: if you change these, you must also change @@ -217,6 +218,7 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *); struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req); void rpc_put_task(struct rpc_task *); void rpc_put_task_async(struct rpc_task *); +void rpc_signal_task(struct rpc_task *); void rpc_exit_task(struct rpc_task *); void rpc_exit(struct rpc_task *, int); void rpc_release_calldata(const struct rpc_call_ops *, void *); diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 7e899e635d33..5e3b77d9daa7 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -82,7 +82,6 @@ TRACE_DEFINE_ENUM(RPC_TASK_SWAPPER); TRACE_DEFINE_ENUM(RPC_CALL_MAJORSEEN); TRACE_DEFINE_ENUM(RPC_TASK_ROOTCREDS); TRACE_DEFINE_ENUM(RPC_TASK_DYNAMIC); -TRACE_DEFINE_ENUM(RPC_TASK_KILLED); TRACE_DEFINE_ENUM(RPC_TASK_SOFT); TRACE_DEFINE_ENUM(RPC_TASK_SOFTCONN); TRACE_DEFINE_ENUM(RPC_TASK_SENT); @@ -97,7 +96,6 @@ TRACE_DEFINE_ENUM(RPC_TASK_NO_RETRANS_TIMEOUT); { RPC_CALL_MAJORSEEN, "MAJORSEEN" }, \ { RPC_TASK_ROOTCREDS, "ROOTCREDS" }, \ { RPC_TASK_DYNAMIC, "DYNAMIC" }, \ - { RPC_TASK_KILLED, "KILLED" }, \ { RPC_TASK_SOFT, "SOFT" }, \ { RPC_TASK_SOFTCONN, "SOFTCONN" }, \ { RPC_TASK_SENT, "SENT" }, \ @@ -111,6 +109,7 @@ TRACE_DEFINE_ENUM(RPC_TASK_ACTIVE); TRACE_DEFINE_ENUM(RPC_TASK_NEED_XMIT); TRACE_DEFINE_ENUM(RPC_TASK_NEED_RECV); TRACE_DEFINE_ENUM(RPC_TASK_MSG_PIN_WAIT); +TRACE_DEFINE_ENUM(RPC_TASK_SIGNALLED); #define rpc_show_runstate(flags) \ __print_flags(flags, "|", \ @@ -119,7 +118,8 @@ TRACE_DEFINE_ENUM(RPC_TASK_MSG_PIN_WAIT); { (1UL << RPC_TASK_ACTIVE), "ACTIVE" }, \ { (1UL << RPC_TASK_NEED_XMIT), "NEED_XMIT" }, \ { (1UL << RPC_TASK_NEED_RECV), "NEED_RECV" }, \ - { (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" }) + { (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" }, \ + { (1UL << RPC_TASK_SIGNALLED), "SIGNALLED" }) DECLARE_EVENT_CLASS(rpc_task_running, diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8ff11dc98d7f..18f5392aa550 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -827,14 +827,8 @@ void rpc_killall_tasks(struct rpc_clnt *clnt) * Spin lock all_tasks to prevent changes... */ spin_lock(&clnt->cl_lock); - list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) { - if (!RPC_IS_ACTIVATED(rovr)) - continue; - if (!(rovr->tk_flags & RPC_TASK_KILLED)) { - rovr->tk_flags |= RPC_TASK_KILLED; - rpc_exit(rovr, -EIO); - } - } + list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) + rpc_signal_task(rovr); spin_unlock(&clnt->cl_lock); } EXPORT_SYMBOL_GPL(rpc_killall_tasks); @@ -1477,8 +1471,6 @@ EXPORT_SYMBOL_GPL(rpc_force_rebind); int rpc_restart_call_prepare(struct rpc_task *task) { - if (RPC_ASSASSINATED(task)) - return 0; task->tk_action = call_start; task->tk_status = 0; if (task->tk_ops->rpc_call_prepare != NULL) @@ -1494,8 +1486,6 @@ EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); int rpc_restart_call(struct rpc_task *task) { - if (RPC_ASSASSINATED(task)) - return 0; task->tk_action = call_start; task->tk_status = 0; return 1; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 28956c70100a..3d6cb91ba598 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -759,8 +759,7 @@ static void rpc_reset_task_statistics(struct rpc_task *task) { task->tk_timeouts = 0; - task->tk_flags &= ~(RPC_CALL_MAJORSEEN|RPC_TASK_KILLED|RPC_TASK_SENT); - + task->tk_flags &= ~(RPC_CALL_MAJORSEEN|RPC_TASK_SENT); rpc_init_task_statistics(task); } @@ -773,7 +772,6 @@ void rpc_exit_task(struct rpc_task *task) if (task->tk_ops->rpc_call_done != NULL) { task->tk_ops->rpc_call_done(task, task->tk_calldata); if (task->tk_action != NULL) { - WARN_ON(RPC_ASSASSINATED(task)); /* Always release the RPC slot and buffer memory */ xprt_release(task); rpc_reset_task_statistics(task); @@ -781,6 +779,19 @@ void rpc_exit_task(struct rpc_task *task) } } +void rpc_signal_task(struct rpc_task *task) +{ + struct rpc_wait_queue *queue; + + if (!RPC_IS_ACTIVATED(task)) + return; + set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); + smp_mb__after_atomic(); + queue = READ_ONCE(task->tk_waitqueue); + if (queue) + rpc_wake_up_queued_task_set_status(queue, task, -ERESTARTSYS); +} + void rpc_exit(struct rpc_task *task, int status) { task->tk_status = status; @@ -836,6 +847,13 @@ static void __rpc_execute(struct rpc_task *task) */ if (!RPC_IS_QUEUED(task)) continue; + + /* + * Signalled tasks should exit rather than sleep. + */ + if (RPC_SIGNALLED(task)) + rpc_exit(task, -ERESTARTSYS); + /* * The queue->lock protects against races with * rpc_make_runnable(). @@ -861,7 +879,7 @@ static void __rpc_execute(struct rpc_task *task) status = out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_QUEUED, rpc_wait_bit_killable, TASK_KILLABLE); - if (status == -ERESTARTSYS) { + if (status < 0) { /* * When a sync task receives a signal, it exits with * -ERESTARTSYS. In order to catch any callbacks that @@ -869,7 +887,7 @@ static void __rpc_execute(struct rpc_task *task) * break the loop here, but go around once more. */ dprintk("RPC: %5u got signal\n", task->tk_pid); - task->tk_flags |= RPC_TASK_KILLED; + set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); rpc_exit(task, -ERESTARTSYS); } dprintk("RPC: %5u sync task resuming\n", task->tk_pid); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index d7117d241460..3a4156cb0134 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1337,6 +1337,10 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) if (status < 0) goto out_dequeue; } + if (RPC_SIGNALLED(task)) { + status = -ERESTARTSYS; + goto out_dequeue; + } } /* From 9e6fa0bb84beeff4dddb17d7c23e35135fe977c5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:45 -0400 Subject: [PATCH 02/69] SUNRPC: Refactor rpc_restart_call/rpc_restart_call_prepare Clean up. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 18f5392aa550..af1dfc2a8fb1 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1464,20 +1464,13 @@ void rpc_force_rebind(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_force_rebind); -/* - * Restart an (async) RPC call from the call_prepare state. - * Usually called from within the exit handler. - */ -int -rpc_restart_call_prepare(struct rpc_task *task) +static int +__rpc_restart_call(struct rpc_task *task, void (*action)(struct rpc_task *)) { - task->tk_action = call_start; task->tk_status = 0; - if (task->tk_ops->rpc_call_prepare != NULL) - task->tk_action = rpc_prepare_task; + task->tk_action = action; return 1; } -EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); /* * Restart an (async) RPC call. Usually called from within the @@ -1486,12 +1479,23 @@ EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); int rpc_restart_call(struct rpc_task *task) { - task->tk_action = call_start; - task->tk_status = 0; - return 1; + return __rpc_restart_call(task, call_start); } EXPORT_SYMBOL_GPL(rpc_restart_call); +/* + * Restart an (async) RPC call from the call_prepare state. + * Usually called from within the exit handler. + */ +int +rpc_restart_call_prepare(struct rpc_task *task) +{ + if (task->tk_ops->rpc_call_prepare != NULL) + return __rpc_restart_call(task, rpc_prepare_task); + return rpc_restart_call(task); +} +EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); + const char *rpc_proc_name(const struct rpc_task *task) { From 8ba6a92d0182091e0c2fa15c1a5b5458bac25fc3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:46 -0400 Subject: [PATCH 03/69] SUNRPC: Refactor xprt_request_wait_receive() Convert the transport callback to actually put the request to sleep instead of just setting a timeout. This is in preparation for rpc_sleep_on_timeout(). Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 6 +- net/sunrpc/xprt.c | 79 ++++++++++++---------- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 2 +- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtsock.c | 8 +-- 5 files changed, 51 insertions(+), 46 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 3a391544299e..a6d9fce7f20e 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -143,7 +143,7 @@ struct rpc_xprt_ops { void (*buf_free)(struct rpc_task *task); void (*prepare_request)(struct rpc_rqst *req); int (*send_request)(struct rpc_rqst *req); - void (*set_retrans_timeout)(struct rpc_task *task); + void (*wait_for_reply_request)(struct rpc_task *task); void (*timer)(struct rpc_xprt *xprt, struct rpc_task *task); void (*release_request)(struct rpc_task *task); void (*close)(struct rpc_xprt *xprt); @@ -378,8 +378,8 @@ xprt_disable_swap(struct rpc_xprt *xprt) int xprt_register_transport(struct xprt_class *type); int xprt_unregister_transport(struct xprt_class *type); int xprt_load_transport(const char *); -void xprt_set_retrans_timeout_def(struct rpc_task *task); -void xprt_set_retrans_timeout_rtt(struct rpc_task *task); +void xprt_wait_for_reply_request_def(struct rpc_task *task); +void xprt_wait_for_reply_request_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); void xprt_wait_for_buffer_space(struct rpc_xprt *xprt); bool xprt_write_space(struct rpc_xprt *xprt); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 3a4156cb0134..5afffa669d04 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -554,41 +554,6 @@ bool xprt_write_space(struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(xprt_write_space); -/** - * xprt_set_retrans_timeout_def - set a request's retransmit timeout - * @task: task whose timeout is to be set - * - * Set a request's retransmit timeout based on the transport's - * default timeout parameters. Used by transports that don't adjust - * the retransmit timeout based on round-trip time estimation. - */ -void xprt_set_retrans_timeout_def(struct rpc_task *task) -{ - task->tk_timeout = task->tk_rqstp->rq_timeout; -} -EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def); - -/** - * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout - * @task: task whose timeout is to be set - * - * Set a request's retransmit timeout using the RTT estimator. - */ -void xprt_set_retrans_timeout_rtt(struct rpc_task *task) -{ - int timer = task->tk_msg.rpc_proc->p_timer; - struct rpc_clnt *clnt = task->tk_client; - struct rpc_rtt *rtt = clnt->cl_rtt; - struct rpc_rqst *req = task->tk_rqstp; - unsigned long max_timeout = clnt->cl_timeout->to_maxval; - - task->tk_timeout = rpc_calc_rto(rtt, timer); - task->tk_timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries; - if (task->tk_timeout > max_timeout || task->tk_timeout == 0) - task->tk_timeout = max_timeout; -} -EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_rtt); - static void xprt_reset_majortimeo(struct rpc_rqst *req) { const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout; @@ -1102,6 +1067,47 @@ static void xprt_timer(struct rpc_task *task) task->tk_status = 0; } +/** + * xprt_wait_for_reply_request_def - wait for reply + * @task: pointer to rpc_task + * + * Set a request's retransmit timeout based on the transport's + * default timeout parameters. Used by transports that don't adjust + * the retransmit timeout based on round-trip time estimation, + * and put the task to sleep on the pending queue. + */ +void xprt_wait_for_reply_request_def(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + task->tk_timeout = req->rq_timeout; + rpc_sleep_on(&req->rq_xprt->pending, task, xprt_timer); +} +EXPORT_SYMBOL_GPL(xprt_wait_for_reply_request_def); + +/** + * xprt_wait_for_reply_request_rtt - wait for reply using RTT estimator + * @task: pointer to rpc_task + * + * Set a request's retransmit timeout using the RTT estimator, + * and put the task to sleep on the pending queue. + */ +void xprt_wait_for_reply_request_rtt(struct rpc_task *task) +{ + int timer = task->tk_msg.rpc_proc->p_timer; + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rtt *rtt = clnt->cl_rtt; + struct rpc_rqst *req = task->tk_rqstp; + unsigned long max_timeout = clnt->cl_timeout->to_maxval; + + task->tk_timeout = rpc_calc_rto(rtt, timer); + task->tk_timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries; + if (task->tk_timeout > max_timeout || task->tk_timeout == 0) + task->tk_timeout = max_timeout; + rpc_sleep_on(&req->rq_xprt->pending, task, xprt_timer); +} +EXPORT_SYMBOL_GPL(xprt_wait_for_reply_request_rtt); + /** * xprt_request_wait_receive - wait for the reply to an RPC request * @task: RPC task about to send a request @@ -1121,8 +1127,7 @@ void xprt_request_wait_receive(struct rpc_task *task) */ spin_lock(&xprt->queue_lock); if (test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) { - xprt->ops->set_retrans_timeout(task); - rpc_sleep_on(&xprt->pending, task, xprt_timer); + xprt->ops->wait_for_reply_request(task); /* * Send an extra queue wakeup call if the * connection was dropped in case the call to diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 907464c2a9f0..bed57d8b5c19 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -261,7 +261,7 @@ static const struct rpc_xprt_ops xprt_rdma_bc_procs = { .buf_alloc = xprt_rdma_bc_allocate, .buf_free = xprt_rdma_bc_free, .send_request = xprt_rdma_bc_send_request, - .set_retrans_timeout = xprt_set_retrans_timeout_def, + .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = xprt_rdma_bc_close, .destroy = xprt_rdma_bc_put, .print_stats = xprt_rdma_print_stats diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 5d261353bd90..7e73abe01cfe 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -815,7 +815,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = { .alloc_slot = xprt_rdma_alloc_slot, .free_slot = xprt_rdma_free_slot, .release_request = xprt_release_rqst_cong, /* ditto */ - .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ + .wait_for_reply_request = xprt_wait_for_reply_request_def, /* ditto */ .timer = xprt_rdma_timer, .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ .set_port = xprt_rdma_set_port, diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 732d4b57411a..b4b4b8db143c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2690,7 +2690,7 @@ static const struct rpc_xprt_ops xs_local_ops = { .buf_free = rpc_free, .prepare_request = xs_stream_prepare_request, .send_request = xs_local_send_request, - .set_retrans_timeout = xprt_set_retrans_timeout_def, + .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = xs_close, .destroy = xs_destroy, .print_stats = xs_local_print_stats, @@ -2710,7 +2710,7 @@ static const struct rpc_xprt_ops xs_udp_ops = { .buf_alloc = rpc_malloc, .buf_free = rpc_free, .send_request = xs_udp_send_request, - .set_retrans_timeout = xprt_set_retrans_timeout_rtt, + .wait_for_reply_request = xprt_wait_for_reply_request_rtt, .timer = xs_udp_timer, .release_request = xprt_release_rqst_cong, .close = xs_close, @@ -2733,7 +2733,7 @@ static const struct rpc_xprt_ops xs_tcp_ops = { .buf_free = rpc_free, .prepare_request = xs_stream_prepare_request, .send_request = xs_tcp_send_request, - .set_retrans_timeout = xprt_set_retrans_timeout_def, + .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = xs_tcp_shutdown, .destroy = xs_destroy, .set_connect_timeout = xs_tcp_set_connect_timeout, @@ -2761,7 +2761,7 @@ static const struct rpc_xprt_ops bc_tcp_ops = { .buf_alloc = bc_malloc, .buf_free = bc_free, .send_request = bc_send_request, - .set_retrans_timeout = xprt_set_retrans_timeout_def, + .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = bc_close, .destroy = bc_destroy, .print_stats = xs_tcp_print_stats, From 87150aaed9e55d8b18a94aa2589aa4331429fce8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:47 -0400 Subject: [PATCH 04/69] SUNRPC: Refactor rpc_sleep_on() rpc_sleep_on() does not need to set the task->tk_callback under the queue lock, so move that out. Also refactor the check for whether the task is active. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/sched.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 3d6cb91ba598..8e96a841dd11 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -362,7 +362,6 @@ static void rpc_make_runnable(struct workqueue_struct *wq, */ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, - rpc_action action, unsigned char queue_priority) { dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n", @@ -372,27 +371,39 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, __rpc_add_wait_queue(q, task, queue_priority); - WARN_ON_ONCE(task->tk_callback != NULL); - task->tk_callback = action; __rpc_add_timer(q, task); } +static void rpc_set_tk_callback(struct rpc_task *task, rpc_action action) +{ + if (action && !WARN_ON_ONCE(task->tk_callback != NULL)) + task->tk_callback = action; +} + +static bool rpc_sleep_check_activated(struct rpc_task *task) +{ + /* We shouldn't ever put an inactive task to sleep */ + if (WARN_ON_ONCE(!RPC_IS_ACTIVATED(task))) { + task->tk_status = -EIO; + rpc_put_task_async(task); + return false; + } + return true; +} + void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, rpc_action action) { - /* We shouldn't ever put an inactive task to sleep */ - WARN_ON_ONCE(!RPC_IS_ACTIVATED(task)); - if (!RPC_IS_ACTIVATED(task)) { - task->tk_status = -EIO; - rpc_put_task_async(task); + if (!rpc_sleep_check_activated(task)) return; - } + + rpc_set_tk_callback(task, action); /* * Protect the queue operations. */ spin_lock_bh(&q->lock); - __rpc_sleep_on_priority(q, task, action, task->tk_priority); + __rpc_sleep_on_priority(q, task, task->tk_priority); spin_unlock_bh(&q->lock); } EXPORT_SYMBOL_GPL(rpc_sleep_on); @@ -400,19 +411,16 @@ EXPORT_SYMBOL_GPL(rpc_sleep_on); void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, rpc_action action, int priority) { - /* We shouldn't ever put an inactive task to sleep */ - WARN_ON_ONCE(!RPC_IS_ACTIVATED(task)); - if (!RPC_IS_ACTIVATED(task)) { - task->tk_status = -EIO; - rpc_put_task_async(task); + if (!rpc_sleep_check_activated(task)) return; - } + + rpc_set_tk_callback(task, action); /* * Protect the queue operations. */ spin_lock_bh(&q->lock); - __rpc_sleep_on_priority(q, task, action, priority - RPC_PRIORITY_LOW); + __rpc_sleep_on_priority(q, task, priority - RPC_PRIORITY_LOW); spin_unlock_bh(&q->lock); } EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); From 8357a9b60fe7500699a9dec540ca1c48df3cb455 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:48 -0400 Subject: [PATCH 05/69] SUNRPC: Remove unused argument 'action' from rpc_sleep_on_priority() None of the callers set the 'action' argument, so let's just remove it. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 2 +- include/linux/sunrpc/sched.h | 1 - net/sunrpc/sched.c | 7 +++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 741ff8c9c6ed..222eccd8a715 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1008,7 +1008,7 @@ int nfs4_setup_sequence(struct nfs_client *client, out_sleep: if (args->sa_privileged) rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, - NULL, RPC_PRIORITY_PRIVILEGED); + RPC_PRIORITY_PRIVILEGED); else rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); spin_unlock(&tbl->slot_tbl_lock); diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 90e06c67f455..5d517578a018 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -231,7 +231,6 @@ void rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *, rpc_action action); void rpc_sleep_on_priority(struct rpc_wait_queue *, struct rpc_task *, - rpc_action action, int priority); void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, struct rpc_wait_queue *queue, diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 8e96a841dd11..04170c08b2cf 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -409,18 +409,17 @@ void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, EXPORT_SYMBOL_GPL(rpc_sleep_on); void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, - rpc_action action, int priority) + int priority) { if (!rpc_sleep_check_activated(task)) return; - rpc_set_tk_callback(task, action); - + priority -= RPC_PRIORITY_LOW; /* * Protect the queue operations. */ spin_lock_bh(&q->lock); - __rpc_sleep_on_priority(q, task, priority - RPC_PRIORITY_LOW); + __rpc_sleep_on_priority(q, task, priority); spin_unlock_bh(&q->lock); } EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); From 6b2e6856275d7b8d0acbf06d2e8da72e1a6bc857 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:49 -0400 Subject: [PATCH 06/69] SUNRPC: Add function rpc_sleep_on_timeout() Clean up the RPC task sleep interfaces by replacing the task->tk_timeout 'hidden parameter' to rpc_sleep_on() with a new function that takes an absolute timeout. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 18 ++++++--- include/linux/sunrpc/sched.h | 9 ++++- net/sunrpc/auth_gss/auth_gss.c | 5 +-- net/sunrpc/clnt.c | 1 - net/sunrpc/rpcb_clnt.c | 3 +- net/sunrpc/sched.c | 73 +++++++++++++++++++++++++++------- net/sunrpc/xprt.c | 36 ++++++++++------- 7 files changed, 103 insertions(+), 42 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 222eccd8a715..96b988689f0e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -978,10 +978,8 @@ int nfs4_setup_sequence(struct nfs_client *client, if (res->sr_slot != NULL) goto out_start; - if (session) { + if (session) tbl = &session->fc_slot_table; - task->tk_timeout = 0; - } spin_lock(&tbl->slot_tbl_lock); /* The state manager will wait until the slot table is empty */ @@ -990,9 +988,8 @@ int nfs4_setup_sequence(struct nfs_client *client, slot = nfs4_alloc_slot(tbl); if (IS_ERR(slot)) { - /* Try again in 1/4 second */ if (slot == ERR_PTR(-ENOMEM)) - task->tk_timeout = HZ >> 2; + goto out_sleep_timeout; goto out_sleep; } spin_unlock(&tbl->slot_tbl_lock); @@ -1004,7 +1001,16 @@ int nfs4_setup_sequence(struct nfs_client *client, nfs41_sequence_res_init(res); rpc_call_start(task); return 0; - +out_sleep_timeout: + /* Try again in 1/4 second */ + if (args->sa_privileged) + rpc_sleep_on_priority_timeout(&tbl->slot_tbl_waitq, task, + jiffies + (HZ >> 2), RPC_PRIORITY_PRIVILEGED); + else + rpc_sleep_on_timeout(&tbl->slot_tbl_waitq, task, + NULL, jiffies + (HZ >> 2)); + spin_unlock(&tbl->slot_tbl_lock); + return -EAGAIN; out_sleep: if (args->sa_privileged) rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 5d517578a018..61ba533ec058 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -35,7 +35,6 @@ struct rpc_wait { struct list_head list; /* wait queue links */ struct list_head links; /* Links to related tasks */ struct list_head timer_list; /* Timer list */ - unsigned long expires; }; /* @@ -227,8 +226,16 @@ void rpc_execute(struct rpc_task *); void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); void rpc_destroy_wait_queue(struct rpc_wait_queue *); +void rpc_sleep_on_timeout(struct rpc_wait_queue *queue, + struct rpc_task *task, + rpc_action action, + unsigned long timeout); void rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *, rpc_action action); +void rpc_sleep_on_priority_timeout(struct rpc_wait_queue *queue, + struct rpc_task *task, + unsigned long timeout, + int priority); void rpc_sleep_on_priority(struct rpc_wait_queue *, struct rpc_task *, int priority); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 3fd56c0c90ae..c055edfec55e 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -581,8 +581,8 @@ gss_refresh_upcall(struct rpc_task *task) /* XXX: warning on the first, under the assumption we * shouldn't normally hit this case on a refresh. */ warn_gssd(); - task->tk_timeout = 15*HZ; - rpc_sleep_on(&pipe_version_rpc_waitqueue, task, NULL); + rpc_sleep_on_timeout(&pipe_version_rpc_waitqueue, + task, NULL, jiffies + (15 * HZ)); err = -EAGAIN; goto out; } @@ -595,7 +595,6 @@ gss_refresh_upcall(struct rpc_task *task) if (gss_cred->gc_upcall != NULL) rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL); else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) { - task->tk_timeout = 0; gss_cred->gc_upcall = gss_msg; /* gss_upcall_callback will release the reference to gss_upcall_msg */ refcount_inc(&gss_msg->count); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index af1dfc2a8fb1..216d5e5e3b54 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1851,7 +1851,6 @@ call_bind(struct rpc_task *task) if (!xprt_prepare_transmit(task)) return; - task->tk_timeout = xprt->bind_timeout; xprt->ops->rpcbind(task); } diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 41a971ac1c63..18b0cf2a923f 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -694,7 +694,8 @@ void rpcb_getport_async(struct rpc_task *task) /* Put self on the wait queue to ensure we get notified if * some other task is already attempting to bind the port */ - rpc_sleep_on(&xprt->binding, task, NULL); + rpc_sleep_on_timeout(&xprt->binding, task, + NULL, jiffies + xprt->bind_timeout); if (xprt_test_and_set_binding(xprt)) { dprintk("RPC: %5u %s: waiting for another binder\n", diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 04170c08b2cf..7e0f7b83262f 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -66,7 +66,7 @@ struct workqueue_struct *xprtiod_workqueue __read_mostly; static void __rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task) { - if (task->tk_timeout == 0) + if (list_empty(&task->u.tk_wait.timer_list)) return; dprintk("RPC: %5u disabling timer\n", task->tk_pid); task->tk_timeout = 0; @@ -86,17 +86,15 @@ rpc_set_queue_timer(struct rpc_wait_queue *queue, unsigned long expires) * Set up a timer for the current task. */ static void -__rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) +__rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task, + unsigned long timeout) { - if (!task->tk_timeout) - return; - dprintk("RPC: %5u setting alarm for %u ms\n", - task->tk_pid, jiffies_to_msecs(task->tk_timeout)); + task->tk_pid, jiffies_to_msecs(timeout - jiffies)); - task->u.tk_wait.expires = jiffies + task->tk_timeout; - if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires)) - rpc_set_queue_timer(queue, task->u.tk_wait.expires); + task->tk_timeout = timeout; + if (list_empty(&queue->timer_list.list) || time_before(timeout, queue->timer_list.expires)) + rpc_set_queue_timer(queue, timeout); list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list); } @@ -188,6 +186,7 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, if (RPC_IS_QUEUED(task)) return; + INIT_LIST_HEAD(&task->u.tk_wait.timer_list); if (RPC_IS_PRIORITY(queue)) __rpc_add_wait_queue_priority(queue, task, queue_priority); else if (RPC_IS_SWAPPER(task)) @@ -371,7 +370,17 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, __rpc_add_wait_queue(q, task, queue_priority); - __rpc_add_timer(q, task); +} + +static void __rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q, + struct rpc_task *task, unsigned long timeout, + unsigned char queue_priority) +{ + if (time_is_after_jiffies(timeout)) { + __rpc_sleep_on_priority(q, task, queue_priority); + __rpc_add_timer(q, task, timeout); + } else + task->tk_status = -ETIMEDOUT; } static void rpc_set_tk_callback(struct rpc_task *task, rpc_action action) @@ -391,14 +400,32 @@ static bool rpc_sleep_check_activated(struct rpc_task *task) return true; } -void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, - rpc_action action) +void rpc_sleep_on_timeout(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action, unsigned long timeout) { if (!rpc_sleep_check_activated(task)) return; rpc_set_tk_callback(task, action); + /* + * Protect the queue operations. + */ + spin_lock_bh(&q->lock); + __rpc_sleep_on_priority_timeout(q, task, timeout, task->tk_priority); + spin_unlock_bh(&q->lock); +} +EXPORT_SYMBOL_GPL(rpc_sleep_on_timeout); + +void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action) +{ + if (!rpc_sleep_check_activated(task)) + return; + + rpc_set_tk_callback(task, action); + + WARN_ON_ONCE(task->tk_timeout != 0); /* * Protect the queue operations. */ @@ -408,12 +435,29 @@ void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, } EXPORT_SYMBOL_GPL(rpc_sleep_on); +void rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q, + struct rpc_task *task, unsigned long timeout, int priority) +{ + if (!rpc_sleep_check_activated(task)) + return; + + priority -= RPC_PRIORITY_LOW; + /* + * Protect the queue operations. + */ + spin_lock_bh(&q->lock); + __rpc_sleep_on_priority_timeout(q, task, timeout, priority); + spin_unlock_bh(&q->lock); +} +EXPORT_SYMBOL_GPL(rpc_sleep_on_priority_timeout); + void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, int priority) { if (!rpc_sleep_check_activated(task)) return; + WARN_ON_ONCE(task->tk_timeout != 0); priority -= RPC_PRIORITY_LOW; /* * Protect the queue operations. @@ -711,7 +755,7 @@ static void __rpc_queue_timer_fn(struct timer_list *t) spin_lock(&queue->lock); expires = now = jiffies; list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) { - timeo = task->u.tk_wait.expires; + timeo = task->tk_timeout; if (time_after_eq(now, timeo)) { dprintk("RPC: %5u timeout\n", task->tk_pid); task->tk_status = -ETIMEDOUT; @@ -737,8 +781,7 @@ static void __rpc_atrun(struct rpc_task *task) */ void rpc_delay(struct rpc_task *task, unsigned long delay) { - task->tk_timeout = delay; - rpc_sleep_on(&delay_queue, task, __rpc_atrun); + rpc_sleep_on_timeout(&delay_queue, task, __rpc_atrun, jiffies + delay); } EXPORT_SYMBOL_GPL(rpc_delay); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 5afffa669d04..7c3623b17493 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -209,9 +209,12 @@ int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) out_sleep: dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); - task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; task->tk_status = -EAGAIN; - rpc_sleep_on(&xprt->sending, task, NULL); + if (RPC_IS_SOFT(task)) + rpc_sleep_on_timeout(&xprt->sending, task, NULL, + jiffies + req->rq_timeout); + else + rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt); @@ -273,9 +276,12 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) xprt_clear_locked(xprt); out_sleep: dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); - task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; task->tk_status = -EAGAIN; - rpc_sleep_on(&xprt->sending, task, NULL); + if (RPC_IS_SOFT(task)) + rpc_sleep_on_timeout(&xprt->sending, task, NULL, + jiffies + req->rq_timeout); + else + rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong); @@ -787,9 +793,9 @@ void xprt_connect(struct rpc_task *task) xprt->ops->close(xprt); if (!xprt_connected(xprt)) { - task->tk_timeout = task->tk_rqstp->rq_timeout; task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie; - rpc_sleep_on(&xprt->pending, task, NULL); + rpc_sleep_on_timeout(&xprt->pending, task, NULL, + jiffies + task->tk_rqstp->rq_timeout); if (test_bit(XPRT_CLOSING, &xprt->state)) return; @@ -1080,8 +1086,8 @@ void xprt_wait_for_reply_request_def(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - task->tk_timeout = req->rq_timeout; - rpc_sleep_on(&req->rq_xprt->pending, task, xprt_timer); + rpc_sleep_on_timeout(&req->rq_xprt->pending, task, xprt_timer, + jiffies + req->rq_timeout); } EXPORT_SYMBOL_GPL(xprt_wait_for_reply_request_def); @@ -1099,12 +1105,14 @@ void xprt_wait_for_reply_request_rtt(struct rpc_task *task) struct rpc_rtt *rtt = clnt->cl_rtt; struct rpc_rqst *req = task->tk_rqstp; unsigned long max_timeout = clnt->cl_timeout->to_maxval; + unsigned long timeout; - task->tk_timeout = rpc_calc_rto(rtt, timer); - task->tk_timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries; - if (task->tk_timeout > max_timeout || task->tk_timeout == 0) - task->tk_timeout = max_timeout; - rpc_sleep_on(&req->rq_xprt->pending, task, xprt_timer); + timeout = rpc_calc_rto(rtt, timer); + timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries; + if (timeout > max_timeout || timeout == 0) + timeout = max_timeout; + rpc_sleep_on_timeout(&req->rq_xprt->pending, task, xprt_timer, + jiffies + timeout); } EXPORT_SYMBOL_GPL(xprt_wait_for_reply_request_rtt); @@ -1656,7 +1664,6 @@ void xprt_reserve(struct rpc_task *task) if (task->tk_rqstp != NULL) return; - task->tk_timeout = 0; task->tk_status = -EAGAIN; if (!xprt_throttle_congested(xprt, task)) xprt_do_reserve(xprt, task); @@ -1679,7 +1686,6 @@ void xprt_retry_reserve(struct rpc_task *task) if (task->tk_rqstp != NULL) return; - task->tk_timeout = 0; task->tk_status = -EAGAIN; xprt_do_reserve(xprt, task); } From 5efd1876e61fe61b61e2d056782027c11bcd0982 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:50 -0400 Subject: [PATCH 07/69] SUNRPC: Fix up tracking of timeouts Add a helper to ensure that debugfs and friends print out the correct current task timeout value. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 1 + include/trace/events/sunrpc.h | 2 +- net/sunrpc/clnt.c | 2 +- net/sunrpc/debugfs.c | 2 +- net/sunrpc/sched.c | 14 ++++++++++++++ 5 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 61ba533ec058..c5ad02c7a4b3 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -226,6 +226,7 @@ void rpc_execute(struct rpc_task *); void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); void rpc_destroy_wait_queue(struct rpc_wait_queue *); +unsigned long rpc_task_timeout(const struct rpc_task *task); void rpc_sleep_on_timeout(struct rpc_wait_queue *queue, struct rpc_task *task, rpc_action action, diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5e3b77d9daa7..dd301db64521 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -186,7 +186,7 @@ DECLARE_EVENT_CLASS(rpc_task_queued, __entry->client_id = task->tk_client ? task->tk_client->cl_clid : -1; __entry->task_id = task->tk_pid; - __entry->timeout = task->tk_timeout; + __entry->timeout = rpc_task_timeout(task); __entry->runstate = task->tk_runstate; __entry->status = task->tk_status; __entry->flags = task->tk_flags; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 216d5e5e3b54..b25f317d0ee2 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2874,7 +2874,7 @@ static void rpc_show_task(const struct rpc_clnt *clnt, printk(KERN_INFO "%5u %04x %6d %8p %8p %8ld %8p %sv%u %s a:%ps q:%s\n", task->tk_pid, task->tk_flags, task->tk_status, - clnt, task->tk_rqstp, task->tk_timeout, task->tk_ops, + clnt, task->tk_rqstp, rpc_task_timeout(task), task->tk_ops, clnt->cl_program->name, clnt->cl_vers, rpc_proc_name(task), task->tk_action, rpc_waitq); } diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index 19bb356230ed..95ebd76b132d 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -33,7 +33,7 @@ tasks_show(struct seq_file *f, void *v) seq_printf(f, "%5u %04x %6d 0x%x 0x%x %8ld %ps %sv%u %s a:%ps q:%s\n", task->tk_pid, task->tk_flags, task->tk_status, - clnt->cl_clid, xid, task->tk_timeout, task->tk_ops, + clnt->cl_clid, xid, rpc_task_timeout(task), task->tk_ops, clnt->cl_program->name, clnt->cl_vers, rpc_proc_name(task), task->tk_action, rpc_waitq); return 0; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 7e0f7b83262f..40944c34a9e4 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -58,6 +58,20 @@ static struct rpc_wait_queue delay_queue; struct workqueue_struct *rpciod_workqueue __read_mostly; struct workqueue_struct *xprtiod_workqueue __read_mostly; +unsigned long +rpc_task_timeout(const struct rpc_task *task) +{ + unsigned long timeout = READ_ONCE(task->tk_timeout); + + if (timeout != 0) { + unsigned long now = jiffies; + if (time_before(now, timeout)) + return timeout - now; + } + return 0; +} +EXPORT_SYMBOL_GPL(rpc_task_timeout); + /* * Disable the timer for a given RPC task. Should be called with * queue->lock and bh_disabled in order to avoid races within From 24a9d9a21e568f494198eea2bb864e0b6c593051 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:51 -0400 Subject: [PATCH 08/69] SUNRPC: Simplify queue timeouts using timer_reduce() Simplify the setting of queue timeouts by using the timer_reduce() function. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 1 - net/sunrpc/sched.c | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index c5ad02c7a4b3..bf9d6ee7f00f 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -183,7 +183,6 @@ struct rpc_task_setup { struct rpc_timer { struct timer_list timer; struct list_head list; - unsigned long expires; }; /* diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 40944c34a9e4..301e0f7f1dc9 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -92,8 +92,7 @@ __rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task) static void rpc_set_queue_timer(struct rpc_wait_queue *queue, unsigned long expires) { - queue->timer_list.expires = expires; - mod_timer(&queue->timer_list.timer, expires); + timer_reduce(&queue->timer_list.timer, expires); } /* @@ -107,8 +106,7 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task, task->tk_pid, jiffies_to_msecs(timeout - jiffies)); task->tk_timeout = timeout; - if (list_empty(&queue->timer_list.list) || time_before(timeout, queue->timer_list.expires)) - rpc_set_queue_timer(queue, timeout); + rpc_set_queue_timer(queue, timeout); list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list); } From 431235818bc3a919ca7487500c67c3144feece80 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:52 -0400 Subject: [PATCH 09/69] SUNRPC: Declare RPC timers as TIMER_DEFERRABLE Don't wake idle CPUs only for the purpose of servicing an RPC queue timeout. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/sched.c | 4 +++- net/sunrpc/xprt.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 301e0f7f1dc9..1a12fb03e611 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -249,7 +249,9 @@ static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const c queue->maxpriority = nr_queues - 1; rpc_reset_waitqueue_priority(queue); queue->qlen = 0; - timer_setup(&queue->timer_list.timer, __rpc_queue_timer_fn, 0); + timer_setup(&queue->timer_list.timer, + __rpc_queue_timer_fn, + TIMER_DEFERRABLE); INIT_LIST_HEAD(&queue->timer_list.list); rpc_assign_waitqueue_name(queue, qname); } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 7c3623b17493..36af1a1929af 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1842,7 +1842,9 @@ struct rpc_xprt *xprt_create_transport(struct xprt_create *args) xprt->idle_timeout = 0; INIT_WORK(&xprt->task_cleanup, xprt_autoclose); if (xprt_has_timer(xprt)) - timer_setup(&xprt->timer, xprt_init_autodisconnect, 0); + timer_setup(&xprt->timer, + xprt_init_autodisconnect, + TIMER_DEFERRABLE); else timer_setup(&xprt->timer, NULL, 0); From 9e910bff74be819aad751e82270682f3c405d199 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:53 -0400 Subject: [PATCH 10/69] SUNRPC: Ensure that the transport layer respect major timeouts Ensure that when in the transport layer, we don't sleep past a major timeout. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 36af1a1929af..642cc0f64e44 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -73,6 +73,15 @@ static void xprt_destroy(struct rpc_xprt *xprt); static DEFINE_SPINLOCK(xprt_list_lock); static LIST_HEAD(xprt_list); +static unsigned long xprt_request_timeout(const struct rpc_rqst *req) +{ + unsigned long timeout = jiffies + req->rq_timeout; + + if (time_before(timeout, req->rq_majortimeo)) + return timeout; + return req->rq_majortimeo; +} + /** * xprt_register_transport - register a transport implementation * @transport: transport to register @@ -212,7 +221,7 @@ int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) task->tk_status = -EAGAIN; if (RPC_IS_SOFT(task)) rpc_sleep_on_timeout(&xprt->sending, task, NULL, - jiffies + req->rq_timeout); + xprt_request_timeout(req)); else rpc_sleep_on(&xprt->sending, task, NULL); return 0; @@ -279,7 +288,7 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) task->tk_status = -EAGAIN; if (RPC_IS_SOFT(task)) rpc_sleep_on_timeout(&xprt->sending, task, NULL, - jiffies + req->rq_timeout); + xprt_request_timeout(req)); else rpc_sleep_on(&xprt->sending, task, NULL); return 0; @@ -795,7 +804,7 @@ void xprt_connect(struct rpc_task *task) if (!xprt_connected(xprt)) { task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie; rpc_sleep_on_timeout(&xprt->pending, task, NULL, - jiffies + task->tk_rqstp->rq_timeout); + xprt_request_timeout(task->tk_rqstp)); if (test_bit(XPRT_CLOSING, &xprt->state)) return; @@ -1087,7 +1096,7 @@ void xprt_wait_for_reply_request_def(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; rpc_sleep_on_timeout(&req->rq_xprt->pending, task, xprt_timer, - jiffies + req->rq_timeout); + xprt_request_timeout(req)); } EXPORT_SYMBOL_GPL(xprt_wait_for_reply_request_def); From 5ad64b36dda962797ce3ed579a27189ec7482d0d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:54 -0400 Subject: [PATCH 11/69] SUNRPC: Add tracking of RPC level errors Add variables to track RPC level errors so that we can distinguish between issue that arose in the RPC transport layer as opposed to those arising from the reply message. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 2 ++ net/sunrpc/clnt.c | 40 ++++++++++++++++++++++++------------ net/sunrpc/xprtsock.c | 1 + 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index bf9d6ee7f00f..d0e451868f02 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -61,6 +61,8 @@ struct rpc_task { struct rpc_wait tk_wait; /* RPC wait */ } u; + int tk_rpc_status; /* Result of last RPC operation */ + /* * RPC call state */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b25f317d0ee2..315f9c9cb72d 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1468,6 +1468,7 @@ static int __rpc_restart_call(struct rpc_task *task, void (*action)(struct rpc_task *)) { task->tk_status = 0; + task->tk_rpc_status = 0; task->tk_action = action; return 1; } @@ -1510,6 +1511,19 @@ const char return "no proc"; } +static void +__rpc_call_rpcerror(struct rpc_task *task, int tk_status, int rpc_status) +{ + task->tk_rpc_status = rpc_status; + rpc_exit(task, tk_status); +} + +static void +rpc_call_rpcerror(struct rpc_task *task, int status) +{ + __rpc_call_rpcerror(task, status, status); +} + /* * 0. Initial state * @@ -1574,7 +1588,7 @@ call_reserveresult(struct rpc_task *task) printk(KERN_ERR "%s: status=%d, but no request slot, exiting\n", __func__, status); - rpc_exit(task, -EIO); + rpc_call_rpcerror(task, -EIO); return; } @@ -1602,7 +1616,7 @@ call_reserveresult(struct rpc_task *task) __func__, status); break; } - rpc_exit(task, status); + rpc_call_rpcerror(task, status); } /* @@ -1670,7 +1684,7 @@ call_refreshresult(struct rpc_task *task) } dprintk("RPC: %5u %s: refresh creds failed with error %d\n", task->tk_pid, __func__, status); - rpc_exit(task, status); + rpc_call_rpcerror(task, status); } /* @@ -1721,7 +1735,7 @@ call_allocate(struct rpc_task *task) if (status == 0) return; if (status != -ENOMEM) { - rpc_exit(task, status); + rpc_call_rpcerror(task, status); return; } @@ -1790,7 +1804,7 @@ call_encode(struct rpc_task *task) task->tk_action = call_refresh; break; default: - rpc_exit(task, task->tk_status); + rpc_call_rpcerror(task, task->tk_status); } return; } else { @@ -1931,7 +1945,7 @@ call_bind_status(struct rpc_task *task) task->tk_pid, -task->tk_status); } - rpc_exit(task, status); + rpc_call_rpcerror(task, status); return; retry_timeout: @@ -1966,7 +1980,7 @@ call_connect(struct rpc_task *task) if (task->tk_status < 0) return; if (task->tk_flags & RPC_TASK_NOCONNECT) { - rpc_exit(task, -ENOTCONN); + rpc_call_rpcerror(task, -ENOTCONN); return; } if (!xprt_prepare_transmit(task)) @@ -2026,7 +2040,7 @@ call_connect_status(struct rpc_task *task) task->tk_action = call_transmit; return; } - rpc_exit(task, status); + rpc_call_rpcerror(task, status); return; out_retry: /* Check for timeouts before looping back to call_bind */ @@ -2111,7 +2125,7 @@ call_transmit_status(struct rpc_task *task) if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, task->tk_status); - rpc_exit(task, task->tk_status); + rpc_call_rpcerror(task, task->tk_status); return; } /* fall through */ @@ -2275,7 +2289,7 @@ call_status(struct rpc_task *task) rpc_check_timeout(task); return; out_exit: - rpc_exit(task, status); + rpc_call_rpcerror(task, status); } static bool @@ -2299,7 +2313,7 @@ rpc_check_timeout(struct rpc_task *task) task->tk_timeouts++; if (RPC_IS_SOFTCONN(task) && !rpc_check_connected(task->tk_rqstp)) { - rpc_exit(task, -ETIMEDOUT); + rpc_call_rpcerror(task, -ETIMEDOUT); return; } @@ -2310,9 +2324,9 @@ rpc_check_timeout(struct rpc_task *task) task->tk_xprt->servername); } if (task->tk_flags & RPC_TASK_TIMEOUT) - rpc_exit(task, -ETIMEDOUT); + rpc_call_rpcerror(task, -ETIMEDOUT); else - rpc_exit(task, -EIO); + __rpc_call_rpcerror(task, -EIO, -ETIMEDOUT); return; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index b4b4b8db143c..c69951ed2ebc 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2017,6 +2017,7 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) * we'll need to figure out how to pass a namespace to * connect. */ + task->tk_rpc_status = -ENOTCONN; rpc_exit(task, -ENOTCONN); return; } From e4ec48d3cc6139f4c1a934ff25d440cd4d50279f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:55 -0400 Subject: [PATCH 12/69] SUNRPC: Make "no retrans timeout" soft tasks behave like softconn for timeouts If a soft NFSv4 request is sent, then we don't need it to time out unless the connection breaks. The reason is that as long as the connection is unbroken, the protocol states that the server is not allowed to drop the request. IOW: as long as the connection remains unbroken, the client may assume that all transmitted RPC requests are being processed by the server, and that retransmissions and timeouts of those requests are unwarranted. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 315f9c9cb72d..43d6815f7391 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2318,6 +2318,15 @@ rpc_check_timeout(struct rpc_task *task) } if (RPC_IS_SOFT(task)) { + /* + * Once a "no retrans timeout" soft tasks (a.k.a NFSv4) has + * been sent, it should time out only if the transport + * connection gets terminally broken. + */ + if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) && + rpc_check_connected(task->tk_rqstp)) + return; + if (clnt->cl_chatty) { printk(KERN_NOTICE "%s: server %s not responding, timed out\n", clnt->cl_program->name, From da953063bdce465d941751d981e8d3ac5e92906c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:56 -0400 Subject: [PATCH 13/69] SUNRPC: Start the first major timeout calculation at task creation When calculating the major timeout for a new task, when we know that the connection has been broken, use the task->tk_start to ensure that we also take into account the time spent waiting for a slot or session slot. This ensures that we fail over soft requests relatively quickly once the connection has actually been broken, and the first requests have started to fail. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 642cc0f64e44..bc1c8247750d 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -569,18 +569,44 @@ bool xprt_write_space(struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(xprt_write_space); -static void xprt_reset_majortimeo(struct rpc_rqst *req) +static unsigned long xprt_abs_ktime_to_jiffies(ktime_t abstime) +{ + s64 delta = ktime_to_ns(ktime_get() - abstime); + return likely(delta >= 0) ? + jiffies - nsecs_to_jiffies(delta) : + jiffies + nsecs_to_jiffies(-delta); +} + +static unsigned long xprt_calc_majortimeo(struct rpc_rqst *req) { const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout; + unsigned long majortimeo = req->rq_timeout; - req->rq_majortimeo = req->rq_timeout; if (to->to_exponential) - req->rq_majortimeo <<= to->to_retries; + majortimeo <<= to->to_retries; else - req->rq_majortimeo += to->to_increment * to->to_retries; - if (req->rq_majortimeo > to->to_maxval || req->rq_majortimeo == 0) - req->rq_majortimeo = to->to_maxval; - req->rq_majortimeo += jiffies; + majortimeo += to->to_increment * to->to_retries; + if (majortimeo > to->to_maxval || majortimeo == 0) + majortimeo = to->to_maxval; + return majortimeo; +} + +static void xprt_reset_majortimeo(struct rpc_rqst *req) +{ + req->rq_majortimeo += xprt_calc_majortimeo(req); +} + +static void xprt_init_majortimeo(struct rpc_task *task, struct rpc_rqst *req) +{ + unsigned long time_init; + struct rpc_xprt *xprt = req->rq_xprt; + + if (likely(xprt && xprt_connected(xprt))) + time_init = jiffies; + else + time_init = xprt_abs_ktime_to_jiffies(task->tk_start); + req->rq_timeout = task->tk_client->cl_timeout->to_initval; + req->rq_majortimeo = time_init + xprt_calc_majortimeo(req); } /** @@ -997,7 +1023,6 @@ xprt_request_enqueue_receive(struct rpc_task *task) set_bit(RPC_TASK_NEED_RECV, &task->tk_runstate); spin_unlock(&xprt->queue_lock); - xprt_reset_majortimeo(req); /* Turn off autodisconnect */ del_singleshot_timer_sync(&xprt->timer); } @@ -1631,7 +1656,6 @@ xprt_request_init(struct rpc_task *task) struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req = task->tk_rqstp; - req->rq_timeout = task->tk_client->cl_timeout->to_initval; req->rq_task = task; req->rq_xprt = xprt; req->rq_buffer = NULL; @@ -1644,7 +1668,7 @@ xprt_request_init(struct rpc_task *task) req->rq_snd_buf.bvec = NULL; req->rq_rcv_buf.bvec = NULL; req->rq_release_snd_buf = NULL; - xprt_reset_majortimeo(req); + xprt_init_majortimeo(task, req); dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, req, ntohl(req->rq_xid)); } From 0729d995f2a2726598642d552ebe916b43aef73d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:57 -0400 Subject: [PATCH 14/69] SUNRPC: Ensure to ratelimit the "server not responding" syslog messages In particular, the timeout messages can be very noisy, so we ought to ratelimit them in order to avoid spamming the syslog. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 43d6815f7391..b16322eb6c42 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2328,7 +2328,8 @@ rpc_check_timeout(struct rpc_task *task) return; if (clnt->cl_chatty) { - printk(KERN_NOTICE "%s: server %s not responding, timed out\n", + pr_notice_ratelimited( + "%s: server %s not responding, timed out\n", clnt->cl_program->name, task->tk_xprt->servername); } @@ -2342,9 +2343,10 @@ rpc_check_timeout(struct rpc_task *task) if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) { task->tk_flags |= RPC_CALL_MAJORSEEN; if (clnt->cl_chatty) { - printk(KERN_NOTICE "%s: server %s not responding, still trying\n", - clnt->cl_program->name, - task->tk_xprt->servername); + pr_notice_ratelimited( + "%s: server %s not responding, still trying\n", + clnt->cl_program->name, + task->tk_xprt->servername); } } rpc_force_rebind(clnt); @@ -2374,7 +2376,7 @@ call_decode(struct rpc_task *task) if (task->tk_flags & RPC_CALL_MAJORSEEN) { if (clnt->cl_chatty) { - printk(KERN_NOTICE "%s: server %s OK\n", + pr_notice_ratelimited("%s: server %s OK\n", clnt->cl_program->name, task->tk_xprt->servername); } From ae6ec918474597a13a2648c54b6f12fb8cf0a55e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:58 -0400 Subject: [PATCH 15/69] SUNRPC: Add the 'softerr' rpc_client flag Add the 'softerr' rpc client flag that sets the RPC_TASK_TIMEOUT flag on all new rpc tasks that are attached to that rpc client. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 2 ++ net/sunrpc/clnt.c | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 98bc9883b230..943762acfcd4 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -50,6 +50,7 @@ struct rpc_clnt { struct rpc_iostats * cl_metrics; /* per-client statistics */ unsigned int cl_softrtry : 1,/* soft timeouts */ + cl_softerr : 1,/* Timeouts return errors */ cl_discrtry : 1,/* disconnect before retry */ cl_noretranstimeo: 1,/* No retransmit timeouts */ cl_autobind : 1,/* use getport() */ @@ -144,6 +145,7 @@ struct rpc_add_xprt_test { #define RPC_CLNT_CREATE_INFINITE_SLOTS (1UL << 7) #define RPC_CLNT_CREATE_NO_IDLE_TIMEOUT (1UL << 8) #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT (1UL << 9) +#define RPC_CLNT_CREATE_SOFTERR (1UL << 10) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b16322eb6c42..e933f1185317 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -484,8 +484,11 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, } clnt->cl_softrtry = 1; - if (args->flags & RPC_CLNT_CREATE_HARDRTRY) + if (args->flags & (RPC_CLNT_CREATE_HARDRTRY|RPC_CLNT_CREATE_SOFTERR)) { clnt->cl_softrtry = 0; + if (args->flags & RPC_CLNT_CREATE_SOFTERR) + clnt->cl_softerr = 1; + } if (args->flags & RPC_CLNT_CREATE_AUTOBIND) clnt->cl_autobind = 1; @@ -623,6 +626,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, /* Turn off autobind on clones */ new->cl_autobind = 0; new->cl_softrtry = clnt->cl_softrtry; + new->cl_softerr = clnt->cl_softerr; new->cl_noretranstimeo = clnt->cl_noretranstimeo; new->cl_discrtry = clnt->cl_discrtry; new->cl_chatty = clnt->cl_chatty; @@ -1001,6 +1005,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) atomic_inc(&clnt->cl_count); if (clnt->cl_softrtry) task->tk_flags |= RPC_TASK_SOFT; + if (clnt->cl_softerr) + task->tk_flags |= RPC_TASK_TIMEOUT; if (clnt->cl_noretranstimeo) task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; if (atomic_read(&clnt->cl_swapper)) From 11982a7c0f654ededc776bc81d8a0c51d3ee8d1c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:58:59 -0400 Subject: [PATCH 16/69] NFS: Consider ETIMEDOUT to be a fatal error When we introduce the 'softerr' mount option, we will see the RPC layer returning ETIMEDOUT errors if the server is unresponsive. We want to consider those errors to be fatal on par with the EIO errors that are returned by ordinary 'soft' timeouts.. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/internal.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index c7cf23ae6597..3cefd0ed01be 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -766,6 +766,7 @@ static inline bool nfs_error_is_fatal(int err) case -ESTALE: case -E2BIG: case -ENOMEM: + case -ETIMEDOUT: return true; default: return false; From 7b1355b615c67e8fcbfb508e6baee83aed9dee96 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:59:00 -0400 Subject: [PATCH 17/69] NFS: Move internal constants out of uapi/linux/nfs_mount.h When the label says "for internal use only", then it doesn't belong in the 'uapi' subtree. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 9 +++++++++ include/uapi/linux/nfs_mount.h | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index c827d31298cc..013ac5b54a09 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -139,6 +139,15 @@ struct nfs_server { struct nfs_iostats __percpu *io_stats; /* I/O statistics */ atomic_long_t writeback; /* number of writeback pages */ int flags; /* various flags */ + +/* The following are for internal use only. Also see uapi/linux/nfs_mount.h */ +#define NFS_MOUNT_LOOKUP_CACHE_NONEG 0x10000 +#define NFS_MOUNT_LOOKUP_CACHE_NONE 0x20000 +#define NFS_MOUNT_NORESVPORT 0x40000 +#define NFS_MOUNT_LEGACY_INTERFACE 0x80000 +#define NFS_MOUNT_LOCAL_FLOCK 0x100000 +#define NFS_MOUNT_LOCAL_FCNTL 0x200000 + unsigned int caps; /* server capabilities */ unsigned int rsize; /* read size */ unsigned int rpages; /* read size (in pages) */ diff --git a/include/uapi/linux/nfs_mount.h b/include/uapi/linux/nfs_mount.h index e44e00616ab5..e3bcfc6aa3b0 100644 --- a/include/uapi/linux/nfs_mount.h +++ b/include/uapi/linux/nfs_mount.h @@ -66,13 +66,4 @@ struct nfs_mount_data { #define NFS_MOUNT_UNSHARED 0x8000 /* 5 */ #define NFS_MOUNT_FLAGMASK 0xFFFF -/* The following are for internal use only */ -#define NFS_MOUNT_LOOKUP_CACHE_NONEG 0x10000 -#define NFS_MOUNT_LOOKUP_CACHE_NONE 0x20000 -#define NFS_MOUNT_NORESVPORT 0x40000 -#define NFS_MOUNT_LEGACY_INTERFACE 0x80000 - -#define NFS_MOUNT_LOCAL_FLOCK 0x100000 -#define NFS_MOUNT_LOCAL_FCNTL 0x200000 - #endif From 91a575e1a98451d12df713f267a9a210a9e5dcf9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:59:01 -0400 Subject: [PATCH 18/69] NFS: Add a mount option "softerr" to allow clients to see ETIMEDOUT errors Add a mount option that exposes the ETIMEDOUT errors that occur during soft timeouts to the application. This allows aware applications to distinguish between server disk IO errors and client timeout errors. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 2 ++ fs/nfs/super.c | 15 ++++++++++++--- include/linux/nfs_fs_sb.h | 1 + 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 90d71fda65ce..f74638c5e5b4 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -598,6 +598,8 @@ int nfs_init_server_rpcclient(struct nfs_server *server, sizeof(server->client->cl_timeout_default)); server->client->cl_timeout = &server->client->cl_timeout_default; server->client->cl_softrtry = 0; + if (server->flags & NFS_MOUNT_SOFTERR) + server->client->cl_softerr = 1; if (server->flags & NFS_MOUNT_SOFT) server->client->cl_softrtry = 1; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index c27ac96a95bd..19783e8ba9fb 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -78,7 +78,7 @@ enum { /* Mount options that take no arguments */ - Opt_soft, Opt_hard, + Opt_soft, Opt_softerr, Opt_hard, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, Opt_noac, @@ -125,6 +125,7 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_sloppy, "sloppy" }, { Opt_soft, "soft" }, + { Opt_softerr, "softerr" }, { Opt_hard, "hard" }, { Opt_deprecated, "intr" }, { Opt_deprecated, "nointr" }, @@ -628,7 +629,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, const char *str; const char *nostr; } nfs_info[] = { - { NFS_MOUNT_SOFT, ",soft", ",hard" }, + { NFS_MOUNT_SOFT, ",soft", "" }, + { NFS_MOUNT_SOFTERR, ",softerr", "" }, { NFS_MOUNT_POSIX, ",posix", "" }, { NFS_MOUNT_NOCTO, ",nocto", "" }, { NFS_MOUNT_NOAC, ",noac", "" }, @@ -658,6 +660,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ); if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults) seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ); + if (!(nfss->flags & (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))) + seq_puts(m, ",hard"); for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { if (nfss->flags & nfs_infop->flag) seq_puts(m, nfs_infop->str); @@ -1239,9 +1243,14 @@ static int nfs_parse_mount_options(char *raw, */ case Opt_soft: mnt->flags |= NFS_MOUNT_SOFT; + mnt->flags &= ~NFS_MOUNT_SOFTERR; + break; + case Opt_softerr: + mnt->flags |= NFS_MOUNT_SOFTERR; + mnt->flags &= ~NFS_MOUNT_SOFT; break; case Opt_hard: - mnt->flags &= ~NFS_MOUNT_SOFT; + mnt->flags &= ~(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR); break; case Opt_posix: mnt->flags |= NFS_MOUNT_POSIX; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 013ac5b54a09..0fbc5d3c5e53 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -147,6 +147,7 @@ struct nfs_server { #define NFS_MOUNT_LEGACY_INTERFACE 0x80000 #define NFS_MOUNT_LOCAL_FLOCK 0x100000 #define NFS_MOUNT_LOCAL_FCNTL 0x200000 +#define NFS_MOUNT_SOFTERR 0x400000 unsigned int caps; /* server capabilities */ unsigned int rsize; /* read size */ From 14bebe3c90b326d2a0df78aed5e9de090c71d878 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:59:02 -0400 Subject: [PATCH 19/69] NFS: Don't interrupt file writeout due to fatal errors When flushing out dirty pages, the fact that we may hit fatal errors is not a reason to stop writeback. Those errors are reported through fsync(), not through the flush mechanism. Fixes: a6598813a4c5b ("NFS: Don't write back further requests if there...") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f3ebabaa291d..9198a23bb58b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -647,7 +647,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, return ret; out_launder: nfs_write_error_remove_page(req); - return ret; + return 0; } static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, From 22876f540bdf19af9e4fca893ce02ba7ee65ebcc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:59:03 -0400 Subject: [PATCH 20/69] NFS: Don't call generic_error_remove_page() while holding locks The NFS read code can trigger writeback while holding the page lock. If an error then triggers a call to nfs_write_error_remove_page(), we can deadlock. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9198a23bb58b..64cf6a340ba6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -584,9 +584,8 @@ nfs_lock_and_join_requests(struct page *page) static void nfs_write_error_remove_page(struct nfs_page *req) { + SetPageError(req->wb_page); nfs_end_page_writeback(req); - generic_error_remove_page(page_file_mapping(req->wb_page), - req->wb_page); nfs_release_request(req); } From aded8d7b54f250af6deb72fde475291cfba513d1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:59:04 -0400 Subject: [PATCH 21/69] NFS: Don't inadvertently clear writeback errors vfs_fsync() has the side effect of clearing unreported writeback errors, so we need to make sure that we do not abuse it in situations where applications might not normally expect us to report those errors. The solution is to replace calls to vfs_fsync() with calls to nfs_wb_all(). Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/file.c | 6 +++--- fs/nfs/nfs4file.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 4899b85f9b3c..f807e8643ae6 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -147,7 +147,7 @@ nfs_file_flush(struct file *file, fl_owner_t id) return 0; /* Flush writes to the server and return any errors */ - return vfs_fsync(file, 0); + return nfs_wb_all(inode); } ssize_t @@ -655,7 +655,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) /* Return error values */ if (nfs_need_check_write(file, inode)) { - int err = vfs_fsync(file, 0); + int err = nfs_wb_all(inode); if (err < 0) result = err; } @@ -709,7 +709,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * Flush all pending writes before doing anything * with locks.. */ - vfs_fsync(filp, 0); + nfs_wb_all(inode); l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); if (!IS_ERR(l_ctx)) { diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 00d17198ee12..d2846f164997 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -125,7 +125,7 @@ nfs4_file_flush(struct file *file, fl_owner_t id) return filemap_fdatawrite(file->f_mapping); /* Flush writes to the server and return any errors */ - return vfs_fsync(file, 0); + return nfs_wb_all(inode); } #ifdef CONFIG_NFS_V4_2 From 6fbda89b257f25694bf4892ddbbaa472f581533b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:59:05 -0400 Subject: [PATCH 22/69] NFS: Replace custom error reporting mechanism with generic one Replace the NFS custom error reporting mechanism with the generic mapping_set_error(). Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/file.c | 25 ++++--------------------- fs/nfs/internal.h | 6 ------ fs/nfs/write.c | 40 ++++++++++++++++++++++------------------ include/linux/nfs_fs.h | 1 - 4 files changed, 26 insertions(+), 46 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f807e8643ae6..144e183250c3 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -199,13 +199,6 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap); * Flush any dirty pages for this process, and check for write errors. * The return status from this call provides a reliable indication of * whether any write errors occurred for this process. - * - * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to - * disk, but it retrieves and clears ctx->error after synching, despite - * the two being set at the same time in nfs_context_set_write_error(). - * This is because the former is used to notify the _next_ call to - * nfs_file_write() that a write error occurred, and hence cause it to - * fall back to doing a synchronous write. */ static int nfs_file_fsync_commit(struct file *file, int datasync) @@ -220,11 +213,8 @@ nfs_file_fsync_commit(struct file *file, int datasync) nfs_inc_stats(inode, NFSIOS_VFSFSYNC); do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); status = nfs_commit_inode(inode, FLUSH_SYNC); - if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) { - ret = xchg(&ctx->error, 0); - if (ret) - goto out; - } + if (status == 0) + status = file_check_and_advance_wb_err(file); if (status < 0) { ret = status; goto out; @@ -245,13 +235,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) trace_nfs_fsync_enter(inode); do { - struct nfs_open_context *ctx = nfs_file_open_context(file); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) { - int ret2 = xchg(&ctx->error, 0); - if (ret2) - ret = ret2; - } + ret = file_write_and_wait_range(file, start, end); if (ret != 0) break; ret = nfs_file_fsync_commit(file, datasync); @@ -600,8 +584,7 @@ static int nfs_need_check_write(struct file *filp, struct inode *inode) struct nfs_open_context *ctx; ctx = nfs_file_open_context(filp); - if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || - nfs_ctx_key_to_expire(ctx, inode)) + if (nfs_ctx_key_to_expire(ctx, inode)) return 1; return 0; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 3cefd0ed01be..196534634c3a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -773,9 +773,3 @@ static inline bool nfs_error_is_fatal(int err) } } -static inline void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) -{ - ctx->error = error; - smp_wmb(); - set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); -} diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 64cf6a340ba6..03cde38ecd31 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -244,6 +244,12 @@ static void nfs_set_pageerror(struct address_space *mapping) nfs_zap_mapping(mapping->host, mapping); } +static void nfs_mapping_set_error(struct page *page, int error) +{ + SetPageError(page); + mapping_set_error(page_file_mapping(page), error); +} + /* * nfs_page_group_search_locked * @head - head request of page group @@ -582,9 +588,9 @@ nfs_lock_and_join_requests(struct page *page) return ERR_PTR(ret); } -static void nfs_write_error_remove_page(struct nfs_page *req) +static void nfs_write_error(struct nfs_page *req, int error) { - SetPageError(req->wb_page); + nfs_mapping_set_error(req->wb_page, error); nfs_end_page_writeback(req); nfs_release_request(req); } @@ -608,6 +614,7 @@ nfs_error_is_fatal_on_server(int err) static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, struct page *page) { + struct address_space *mapping; struct nfs_page *req; int ret = 0; @@ -621,19 +628,19 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, nfs_set_page_writeback(page); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); - ret = req->wb_context->error; /* If there is a fatal error that covers this write, just exit */ - if (nfs_error_is_fatal_on_server(ret)) + ret = 0; + mapping = page_file_mapping(page); + if (test_bit(AS_ENOSPC, &mapping->flags) || + test_bit(AS_EIO, &mapping->flags)) goto out_launder; - ret = 0; if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* * Remove the problematic req upon fatal errors on the server */ if (nfs_error_is_fatal(ret)) { - nfs_context_set_write_error(req->wb_context, ret); if (nfs_error_is_fatal_on_server(ret)) goto out_launder; } else @@ -645,7 +652,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, out: return ret; out_launder: - nfs_write_error_remove_page(req); + nfs_write_error(req, ret); return 0; } @@ -998,7 +1005,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes < bytes)) { nfs_set_pageerror(page_file_mapping(req->wb_page)); - nfs_context_set_write_error(req->wb_context, hdr->error); + nfs_mapping_set_error(req->wb_page, hdr->error); goto remove_req; } if (nfs_write_need_commit(hdr)) { @@ -1422,14 +1429,10 @@ static void nfs_async_write_error(struct list_head *head, int error) while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - if (nfs_error_is_fatal(error)) { - nfs_context_set_write_error(req->wb_context, error); - if (nfs_error_is_fatal_on_server(error)) { - nfs_write_error_remove_page(req); - continue; - } - } - nfs_redirty_request(req); + if (nfs_error_is_fatal(error)) + nfs_write_error(req, error); + else + nfs_redirty_request(req); } } @@ -1843,9 +1846,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) req->wb_bytes, (long long)req_offset(req)); if (status < 0) { - nfs_context_set_write_error(req->wb_context, status); - if (req->wb_page) + if (req->wb_page) { + nfs_mapping_set_error(req->wb_page, status); nfs_inode_remove_request(req); + } dprintk_cont(", error = %d\n", status); goto next; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 40e30376130b..d363d5765cdf 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -76,7 +76,6 @@ struct nfs_open_context { fmode_t mode; unsigned long flags; -#define NFS_CONTEXT_ERROR_WRITE (0) #define NFS_CONTEXT_RESEND_WRITES (1) #define NFS_CONTEXT_BAD (2) #define NFS_CONTEXT_UNLOCK (3) From c917cfaf9bbef44dd35b75b8fb772a44798a1cf2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:59:06 -0400 Subject: [PATCH 23/69] NFS: Fix up NFS I/O subrequest creation We require all NFS I/O subrequests to duplicate the lock context as well as the open context. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pagelist.c | 105 +++++++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 44 deletions(-) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index e9f39fa5964b..66a5c5d4a777 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -295,6 +295,42 @@ nfs_page_group_destroy(struct kref *kref) nfs_release_request(head); } +static struct nfs_page * +__nfs_create_request(struct nfs_lock_context *l_ctx, struct page *page, + struct nfs_page *last, unsigned int pgbase, + unsigned int offset, unsigned int count) +{ + struct nfs_page *req; + struct nfs_open_context *ctx = l_ctx->open_context; + + if (test_bit(NFS_CONTEXT_BAD, &ctx->flags)) + return ERR_PTR(-EBADF); + /* try to allocate the request struct */ + req = nfs_page_alloc(); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + req->wb_lock_context = l_ctx; + refcount_inc(&l_ctx->count); + atomic_inc(&l_ctx->io_count); + + /* Initialize the request struct. Initially, we assume a + * long write-back delay. This will be adjusted in + * update_nfs_request below if the region is not locked. */ + req->wb_page = page; + if (page) { + req->wb_index = page_index(page); + get_page(page); + } + req->wb_offset = offset; + req->wb_pgbase = pgbase; + req->wb_bytes = count; + req->wb_context = get_nfs_open_context(ctx); + kref_init(&req->wb_kref); + nfs_page_group_init(req, last); + return req; +} + /** * nfs_create_request - Create an NFS read/write request. * @ctx: open context to use @@ -312,40 +348,30 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, struct nfs_page *last, unsigned int offset, unsigned int count) { - struct nfs_page *req; - struct nfs_lock_context *l_ctx; + struct nfs_lock_context *l_ctx = nfs_get_lock_context(ctx); + struct nfs_page *ret; - if (test_bit(NFS_CONTEXT_BAD, &ctx->flags)) - return ERR_PTR(-EBADF); - /* try to allocate the request struct */ - req = nfs_page_alloc(); - if (req == NULL) - return ERR_PTR(-ENOMEM); - - /* get lock context early so we can deal with alloc failures */ - l_ctx = nfs_get_lock_context(ctx); - if (IS_ERR(l_ctx)) { - nfs_page_free(req); + if (IS_ERR(l_ctx)) return ERR_CAST(l_ctx); - } - req->wb_lock_context = l_ctx; - atomic_inc(&l_ctx->io_count); + ret = __nfs_create_request(l_ctx, page, last, offset, offset, count); + nfs_put_lock_context(l_ctx); + return ret; +} - /* Initialize the request struct. Initially, we assume a - * long write-back delay. This will be adjusted in - * update_nfs_request below if the region is not locked. */ - req->wb_page = page; - if (page) { - req->wb_index = page_index(page); - get_page(page); +static struct nfs_page * +nfs_create_subreq(struct nfs_page *req, struct nfs_page *last, + unsigned int pgbase, unsigned int offset, + unsigned int count) +{ + struct nfs_page *ret; + + ret = __nfs_create_request(req->wb_lock_context, req->wb_page, last, + pgbase, offset, count); + if (!IS_ERR(ret)) { + nfs_lock_request(ret); + ret->wb_index = req->wb_index; } - req->wb_offset = offset; - req->wb_pgbase = offset; - req->wb_bytes = count; - req->wb_context = get_nfs_open_context(ctx); - kref_init(&req->wb_kref); - nfs_page_group_init(req, last); - return req; + return ret; } /** @@ -1049,14 +1075,10 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, pgbase += subreq->wb_bytes; if (bytes_left) { - subreq = nfs_create_request(req->wb_context, - req->wb_page, - subreq, pgbase, bytes_left); + subreq = nfs_create_subreq(req, subreq, pgbase, + offset, bytes_left); if (IS_ERR(subreq)) goto err_ptr; - nfs_lock_request(subreq); - subreq->wb_offset = offset; - subreq->wb_index = req->wb_index; } } while (bytes_left > 0); @@ -1158,19 +1180,14 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, lastreq = lastreq->wb_this_page) ; - dupreq = nfs_create_request(req->wb_context, - req->wb_page, lastreq, pgbase, bytes); + dupreq = nfs_create_subreq(req, lastreq, + pgbase, offset, bytes); + nfs_page_group_unlock(req); if (IS_ERR(dupreq)) { - nfs_page_group_unlock(req); desc->pg_error = PTR_ERR(dupreq); goto out_failed; } - - nfs_lock_request(dupreq); - nfs_page_group_unlock(req); - dupreq->wb_offset = offset; - dupreq->wb_index = req->wb_index; } else dupreq = req; From 28b1d3f5a772b705ca76df620eb9f686aa2d0b4c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:59:07 -0400 Subject: [PATCH 24/69] NFS: Remove unused argument from nfs_create_request() All the callers of nfs_create_request() are now creating page group heads, so we can remove the redundant 'last' page argument. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/direct.c | 4 ++-- fs/nfs/pagelist.c | 16 ++++++++-------- fs/nfs/read.c | 4 ++-- fs/nfs/write.c | 2 +- include/linux/nfs_page.h | 1 - 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0fd811ac08b5..2d301a1a73e2 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -492,7 +492,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); /* XXX do we need to do the eof zeroing found in async_filler? */ - req = nfs_create_request(dreq->ctx, pagevec[i], NULL, + req = nfs_create_request(dreq->ctx, pagevec[i], pgbase, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); @@ -899,7 +899,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); - req = nfs_create_request(dreq->ctx, pagevec[i], NULL, + req = nfs_create_request(dreq->ctx, pagevec[i], pgbase, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 66a5c5d4a777..b8301c40dd78 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -297,8 +297,8 @@ nfs_page_group_destroy(struct kref *kref) static struct nfs_page * __nfs_create_request(struct nfs_lock_context *l_ctx, struct page *page, - struct nfs_page *last, unsigned int pgbase, - unsigned int offset, unsigned int count) + unsigned int pgbase, unsigned int offset, + unsigned int count) { struct nfs_page *req; struct nfs_open_context *ctx = l_ctx->open_context; @@ -327,7 +327,6 @@ __nfs_create_request(struct nfs_lock_context *l_ctx, struct page *page, req->wb_bytes = count; req->wb_context = get_nfs_open_context(ctx); kref_init(&req->wb_kref); - nfs_page_group_init(req, last); return req; } @@ -335,7 +334,6 @@ __nfs_create_request(struct nfs_lock_context *l_ctx, struct page *page, * nfs_create_request - Create an NFS read/write request. * @ctx: open context to use * @page: page to write - * @last: last nfs request created for this page group or NULL if head * @offset: starting offset within the page for the write * @count: number of bytes to read/write * @@ -345,15 +343,16 @@ __nfs_create_request(struct nfs_lock_context *l_ctx, struct page *page, */ struct nfs_page * nfs_create_request(struct nfs_open_context *ctx, struct page *page, - struct nfs_page *last, unsigned int offset, - unsigned int count) + unsigned int offset, unsigned int count) { struct nfs_lock_context *l_ctx = nfs_get_lock_context(ctx); struct nfs_page *ret; if (IS_ERR(l_ctx)) return ERR_CAST(l_ctx); - ret = __nfs_create_request(l_ctx, page, last, offset, offset, count); + ret = __nfs_create_request(l_ctx, page, offset, offset, count); + if (!IS_ERR(ret)) + nfs_page_group_init(ret, NULL); nfs_put_lock_context(l_ctx); return ret; } @@ -365,11 +364,12 @@ nfs_create_subreq(struct nfs_page *req, struct nfs_page *last, { struct nfs_page *ret; - ret = __nfs_create_request(req->wb_lock_context, req->wb_page, last, + ret = __nfs_create_request(req->wb_lock_context, req->wb_page, pgbase, offset, count); if (!IS_ERR(ret)) { nfs_lock_request(ret); ret->wb_index = req->wb_index; + nfs_page_group_init(ret, last); } return ret; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 1d95a60b2586..fad1333dbf71 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -118,7 +118,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(ctx, page, NULL, 0, len); + new = nfs_create_request(ctx, page, 0, len); if (IS_ERR(new)) { unlock_page(page); return PTR_ERR(new); @@ -363,7 +363,7 @@ readpage_async_filler(void *data, struct page *page) if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(desc->ctx, page, NULL, 0, len); + new = nfs_create_request(desc->ctx, page, 0, len); if (IS_ERR(new)) goto out_error; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 03cde38ecd31..b9bcbd06a628 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1171,7 +1171,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, req = nfs_try_to_update_request(inode, page, offset, bytes); if (req != NULL) goto out; - req = nfs_create_request(ctx, page, NULL, offset, bytes); + req = nfs_create_request(ctx, page, offset, bytes); if (IS_ERR(req)) goto out; nfs_inode_add_request(inode, req); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index ad69430fd0eb..b7d0f15615c2 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -114,7 +114,6 @@ struct nfs_pageio_descriptor { extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx, struct page *page, - struct nfs_page *last, unsigned int offset, unsigned int count); extern void nfs_release_request(struct nfs_page *); From 33344e0f7eaa2efbf9fcc55557d02e8603aa7012 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:59:08 -0400 Subject: [PATCH 25/69] pNFS: Add tracking to limit the number of pNFS retries When the client is reading or writing using pNFS, and hits an error on the DS, then it typically sends a LAYOUTERROR and/or LAYOUTRETURN to the MDS, before redirtying the failed pages, and going for a new round of reads/writebacks. The problem is that if the server has no way to fix the DS, then we may need a way to interrupt this loop after a set number of attempts have been made. This patch adds an optional module parameter that allows the admin to specify how many times to retry the read/writeback process before failing with a fatal error. The default behaviour is to retry forever. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/direct.c | 7 +++++++ fs/nfs/flexfilelayout/flexfilelayout.c | 8 ++++++++ fs/nfs/pagelist.c | 14 +++++++++++++- fs/nfs/write.c | 5 +++++ include/linux/nfs_page.h | 4 +++- 5 files changed, 36 insertions(+), 2 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 2d301a1a73e2..2436bd92bc00 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -663,6 +663,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) } list_for_each_entry_safe(req, tmp, &reqs, wb_list) { + /* Bump the transmission count */ + req->wb_nio++; if (!nfs_pageio_add_request(&desc, req)) { nfs_list_move_request(req, &failed); spin_lock(&cinfo.inode->i_lock); @@ -703,6 +705,11 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) { + /* + * Despite the reboot, the write was successful, + * so reset wb_nio. + */ + req->wb_nio = 0; /* Note the rewrite will go through mds */ nfs_mark_request_commit(req, NULL, &cinfo, 0); } else diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 6673d4ff5a2a..9fdbcfd3e39d 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -28,6 +28,8 @@ #define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) #define FF_LAYOUTRETURN_MAXERR 20 +static unsigned short io_maxretrans; + static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr); static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, @@ -925,6 +927,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, pgm = &pgio->pg_mirrors[0]; pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; + pgio->pg_maxretrans = io_maxretrans; return; out_nolseg: if (pgio->pg_error < 0) @@ -992,6 +995,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; } + pgio->pg_maxretrans = io_maxretrans; return; out_mds: @@ -2515,3 +2519,7 @@ MODULE_DESCRIPTION("The NFSv4 flexfile layout driver"); module_init(nfs4flexfilelayout_init); module_exit(nfs4flexfilelayout_exit); + +module_param(io_maxretrans, ushort, 0644); +MODULE_PARM_DESC(io_maxretrans, "The number of times the NFSv4.1 client " + "retries an I/O request before returning an error. "); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b8301c40dd78..4a31284f411e 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -16,8 +16,8 @@ #include #include #include -#include #include +#include #include #include @@ -327,6 +327,7 @@ __nfs_create_request(struct nfs_lock_context *l_ctx, struct page *page, req->wb_bytes = count; req->wb_context = get_nfs_open_context(ctx); kref_init(&req->wb_kref); + req->wb_nio = 0; return req; } @@ -370,6 +371,7 @@ nfs_create_subreq(struct nfs_page *req, struct nfs_page *last, nfs_lock_request(ret); ret->wb_index = req->wb_index; nfs_page_group_init(ret, last); + ret->wb_nio = req->wb_nio; } return ret; } @@ -724,6 +726,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_mirrors_dynamic = NULL; desc->pg_mirrors = desc->pg_mirrors_static; nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize); + desc->pg_maxretrans = 0; } /** @@ -983,6 +986,15 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, return 0; mirror->pg_base = req->wb_pgbase; } + + if (desc->pg_maxretrans && req->wb_nio > desc->pg_maxretrans) { + if (NFS_SERVER(desc->pg_inode)->flags & NFS_MOUNT_SOFTERR) + desc->pg_error = -ETIMEDOUT; + else + desc->pg_error = -EIO; + return 0; + } + if (!nfs_can_coalesce_requests(prev, req, desc)) return 0; nfs_list_move_request(req, &mirror->pg_list); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b9bcbd06a628..294604784f70 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1009,6 +1009,8 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) goto remove_req; } if (nfs_write_need_commit(hdr)) { + /* Reset wb_nio, since the write was successful. */ + req->wb_nio = 0; memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo, hdr->pgio_mirror_idx); @@ -1142,6 +1144,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, req->wb_bytes = end - req->wb_offset; else req->wb_bytes = rqend - req->wb_offset; + req->wb_nio = 0; return req; out_flushme: /* @@ -1416,6 +1419,8 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr, */ static void nfs_redirty_request(struct nfs_page *req) { + /* Bump the transmission count */ + req->wb_nio++; nfs_mark_request_dirty(req); set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); nfs_end_page_writeback(req); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index b7d0f15615c2..8b36800d342d 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -53,6 +53,7 @@ struct nfs_page { struct nfs_write_verifier wb_verf; /* Commit cookie */ struct nfs_page *wb_this_page; /* list of reqs for this page */ struct nfs_page *wb_head; /* head pointer for req list */ + unsigned short wb_nio; /* Number of I/O attempts */ }; struct nfs_pageio_descriptor; @@ -87,7 +88,6 @@ struct nfs_pgio_mirror { }; struct nfs_pageio_descriptor { - unsigned char pg_moreio : 1; struct inode *pg_inode; const struct nfs_pageio_ops *pg_ops; const struct nfs_rw_ops *pg_rw_ops; @@ -105,6 +105,8 @@ struct nfs_pageio_descriptor { struct nfs_pgio_mirror pg_mirrors_static[1]; struct nfs_pgio_mirror *pg_mirrors_dynamic; u32 pg_mirror_idx; /* current mirror */ + unsigned short pg_maxretrans; + unsigned char pg_moreio : 1; }; /* arbitrarily selected limit to number of mirrors */ From 0688e64bc60038971253485d92bc3d6c816f915d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:59:09 -0400 Subject: [PATCH 26/69] NFS: Allow signal interruption of NFS4ERR_DELAYed operations If the server is unable to immediately execute an RPC call, and returns an NFS4ERR_DELAY then we can assume it is safe to interrupt the operation in order to handle ordinary signals. This allows the application to service timer interrupts that would otherwise have to wait until the server is again able to respond. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4proc.c | 139 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 106 insertions(+), 34 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 06ac3d9ac7c6..8a38a254f516 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -206,6 +206,7 @@ struct nfs4_exception { unsigned char delay : 1, recovering : 1, retry : 1; + bool interruptible; }; struct nfs4_state_recovery_ops { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 96b988689f0e..c29cbef6b53f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -400,17 +400,32 @@ static long nfs4_update_delay(long *timeout) return ret; } -static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) +static int nfs4_delay_killable(long *timeout) { - int res = 0; - might_sleep(); freezable_schedule_timeout_killable_unsafe( nfs4_update_delay(timeout)); - if (fatal_signal_pending(current)) - res = -ERESTARTSYS; - return res; + if (!__fatal_signal_pending(current)) + return 0; + return -EINTR; +} + +static int nfs4_delay_interruptible(long *timeout) +{ + might_sleep(); + + freezable_schedule_timeout_interruptible(nfs4_update_delay(timeout)); + if (!signal_pending(current)) + return 0; + return __fatal_signal_pending(current) ? -EINTR :-ERESTARTSYS; +} + +static int nfs4_delay(long *timeout, bool interruptible) +{ + if (interruptible) + return nfs4_delay_interruptible(timeout); + return nfs4_delay_killable(timeout); } /* This is the error handling routine for processes that are allowed @@ -546,7 +561,8 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ ret = nfs4_do_handle_exception(server, errorcode, exception); if (exception->delay) { - ret = nfs4_delay(server->client, &exception->timeout); + ret = nfs4_delay(&exception->timeout, + exception->interruptible); goto out_retry; } if (exception->recovering) { @@ -3066,7 +3082,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, int *opened) { struct nfs_server *server = NFS_SERVER(dir); - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; struct nfs4_state *res; struct nfs4_open_createattrs c = { .label = label, @@ -3679,7 +3697,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = nfs4_handle_exception(server, @@ -3721,7 +3741,9 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = _nfs4_lookup_root(server, fhandle, info); @@ -3948,7 +3970,9 @@ static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label, struct inode *inode) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = _nfs4_proc_getattr(server, fhandle, fattr, label, inode); @@ -4071,7 +4095,9 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; struct rpc_clnt *client = *clnt; int err; do { @@ -4175,7 +4201,9 @@ static int _nfs4_proc_lookupp(struct inode *inode, static int nfs4_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = _nfs4_proc_lookupp(inode, fhandle, fattr, label); @@ -4222,7 +4250,9 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = _nfs4_proc_access(inode, entry); @@ -4277,7 +4307,9 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page, static int nfs4_proc_readlink(struct inode *inode, struct page *page, unsigned int pgbase, unsigned int pglen) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = _nfs4_proc_readlink(inode, page, pgbase, pglen); @@ -4353,7 +4385,9 @@ _nfs4_proc_remove(struct inode *dir, const struct qstr *name, u32 ftype) static int nfs4_proc_remove(struct inode *dir, struct dentry *dentry) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; struct inode *inode = d_inode(dentry); int err; @@ -4374,7 +4408,9 @@ static int nfs4_proc_remove(struct inode *dir, struct dentry *dentry) static int nfs4_proc_rmdir(struct inode *dir, const struct qstr *name) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { @@ -4533,7 +4569,9 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct static int nfs4_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = nfs4_handle_exception(NFS_SERVER(inode), @@ -4640,7 +4678,9 @@ static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; struct nfs4_label l, *label = NULL; int err; @@ -4679,7 +4719,9 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct nfs_server *server = NFS_SERVER(dir); - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; struct nfs4_label l, *label = NULL; int err; @@ -4739,7 +4781,9 @@ static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, static int nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, u64 cookie, struct page **pages, unsigned int count, bool plus) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = _nfs4_proc_readdir(dentry, cred, cookie, @@ -4790,7 +4834,9 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { struct nfs_server *server = NFS_SERVER(dir); - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; struct nfs4_label l, *label = NULL; int err; @@ -4832,7 +4878,9 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = nfs4_handle_exception(server, @@ -4863,7 +4911,9 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; unsigned long now = jiffies; int err; @@ -4925,7 +4975,9 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_pathconf *pathconf) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { @@ -5494,7 +5546,9 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; ssize_t ret; do { ret = __nfs4_get_acl_uncached(inode, buf, buflen); @@ -5628,7 +5682,9 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, static int nfs4_get_security_label(struct inode *inode, void *buf, size_t buflen) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) @@ -6269,7 +6325,9 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { @@ -6833,6 +6891,7 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock * struct nfs4_exception exception = { .state = state, .inode = state->inode, + .interruptible = true, }; int err; @@ -7246,7 +7305,9 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, struct nfs4_fs_locations *fs_locations, struct page *page) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = _nfs4_proc_fs_locations(client, dir, name, @@ -7389,7 +7450,9 @@ int nfs4_proc_get_locations(struct inode *inode, struct nfs_client *clp = server->nfs_client; const struct nfs4_mig_recovery_ops *ops = clp->cl_mvops->mig_recovery_ops; - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int status; dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, @@ -7513,7 +7576,9 @@ int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred) struct nfs_client *clp = server->nfs_client; const struct nfs4_mig_recovery_ops *ops = clp->cl_mvops->mig_recovery_ops; - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int status; dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, @@ -7579,7 +7644,9 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = -NFS4ERR_WRONGSEC; @@ -9269,7 +9336,9 @@ static int nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { /* first try using integrity protection */ @@ -9436,7 +9505,9 @@ static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid, const struct cred *cred) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = _nfs41_test_stateid(server, stateid, cred); From 154945112dac10b7109d816275f3e4896b0b064e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:59:10 -0400 Subject: [PATCH 27/69] NFS: Ensure that all nfs lock contexts have a valid open context Force the lock context to keep a reference to the parent open context so that we can guarantee the validity of the latter. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/inode.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 414a90d48493..efc45f22c581 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -885,10 +885,14 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) spin_lock(&inode->i_lock); res = __nfs_find_lock_context(ctx); if (res == NULL) { - list_add_tail_rcu(&new->list, &ctx->lock_context.list); - new->open_context = ctx; - res = new; - new = NULL; + new->open_context = get_nfs_open_context(ctx); + if (new->open_context) { + list_add_tail_rcu(&new->list, + &ctx->lock_context.list); + res = new; + new = NULL; + } else + res = ERR_PTR(-EBADF); } spin_unlock(&inode->i_lock); kfree(new); @@ -906,6 +910,7 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx) return; list_del_rcu(&l_ctx->list); spin_unlock(&inode->i_lock); + put_nfs_open_context(ctx); kfree_rcu(l_ctx, rcu_head); } EXPORT_SYMBOL_GPL(nfs_put_lock_context); From 9fcd5960e88bbdc74a70d9e3a5ab46b489fc4b80 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:59:11 -0400 Subject: [PATCH 28/69] NFS: Add a helper to return a pointer to the open context of a struct nfs_page Add a helper for when we remove the explicit pointer to the open context. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/filelayout/filelayout.c | 4 ++-- fs/nfs/flexfilelayout/flexfilelayout.c | 6 +++--- fs/nfs/pagelist.c | 8 ++++---- fs/nfs/pnfs.c | 4 ++-- fs/nfs/pnfs.h | 4 ++-- fs/nfs/read.c | 2 +- fs/nfs/write.c | 20 +++++++++++--------- include/linux/nfs_page.h | 6 ++++++ 8 files changed, 31 insertions(+), 23 deletions(-) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 61f46facb39c..21d9f3bfbc81 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -917,7 +917,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, - req->wb_context, + nfs_req_openctx(req), 0, NFS4_MAX_UINT64, IOMODE_READ, @@ -944,7 +944,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, - req->wb_context, + nfs_req_openctx(req), 0, NFS4_MAX_UINT64, IOMODE_RW, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 9fdbcfd3e39d..9920c52bd0cd 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -873,7 +873,7 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, { pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, + nfs_req_openctx(req), 0, NFS4_MAX_UINT64, IOMODE_READ, @@ -953,7 +953,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, + nfs_req_openctx(req), 0, NFS4_MAX_UINT64, IOMODE_RW, @@ -1010,7 +1010,7 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, { if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, + nfs_req_openctx(req), 0, NFS4_MAX_UINT64, IOMODE_RW, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 4a31284f411e..ce6440b79328 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -47,7 +47,7 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, hdr->req = nfs_list_entry(mirror->pg_list.next); hdr->inode = desc->pg_inode; - hdr->cred = hdr->req->wb_context->cred; + hdr->cred = nfs_req_openctx(hdr->req)->cred; hdr->io_start = req_offset(hdr->req); hdr->good_bytes = mirror->pg_count; hdr->io_completion = desc->pg_io_completion; @@ -578,7 +578,7 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, hdr->args.pgbase = req->wb_pgbase; hdr->args.pages = hdr->page_array.pagevec; hdr->args.count = count; - hdr->args.context = get_nfs_open_context(req->wb_context); + hdr->args.context = get_nfs_open_context(nfs_req_openctx(req)); hdr->args.lock_context = req->wb_lock_context; hdr->args.stable = NFS_UNSTABLE; switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { @@ -935,9 +935,9 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, struct file_lock_context *flctx; if (prev) { - if (!nfs_match_open_context(req->wb_context, prev->wb_context)) + if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev))) return false; - flctx = d_inode(req->wb_context->dentry)->i_flctx; + flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx; if (flctx != NULL && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock)) && diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 7066cd7c7aff..83722e936b4a 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2436,7 +2436,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, + nfs_req_openctx(req), req_offset(req), rd_size, IOMODE_READ, @@ -2463,7 +2463,7 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, pnfs_generic_pg_check_range(pgio, req); if (pgio->pg_lseg == NULL) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, + nfs_req_openctx(req), req_offset(req), wb_size, IOMODE_RW, diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index c0420b979d88..f15609c003d8 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -459,7 +459,7 @@ static inline bool pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx) { - struct inode *inode = d_inode(req->wb_context->dentry); + struct inode *inode = d_inode(nfs_req_openctx(req)->dentry); struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; if (lseg == NULL || ld->mark_request_commit == NULL) @@ -471,7 +471,7 @@ pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, static inline bool pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) { - struct inode *inode = d_inode(req->wb_context->dentry); + struct inode *inode = d_inode(nfs_req_openctx(req)->dentry); struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; if (ld == NULL || ld->clear_request_commit == NULL) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index fad1333dbf71..c799e540ed1e 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); static void nfs_readpage_release(struct nfs_page *req) { - struct inode *inode = d_inode(req->wb_context->dentry); + struct inode *inode = d_inode(nfs_req_openctx(req)->dentry); dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), req->wb_bytes, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 294604784f70..bc5bb9323412 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -964,7 +964,8 @@ static void nfs_clear_request_commit(struct nfs_page *req) { if (test_bit(PG_CLEAN, &req->wb_flags)) { - struct inode *inode = d_inode(req->wb_context->dentry); + struct nfs_open_context *ctx = nfs_req_openctx(req); + struct inode *inode = d_inode(ctx->dentry); struct nfs_commit_info cinfo; nfs_init_cinfo_from_inode(&cinfo, inode); @@ -1219,7 +1220,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) return 0; l_ctx = req->wb_lock_context; do_flush = req->wb_page != page || - !nfs_match_open_context(req->wb_context, ctx); + !nfs_match_open_context(nfs_req_openctx(req), ctx); if (l_ctx && flctx && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock))) { @@ -1422,7 +1423,7 @@ static void nfs_redirty_request(struct nfs_page *req) /* Bump the transmission count */ req->wb_nio++; nfs_mark_request_dirty(req); - set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); + set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags); nfs_end_page_writeback(req); nfs_release_request(req); } @@ -1742,7 +1743,8 @@ void nfs_init_commit(struct nfs_commit_data *data, struct nfs_commit_info *cinfo) { struct nfs_page *first = nfs_list_entry(head->next); - struct inode *inode = d_inode(first->wb_context->dentry); + struct nfs_open_context *ctx = nfs_req_openctx(first); + struct inode *inode = d_inode(ctx->dentry); /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ @@ -1750,7 +1752,7 @@ void nfs_init_commit(struct nfs_commit_data *data, list_splice_init(head, &data->pages); data->inode = inode; - data->cred = first->wb_context->cred; + data->cred = ctx->cred; data->lseg = lseg; /* reference transferred */ /* only set lwb for pnfs commit */ if (lseg) @@ -1763,7 +1765,7 @@ void nfs_init_commit(struct nfs_commit_data *data, /* Note: we always request a commit of the entire inode */ data->args.offset = 0; data->args.count = 0; - data->context = get_nfs_open_context(first->wb_context); + data->context = get_nfs_open_context(ctx); data->res.fattr = &data->fattr; data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); @@ -1846,8 +1848,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) nfs_clear_page_commit(req->wb_page); dprintk("NFS: commit (%s/%llu %d@%lld)", - req->wb_context->dentry->d_sb->s_id, - (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), + nfs_req_openctx(req)->dentry->d_sb->s_id, + (unsigned long long)NFS_FILEID(d_inode(nfs_req_openctx(req)->dentry)), req->wb_bytes, (long long)req_offset(req)); if (status < 0) { @@ -1871,7 +1873,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* We have a mismatch. Write the page again */ dprintk_cont(" mismatch\n"); nfs_mark_request_dirty(req); - set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); + set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags); next: nfs_unlock_and_release_request(req); /* Latency breaker */ diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 8b36800d342d..1ea13e94feb7 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -200,4 +200,10 @@ loff_t req_offset(struct nfs_page *req) return (((loff_t)req->wb_index) << PAGE_SHIFT) + req->wb_offset; } +static inline struct nfs_open_context * +nfs_req_openctx(struct nfs_page *req) +{ + return req->wb_context; +} + #endif /* _LINUX_NFS_PAGE_H */ From c79d183ebb76311ed434bd558279769551d02d5a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 7 Apr 2019 13:59:12 -0400 Subject: [PATCH 29/69] NFS: Remove redundant open context from nfs_page The lock context already references and tracks the open context, so take the opportunity to save some space in struct nfs_page. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pagelist.c | 8 ++------ include/linux/nfs_page.h | 3 +-- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ce6440b79328..6ec30014a439 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -325,7 +325,6 @@ __nfs_create_request(struct nfs_lock_context *l_ctx, struct page *page, req->wb_offset = offset; req->wb_pgbase = pgbase; req->wb_bytes = count; - req->wb_context = get_nfs_open_context(ctx); kref_init(&req->wb_kref); req->wb_nio = 0; return req; @@ -414,8 +413,8 @@ void nfs_unlock_and_release_request(struct nfs_page *req) static void nfs_clear_request(struct nfs_page *req) { struct page *page = req->wb_page; - struct nfs_open_context *ctx = req->wb_context; struct nfs_lock_context *l_ctx = req->wb_lock_context; + struct nfs_open_context *ctx; if (page != NULL) { put_page(page); @@ -424,16 +423,13 @@ static void nfs_clear_request(struct nfs_page *req) if (l_ctx != NULL) { if (atomic_dec_and_test(&l_ctx->io_count)) { wake_up_var(&l_ctx->io_count); + ctx = l_ctx->open_context; if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags)) rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq); } nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } - if (ctx != NULL) { - put_nfs_open_context(ctx); - req->wb_context = NULL; - } } /** diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 1ea13e94feb7..0bbd587fac6a 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -42,7 +42,6 @@ struct nfs_inode; struct nfs_page { struct list_head wb_list; /* Defines state of page: */ struct page *wb_page; /* page to read in/write out */ - struct nfs_open_context *wb_context; /* File state context info */ struct nfs_lock_context *wb_lock_context; /* lock context info */ pgoff_t wb_index; /* Offset >> PAGE_SHIFT */ unsigned int wb_offset, /* Offset & ~PAGE_MASK */ @@ -203,7 +202,7 @@ loff_t req_offset(struct nfs_page *req) static inline struct nfs_open_context * nfs_req_openctx(struct nfs_page *req) { - return req->wb_context; + return req->wb_lock_context->open_context; } #endif /* _LINUX_NFS_PAGE_H */ From ce96e888fe48ecfa868c9a39adc03292c78a80ff Mon Sep 17 00:00:00 2001 From: Xiaoli Feng Date: Sat, 16 Mar 2019 09:43:30 +0800 Subject: [PATCH 30/69] Fix nfs4.2 return -EINVAL when do dedupe operation dedupe_file_range operations is combiled into remap_file_range. But in nfs42_remap_file_range, it's skiped for dedupe operations. Before this patch: # dd if=/dev/zero of=nfs/file bs=1M count=1 # xfs_io -c "dedupe nfs/file 4k 64k 4k" nfs/file XFS_IOC_FILE_EXTENT_SAME: Invalid argument After this patch: # dd if=/dev/zero of=nfs/file bs=1M count=1 # xfs_io -c "dedupe nfs/file 4k 64k 4k" nfs/file deduped 4096/4096 bytes at offset 65536 4 KiB, 1 ops; 0.0046 sec (865.988 KiB/sec and 216.4971 ops/sec) Signed-off-by: Xiaoli Feng Signed-off-by: Anna Schumaker --- fs/nfs/nfs4file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index d2846f164997..cf42a8b939e3 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -187,7 +187,7 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, bool same_inode = false; int ret; - if (remap_flags & ~REMAP_FILE_ADVISORY) + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; /* check alignment w.r.t. clone_blksize */ From 52db6f9a0cd8f6d433a0687aae4f21f209352510 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:38:55 -0400 Subject: [PATCH 31/69] SUNRPC: Avoid digging into the ATOMIC pool Page allocation requests made when the SPARSE_PAGES flag is set are allowed to fail, and are not critical. No need to spend a rare resource. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/socklib.c | 2 +- net/sunrpc/xprtrdma/rpc_rdma.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 7e55cfc69697..9faea12624a6 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -106,7 +106,7 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb /* ACL likes to be lazy in allocating pages - ACLs * are small by default but can get huge. */ if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) { - *ppage = alloc_page(GFP_ATOMIC); + *ppage = alloc_page(GFP_NOWAIT | __GFP_NOWARN); if (unlikely(*ppage == NULL)) { if (copied == 0) copied = -ENOMEM; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 6c1fb270f127..b759b169dadf 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -238,7 +238,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, */ if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) { if (!*ppages) - *ppages = alloc_page(GFP_ATOMIC); + *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); if (!*ppages) return -ENOBUFS; } From b2ca473b920dfbaad7c4f9eb5043258ef71f321f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:39:00 -0400 Subject: [PATCH 32/69] xprtrdma: Fix an frwr_map recovery nit After a DMA map failure in frwr_map, mark the MR so that recycling won't attempt to DMA unmap it. Signed-off-by: Chuck Lever Fixes: e2f34e26710b ("xprtrdma: Yet another double DMA-unmap") Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 52cb6c1b0c2b..a2a2e01cb5dd 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -466,7 +466,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, return seg; out_dmamap_err: - frwr->fr_state = FRWR_IS_INVALID; + mr->mr_dir = DMA_NONE; trace_xprtrdma_frwr_sgerr(mr, i); rpcrdma_mr_put(mr); return ERR_PTR(-EIO); From 1769e6a816dff50d960271eb780e0a40b739b256 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:39:05 -0400 Subject: [PATCH 33/69] xprtrdma: Clean up rpcrdma_create_req() Eventually, I'd like to invoke rpcrdma_create_req() during the call_reserve step. Memory allocation there probably needs to use GFP_NOIO. Therefore a set of GFP flags needs to be passed in. As an additional clean up, just return a pointer or NULL, because the only error return code here is -ENOMEM. Lastly, clean up the function names to be consistent with the pattern: "rpcrdma" _ object-type _ action Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 6 +++--- net/sunrpc/xprtrdma/verbs.c | 29 ++++++++++++++++------------- net/sunrpc/xprtrdma/xprt_rdma.h | 3 ++- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index d79b18c1f4cd..713961a63c49 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -31,9 +31,9 @@ static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, struct rpcrdma_regbuf *rb; size_t size; - req = rpcrdma_create_req(r_xprt); - if (IS_ERR(req)) - return PTR_ERR(req); + req = rpcrdma_req_create(r_xprt, GFP_KERNEL); + if (!req) + return -ENOMEM; rqst = &req->rl_slot; rqst->rq_xprt = xprt; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 30cfc0efe699..71fc41f9a8eb 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -996,22 +996,27 @@ rpcrdma_mr_refresh_worker(struct work_struct *work) rpcrdma_mrs_create(r_xprt); } -struct rpcrdma_req * -rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) +/** + * rpcrdma_req_create - Allocate an rpcrdma_req object + * @r_xprt: controlling r_xprt + * @flags: GFP flags passed to memory allocators + * + * Returns an allocated and fully initialized rpcrdma_req or NULL. + */ +struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, gfp_t flags) { struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; - req = kzalloc(sizeof(*req), GFP_KERNEL); + req = kzalloc(sizeof(*req), flags); if (req == NULL) - return ERR_PTR(-ENOMEM); + return NULL; - rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, - DMA_TO_DEVICE, GFP_KERNEL); + rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, flags); if (IS_ERR(rb)) { kfree(req); - return ERR_PTR(-ENOMEM); + return NULL; } req->rl_rdmabuf = rb; xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); @@ -1086,16 +1091,14 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) INIT_LIST_HEAD(&buf->rb_send_bufs); INIT_LIST_HEAD(&buf->rb_allreqs); + + rc = -ENOMEM; for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; - req = rpcrdma_create_req(r_xprt); - if (IS_ERR(req)) { - dprintk("RPC: %s: request buffer %d alloc" - " failed\n", __func__, i); - rc = PTR_ERR(req); + req = rpcrdma_req_create(r_xprt, GFP_KERNEL); + if (!req) goto out; - } list_add(&req->rl_list, &buf->rb_send_bufs); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 10f6593e1a6a..539558fc9c62 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -528,7 +528,8 @@ int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, /* * Buffer calls - xprtrdma/verbs.c */ -struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); +struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, + gfp_t flags); void rpcrdma_req_destroy(struct rpcrdma_req *req); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); From 23146500b32fbee7eaa57c5002fcd64e5d9b32ca Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:39:11 -0400 Subject: [PATCH 34/69] xprtrdma: Clean up rpcrdma_create_rep() and rpcrdma_destroy_rep() For code legibility, clean up the function names to be consistent with the pattern: "rpcrdma" _ object-type _ action Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 71fc41f9a8eb..caa6a5df12b0 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -76,7 +76,6 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); -static int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp); static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); @@ -1029,25 +1028,20 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, gfp_t flags) return req; } -static int -rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp) +static bool rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_rep *rep; - int rc; - rc = -ENOMEM; rep = kzalloc(sizeof(*rep), GFP_KERNEL); if (rep == NULL) goto out; rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize, DMA_FROM_DEVICE, GFP_KERNEL); - if (IS_ERR(rep->rr_rdmabuf)) { - rc = PTR_ERR(rep->rr_rdmabuf); + if (IS_ERR(rep->rr_rdmabuf)) goto out_free; - } xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base, rdmab_length(rep->rr_rdmabuf)); @@ -1063,12 +1057,12 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp) spin_lock(&buf->rb_lock); list_add(&rep->rr_list, &buf->rb_recv_bufs); spin_unlock(&buf->rb_lock); - return 0; + return true; out_free: kfree(rep); out: - return rc; + return false; } int @@ -1124,8 +1118,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) return rc; } -static void -rpcrdma_destroy_rep(struct rpcrdma_rep *rep) +static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) { rpcrdma_free_regbuf(rep->rr_rdmabuf); kfree(rep); @@ -1205,7 +1198,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) rep = list_first_entry(&buf->rb_recv_bufs, struct rpcrdma_rep, rr_list); list_del(&rep->rr_list); - rpcrdma_destroy_rep(rep); + rpcrdma_rep_destroy(rep); } while (!list_empty(&buf->rb_send_bufs)) { @@ -1334,7 +1327,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) } spin_unlock(&buffers->rb_lock); if (rep) - rpcrdma_destroy_rep(rep); + rpcrdma_rep_destroy(rep); } /* @@ -1351,7 +1344,7 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) list_add(&rep->rr_list, &buffers->rb_recv_bufs); spin_unlock(&buffers->rb_lock); } else { - rpcrdma_destroy_rep(rep); + rpcrdma_rep_destroy(rep); } } @@ -1500,7 +1493,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) list_del(&rep->rr_list); spin_unlock(&buf->rb_lock); if (!rep) { - if (rpcrdma_create_rep(r_xprt, temp)) + if (!rpcrdma_rep_create(r_xprt, temp)) break; continue; } From 8cec3dba76a4d9d7da4a7219663b8c4333f14522 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:39:16 -0400 Subject: [PATCH 35/69] xprtrdma: rpcrdma_regbuf alignment Allocate the struct rpcrdma_regbuf separately from the I/O buffer to better guarantee the alignment of the I/O buffer and eliminate the wasted space between the rpcrdma_regbuf metadata and the buffer itself. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 6 +++--- net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++-- net/sunrpc/xprtrdma/transport.c | 8 ++++---- net/sunrpc/xprtrdma/verbs.c | 27 ++++++++++++++++----------- net/sunrpc/xprtrdma/xprt_rdma.h | 19 ++++++++++--------- 5 files changed, 35 insertions(+), 29 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 713961a63c49..6170ec7ba504 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -45,10 +45,10 @@ static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, size = r_xprt->rx_data.inline_rsize; rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); - if (IS_ERR(rb)) + if (!rb) goto out_fail; req->rl_sendbuf = rb; - xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, + xdr_buf_init(&rqst->rq_snd_buf, rdmab_data(rb), min_t(size_t, size, PAGE_SIZE)); } return 0; @@ -123,7 +123,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf, - req->rl_rdmabuf->rg_base, rqst); + rdmab_data(req->rl_rdmabuf), rqst); p = xdr_reserve_space(&req->rl_stream, 28); if (unlikely(!p)) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index b759b169dadf..cf99c55add1b 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -747,8 +747,8 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) int ret; rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); - xdr_init_encode(xdr, &req->rl_hdrbuf, - req->rl_rdmabuf->rg_base, rqst); + xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf), + rqst); /* Fixed header fields */ ret = -EMSGSIZE; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 7e73abe01cfe..ced9812940f7 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -595,7 +595,7 @@ rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return true; rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags); - if (IS_ERR(rb)) + if (!rb) return false; rpcrdma_free_regbuf(req->rl_sendbuf); @@ -625,7 +625,7 @@ rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return true; rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags); - if (IS_ERR(rb)) + if (!rb) return false; rpcrdma_free_regbuf(req->rl_recvbuf); @@ -660,8 +660,8 @@ xprt_rdma_allocate(struct rpc_task *task) if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) goto out_fail; - rqst->rq_buffer = req->rl_sendbuf->rg_base; - rqst->rq_rbuffer = req->rl_recvbuf->rg_base; + rqst->rq_buffer = rdmab_data(req->rl_sendbuf); + rqst->rq_rbuffer = rdmab_data(req->rl_recvbuf); trace_xprtrdma_op_allocate(task, req); return 0; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index caa6a5df12b0..f88fd3934f56 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1013,12 +1013,12 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, gfp_t flags) return NULL; rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, flags); - if (IS_ERR(rb)) { + if (!rb) { kfree(req); return NULL; } req->rl_rdmabuf = rb; - xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); + xdr_buf_init(&req->rl_hdrbuf, rdmab_data(rb), rdmab_length(rb)); req->rl_buffer = buffer; INIT_LIST_HEAD(&req->rl_registered); @@ -1040,9 +1040,9 @@ static bool rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp) rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize, DMA_FROM_DEVICE, GFP_KERNEL); - if (IS_ERR(rep->rr_rdmabuf)) + if (!rep->rr_rdmabuf) goto out_free; - xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base, + xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf), rdmab_length(rep->rr_rdmabuf)); rep->rr_cqe.done = rpcrdma_wc_receive; @@ -1354,8 +1354,7 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * @direction: direction of data movement * @flags: GFP flags * - * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that - * can be persistently DMA-mapped for I/O. + * Returns a pointer to a rpcrdma_regbuf object, or NULL. * * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for * receiving the payload of RDMA RECV operations. During Long Calls @@ -1367,14 +1366,18 @@ rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction, { struct rpcrdma_regbuf *rb; - rb = kmalloc(sizeof(*rb) + size, flags); - if (rb == NULL) - return ERR_PTR(-ENOMEM); + rb = kmalloc(sizeof(*rb), flags); + if (!rb) + return NULL; + rb->rg_data = kmalloc(size, flags); + if (!rb->rg_data) { + kfree(rb); + return NULL; + } rb->rg_device = NULL; rb->rg_direction = direction; rb->rg_iov.length = size; - return rb; } @@ -1392,7 +1395,7 @@ __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) return false; rb->rg_iov.addr = ib_dma_map_single(device, - (void *)rb->rg_base, + rdmab_data(rb), rdmab_length(rb), rb->rg_direction); if (ib_dma_mapping_error(device, rdmab_addr(rb))) { @@ -1427,6 +1430,8 @@ void rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb) { rpcrdma_dma_unmap_regbuf(rb); + if (rb) + kfree(rb->rg_data); kfree(rb); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 539558fc9c62..1af9674572bd 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -121,33 +121,34 @@ struct rpcrdma_regbuf { struct ib_sge rg_iov; struct ib_device *rg_device; enum dma_data_direction rg_direction; - __be32 rg_base[0] __attribute__ ((aligned(256))); + void *rg_data; }; -static inline u64 -rdmab_addr(struct rpcrdma_regbuf *rb) +static inline u64 rdmab_addr(struct rpcrdma_regbuf *rb) { return rb->rg_iov.addr; } -static inline u32 -rdmab_length(struct rpcrdma_regbuf *rb) +static inline u32 rdmab_length(struct rpcrdma_regbuf *rb) { return rb->rg_iov.length; } -static inline u32 -rdmab_lkey(struct rpcrdma_regbuf *rb) +static inline u32 rdmab_lkey(struct rpcrdma_regbuf *rb) { return rb->rg_iov.lkey; } -static inline struct ib_device * -rdmab_device(struct rpcrdma_regbuf *rb) +static inline struct ib_device *rdmab_device(struct rpcrdma_regbuf *rb) { return rb->rg_device; } +static inline void *rdmab_data(const struct rpcrdma_regbuf *rb) +{ + return rb->rg_data; +} + #define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) /* To ensure a transport can always make forward progress, From bb93a1ae2bf4f6eb3cedf05a2ea4a2e6a80712e6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:39:21 -0400 Subject: [PATCH 36/69] xprtrdma: Allocate req's regbufs at xprt create time Allocating an rpcrdma_req's regbufs at xprt create time enables a pair of micro-optimizations: First, if these regbufs are always there, we can eliminate two conditional branches from the hot xprt_rdma_allocate path. Second, by allocating a 1KB buffer, it places a lower bound on the size of these buffers, without adding yet another conditional branch. The lower bound reduces the number of hardway re- allocations. In fact, for some workloads it completely eliminates hardway allocations. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 18 ++++------------ net/sunrpc/xprtrdma/transport.c | 4 ++-- net/sunrpc/xprtrdma/verbs.c | 34 +++++++++++++++++++++++-------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 4 files changed, 33 insertions(+), 25 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 6170ec7ba504..e1a125ad888d 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -28,10 +28,10 @@ static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, unsigned int i; for (i = 0; i < (count << 1); i++) { - struct rpcrdma_regbuf *rb; size_t size; - req = rpcrdma_req_create(r_xprt, GFP_KERNEL); + size = min_t(size_t, r_xprt->rx_data.inline_rsize, PAGE_SIZE); + req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); if (!req) return -ENOMEM; rqst = &req->rl_slot; @@ -42,20 +42,10 @@ static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, spin_lock(&xprt->bc_pa_lock); list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); spin_unlock(&xprt->bc_pa_lock); - - size = r_xprt->rx_data.inline_rsize; - rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); - if (!rb) - goto out_fail; - req->rl_sendbuf = rb; - xdr_buf_init(&rqst->rq_snd_buf, rdmab_data(rb), - min_t(size_t, size, PAGE_SIZE)); + xdr_buf_init(&rqst->rq_snd_buf, rdmab_data(req->rl_sendbuf), + size); } return 0; - -out_fail: - rpcrdma_req_destroy(req); - return -ENOMEM; } /** diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index ced9812940f7..a5da43f3b035 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -591,7 +591,7 @@ rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, { struct rpcrdma_regbuf *rb; - if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size) + if (likely(rdmab_length(req->rl_sendbuf) >= size)) return true; rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags); @@ -621,7 +621,7 @@ rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, { struct rpcrdma_regbuf *rb; - if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size) + if (likely(rdmab_length(req->rl_recvbuf) >= size)) return true; rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index f88fd3934f56..77e0f21c9017 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -998,11 +998,13 @@ rpcrdma_mr_refresh_worker(struct work_struct *work) /** * rpcrdma_req_create - Allocate an rpcrdma_req object * @r_xprt: controlling r_xprt + * @size: initial size, in bytes, of send and receive buffers * @flags: GFP flags passed to memory allocators * * Returns an allocated and fully initialized rpcrdma_req or NULL. */ -struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, gfp_t flags) +struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, + gfp_t flags) { struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; struct rpcrdma_regbuf *rb; @@ -1010,22 +1012,37 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, gfp_t flags) req = kzalloc(sizeof(*req), flags); if (req == NULL) - return NULL; + goto out1; rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, flags); - if (!rb) { - kfree(req); - return NULL; - } + if (!rb) + goto out2; req->rl_rdmabuf = rb; xdr_buf_init(&req->rl_hdrbuf, rdmab_data(rb), rdmab_length(rb)); + + req->rl_sendbuf = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags); + if (!req->rl_sendbuf) + goto out3; + + req->rl_recvbuf = rpcrdma_alloc_regbuf(size, DMA_NONE, flags); + if (!req->rl_recvbuf) + goto out4; + req->rl_buffer = buffer; INIT_LIST_HEAD(&req->rl_registered); - spin_lock(&buffer->rb_lock); list_add(&req->rl_all, &buffer->rb_allreqs); spin_unlock(&buffer->rb_lock); return req; + +out4: + kfree(req->rl_sendbuf); +out3: + kfree(req->rl_rdmabuf); +out2: + kfree(req); +out1: + return NULL; } static bool rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp) @@ -1090,7 +1107,8 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; - req = rpcrdma_req_create(r_xprt, GFP_KERNEL); + req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE, + GFP_KERNEL); if (!req) goto out; list_add(&req->rl_list, &buf->rb_send_bufs); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 1af9674572bd..03d5ce443bf0 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -529,7 +529,7 @@ int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, /* * Buffer calls - xprtrdma/verbs.c */ -struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, +struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, gfp_t flags); void rpcrdma_req_destroy(struct rpcrdma_req *req); int rpcrdma_buffer_create(struct rpcrdma_xprt *); From 0f665ceb71a20520bdce76fb63ad68c21841aa62 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:39:27 -0400 Subject: [PATCH 37/69] xprtrdma: De-duplicate "allocate new, free old regbuf" Clean up by providing an API to do this common task. At this point, the difference between rpcrdma_get_sendbuf and rpcrdma_get_recvbuf has become tiny. These can be collapsed into a single helper. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 59 +++++++-------------------------- net/sunrpc/xprtrdma/verbs.c | 25 ++++++++++++++ net/sunrpc/xprtrdma/xprt_rdma.h | 2 ++ 3 files changed, 39 insertions(+), 47 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index a5da43f3b035..8cf4fa36ed66 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -585,52 +585,15 @@ xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst) rpc_wake_up_next(&xprt->backlog); } -static bool -rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - size_t size, gfp_t flags) +static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_regbuf *rb, size_t size, + gfp_t flags) { - struct rpcrdma_regbuf *rb; - - if (likely(rdmab_length(req->rl_sendbuf) >= size)) - return true; - - rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags); - if (!rb) - return false; - - rpcrdma_free_regbuf(req->rl_sendbuf); - r_xprt->rx_stats.hardway_register_count += size; - req->rl_sendbuf = rb; - return true; -} - -/* The rq_rcv_buf is used only if a Reply chunk is necessary. - * The decision to use a Reply chunk is made later in - * rpcrdma_marshal_req. This buffer is registered at that time. - * - * Otherwise, the associated RPC Reply arrives in a separate - * Receive buffer, arbitrarily chosen by the HCA. The buffer - * allocated here for the RPC Reply is not utilized in that - * case. See rpcrdma_inline_fixup. - * - * A regbuf is used here to remember the buffer size. - */ -static bool -rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - size_t size, gfp_t flags) -{ - struct rpcrdma_regbuf *rb; - - if (likely(rdmab_length(req->rl_recvbuf) >= size)) - return true; - - rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags); - if (!rb) - return false; - - rpcrdma_free_regbuf(req->rl_recvbuf); - r_xprt->rx_stats.hardway_register_count += size; - req->rl_recvbuf = rb; + if (unlikely(rdmab_length(rb) < size)) { + if (!rpcrdma_regbuf_realloc(rb, size, flags)) + return false; + r_xprt->rx_stats.hardway_register_count += size; + } return true; } @@ -655,9 +618,11 @@ xprt_rdma_allocate(struct rpc_task *task) if (RPC_IS_SWAPPER(task)) flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; - if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags)) + if (!rpcrdma_check_regbuf(r_xprt, req->rl_sendbuf, rqst->rq_callsize, + flags)) goto out_fail; - if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) + if (!rpcrdma_check_regbuf(r_xprt, req->rl_recvbuf, rqst->rq_rcvsize, + flags)) goto out_fail; rqst->rq_buffer = rdmab_data(req->rl_sendbuf); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 77e0f21c9017..734dfe5d18bd 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1399,6 +1399,31 @@ rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction, return rb; } +/** + * rpcrdma_regbuf_realloc - re-allocate a SEND/RECV buffer + * @rb: regbuf to reallocate + * @size: size of buffer to be allocated, in bytes + * @flags: GFP flags + * + * Returns true if reallocation was successful. If false is + * returned, @rb is left untouched. + */ +bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags) +{ + void *buf; + + buf = kmalloc(size, flags); + if (!buf) + return false; + + rpcrdma_dma_unmap_regbuf(rb); + kfree(rb->rg_data); + + rb->rg_data = buf; + rb->rg_iov.length = size; + return true; +} + /** * __rpcrdma_map_regbuf - DMA-map a regbuf * @ia: controlling rpcrdma_ia diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 03d5ce443bf0..751d4761d682 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -552,6 +552,8 @@ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction, gfp_t); +bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, + gfp_t flags); bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); void rpcrdma_free_regbuf(struct rpcrdma_regbuf *); From d2832af38dfd1d3b135b13c6106b2c5de16a6747 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:39:32 -0400 Subject: [PATCH 38/69] xprtrdma: Clean up regbuf helpers For code legibility, clean up the function names to be consistent with the pattern: "rpcrdma" _ object-type _ action Also rpcrdma_regbuf_alloc and rpcrdma_regbuf_free no longer have any callers outside of verbs.c, and can thus be made static. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 47 +++++++++--------- net/sunrpc/xprtrdma/verbs.c | 88 +++++++++++++++------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 27 ++++++---- 3 files changed, 80 insertions(+), 82 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index cf99c55add1b..231a44b9c152 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -536,22 +536,21 @@ rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc) /* Prepare an SGE for the RPC-over-RDMA transport header. */ -static bool -rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, - u32 len) +static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, u32 len) { struct rpcrdma_sendctx *sc = req->rl_sendctx; struct rpcrdma_regbuf *rb = req->rl_rdmabuf; struct ib_sge *sge = sc->sc_sges; - if (!rpcrdma_dma_map_regbuf(ia, rb)) + if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) goto out_regbuf; sge->addr = rdmab_addr(rb); sge->length = len; sge->lkey = rdmab_lkey(rb); - ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, - sge->length, DMA_TO_DEVICE); + ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, + DMA_TO_DEVICE); sc->sc_wr.num_sge++; return true; @@ -563,22 +562,21 @@ rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, /* Prepare the Send SGEs. The head and tail iovec, and each entry * in the page list, gets its own SGE. */ -static bool -rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, - struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) +static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr, + enum rpcrdma_chunktype rtype) { struct rpcrdma_sendctx *sc = req->rl_sendctx; unsigned int sge_no, page_base, len, remaining; struct rpcrdma_regbuf *rb = req->rl_sendbuf; - struct ib_device *device = ia->ri_device; struct ib_sge *sge = sc->sc_sges; - u32 lkey = ia->ri_pd->local_dma_lkey; struct page *page, **ppages; /* The head iovec is straightforward, as it is already * DMA-mapped. Sync the content that has changed. */ - if (!rpcrdma_dma_map_regbuf(ia, rb)) + if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) goto out_regbuf; sge_no = 1; sge[sge_no].addr = rdmab_addr(rb); @@ -626,13 +624,14 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, goto out_mapping_overflow; len = min_t(u32, PAGE_SIZE - page_base, remaining); - sge[sge_no].addr = ib_dma_map_page(device, *ppages, - page_base, len, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(device, sge[sge_no].addr)) + sge[sge_no].addr = + ib_dma_map_page(rdmab_device(rb), *ppages, + page_base, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdmab_device(rb), + sge[sge_no].addr)) goto out_mapping_err; sge[sge_no].length = len; - sge[sge_no].lkey = lkey; + sge[sge_no].lkey = rdmab_lkey(rb); sc->sc_unmap_count++; ppages++; @@ -653,13 +652,13 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, map_tail: sge_no++; - sge[sge_no].addr = ib_dma_map_page(device, page, - page_base, len, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(device, sge[sge_no].addr)) + sge[sge_no].addr = + ib_dma_map_page(rdmab_device(rb), page, page_base, len, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdmab_device(rb), sge[sge_no].addr)) goto out_mapping_err; sge[sge_no].length = len; - sge[sge_no].lkey = lkey; + sge[sge_no].lkey = rdmab_lkey(rb); sc->sc_unmap_count++; } @@ -707,11 +706,11 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, req->rl_sendctx->sc_req = req; __clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); - if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen)) + if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen)) return -EIO; if (rtype != rpcrdma_areadch) - if (!rpcrdma_prepare_msg_sges(&r_xprt->rx_ia, req, xdr, rtype)) + if (!rpcrdma_prepare_msg_sges(r_xprt, req, xdr, rtype)) return -EIO; return 0; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 734dfe5d18bd..81548fc2ee7f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -76,7 +76,11 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); -static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); +static struct rpcrdma_regbuf * +rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, + gfp_t flags); +static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb); +static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); /* Wait for outstanding transport work to finish. @@ -437,11 +441,11 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) * mappings and MRs are gone. */ list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list) - rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf); + rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); list_for_each_entry(req, &buf->rb_allreqs, rl_all) { - rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf); - rpcrdma_dma_unmap_regbuf(req->rl_sendbuf); - rpcrdma_dma_unmap_regbuf(req->rl_recvbuf); + rpcrdma_regbuf_dma_unmap(req->rl_rdmabuf); + rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); + rpcrdma_regbuf_dma_unmap(req->rl_recvbuf); } rpcrdma_mrs_destroy(buf); ib_dealloc_pd(ia->ri_pd); @@ -1014,17 +1018,17 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, if (req == NULL) goto out1; - rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, flags); + rb = rpcrdma_regbuf_alloc(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, flags); if (!rb) goto out2; req->rl_rdmabuf = rb; xdr_buf_init(&req->rl_hdrbuf, rdmab_data(rb), rdmab_length(rb)); - req->rl_sendbuf = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags); + req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE, flags); if (!req->rl_sendbuf) goto out3; - req->rl_recvbuf = rpcrdma_alloc_regbuf(size, DMA_NONE, flags); + req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE, flags); if (!req->rl_recvbuf) goto out4; @@ -1055,7 +1059,7 @@ static bool rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp) if (rep == NULL) goto out; - rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize, + rep->rr_rdmabuf = rpcrdma_regbuf_alloc(cdata->inline_rsize, DMA_FROM_DEVICE, GFP_KERNEL); if (!rep->rr_rdmabuf) goto out_free; @@ -1138,7 +1142,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) { - rpcrdma_free_regbuf(rep->rr_rdmabuf); + rpcrdma_regbuf_free(rep->rr_rdmabuf); kfree(rep); } @@ -1154,9 +1158,9 @@ rpcrdma_req_destroy(struct rpcrdma_req *req) { list_del(&req->rl_all); - rpcrdma_free_regbuf(req->rl_recvbuf); - rpcrdma_free_regbuf(req->rl_sendbuf); - rpcrdma_free_regbuf(req->rl_rdmabuf); + rpcrdma_regbuf_free(req->rl_recvbuf); + rpcrdma_regbuf_free(req->rl_sendbuf); + rpcrdma_regbuf_free(req->rl_rdmabuf); kfree(req); } @@ -1366,20 +1370,14 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) } } -/** - * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers - * @size: size of buffer to be allocated, in bytes - * @direction: direction of data movement - * @flags: GFP flags - * - * Returns a pointer to a rpcrdma_regbuf object, or NULL. +/* Returns a pointer to a rpcrdma_regbuf object, or NULL. * * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for * receiving the payload of RDMA RECV operations. During Long Calls * or Replies they may be registered externally via frwr_map. */ -struct rpcrdma_regbuf * -rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction, +static struct rpcrdma_regbuf * +rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, gfp_t flags) { struct rpcrdma_regbuf *rb; @@ -1416,7 +1414,7 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags) if (!buf) return false; - rpcrdma_dma_unmap_regbuf(rb); + rpcrdma_regbuf_dma_unmap(rb); kfree(rb->rg_data); rb->rg_data = buf; @@ -1425,34 +1423,33 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags) } /** - * __rpcrdma_map_regbuf - DMA-map a regbuf - * @ia: controlling rpcrdma_ia + * __rpcrdma_regbuf_dma_map - DMA-map a regbuf + * @r_xprt: controlling transport instance * @rb: regbuf to be mapped + * + * Returns true if the buffer is now DMA mapped to @r_xprt's device */ -bool -__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) +bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_regbuf *rb) { - struct ib_device *device = ia->ri_device; + struct ib_device *device = r_xprt->rx_ia.ri_device; if (rb->rg_direction == DMA_NONE) return false; - rb->rg_iov.addr = ib_dma_map_single(device, - rdmab_data(rb), - rdmab_length(rb), - rb->rg_direction); + rb->rg_iov.addr = ib_dma_map_single(device, rdmab_data(rb), + rdmab_length(rb), rb->rg_direction); if (ib_dma_mapping_error(device, rdmab_addr(rb))) { trace_xprtrdma_dma_maperr(rdmab_addr(rb)); return false; } rb->rg_device = device; - rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey; + rb->rg_iov.lkey = r_xprt->rx_ia.ri_pd->local_dma_lkey; return true; } -static void -rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb) +static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb) { if (!rb) return; @@ -1460,19 +1457,14 @@ rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb) if (!rpcrdma_regbuf_is_mapped(rb)) return; - ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb), - rdmab_length(rb), rb->rg_direction); + ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb), rdmab_length(rb), + rb->rg_direction); rb->rg_device = NULL; } -/** - * rpcrdma_free_regbuf - deregister and free registered buffer - * @rb: regbuf to be deregistered and freed - */ -void -rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb) +static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) { - rpcrdma_dma_unmap_regbuf(rb); + rpcrdma_regbuf_dma_unmap(rb); if (rb) kfree(rb->rg_data); kfree(rb); @@ -1547,11 +1539,9 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) } rb = rep->rr_rdmabuf; - if (!rpcrdma_regbuf_is_mapped(rb)) { - if (!__rpcrdma_dma_map_regbuf(&r_xprt->rx_ia, rb)) { - rpcrdma_recv_buffer_put(rep); - break; - } + if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) { + rpcrdma_recv_buffer_put(rep); + break; } trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 751d4761d682..b942c9d322a7 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -550,25 +550,34 @@ struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); -struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction, - gfp_t); bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags); -bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); -void rpcrdma_free_regbuf(struct rpcrdma_regbuf *); +bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_regbuf *rb); -static inline bool -rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb) +/** + * rpcrdma_regbuf_is_mapped - check if buffer is DMA mapped + * + * Returns true if the buffer is now mapped to rb->rg_device. + */ +static inline bool rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb) { return rb->rg_device != NULL; } -static inline bool -rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) +/** + * rpcrdma_regbuf_dma_map - DMA-map a regbuf + * @r_xprt: controlling transport instance + * @rb: regbuf to be mapped + * + * Returns true if the buffer is currently DMA mapped. + */ +static inline bool rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_regbuf *rb) { if (likely(rpcrdma_regbuf_is_mapped(rb))) return true; - return __rpcrdma_dma_map_regbuf(ia, rb); + return __rpcrdma_regbuf_dma_map(r_xprt, rb); } /* From 3f9c7e76934790c53a48b11c7ad54770cd3ae50d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:39:37 -0400 Subject: [PATCH 39/69] xprtrdma: Backchannel can use GFP_KERNEL allocations The Receive handler runs in process context, thus can use on-demand GFP_KERNEL allocations instead of pre-allocation. This makes the xprtrdma backchannel independent of the number of backchannel session slots provisioned by the Upper Layer protocol. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 104 ++++++++++++------------------ 1 file changed, 40 insertions(+), 64 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index e1a125ad888d..ae51ef6a897a 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -19,35 +19,6 @@ #undef RPCRDMA_BACKCHANNEL_DEBUG -static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, - unsigned int count) -{ - struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct rpcrdma_req *req; - struct rpc_rqst *rqst; - unsigned int i; - - for (i = 0; i < (count << 1); i++) { - size_t size; - - size = min_t(size_t, r_xprt->rx_data.inline_rsize, PAGE_SIZE); - req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); - if (!req) - return -ENOMEM; - rqst = &req->rl_slot; - - rqst->rq_xprt = xprt; - INIT_LIST_HEAD(&rqst->rq_bc_list); - __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); - spin_lock(&xprt->bc_pa_lock); - list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock(&xprt->bc_pa_lock); - xdr_buf_init(&rqst->rq_snd_buf, rdmab_data(req->rl_sendbuf), - size); - } - return 0; -} - /** * xprt_rdma_bc_setup - Pre-allocate resources for handling backchannel requests * @xprt: transport associated with these backchannel resources @@ -58,34 +29,10 @@ static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - int rc; - /* The backchannel reply path returns each rpc_rqst to the - * bc_pa_list _after_ the reply is sent. If the server is - * faster than the client, it can send another backward - * direction request before the rpc_rqst is returned to the - * list. The client rejects the request in this case. - * - * Twice as many rpc_rqsts are prepared to ensure there is - * always an rpc_rqst available as soon as a reply is sent. - */ - if (reqs > RPCRDMA_BACKWARD_WRS >> 1) - goto out_err; - - rc = rpcrdma_bc_setup_reqs(r_xprt, reqs); - if (rc) - goto out_free; - - r_xprt->rx_buf.rb_bc_srv_max_requests = reqs; + r_xprt->rx_buf.rb_bc_srv_max_requests = RPCRDMA_BACKWARD_WRS >> 1; trace_xprtrdma_cb_setup(r_xprt, reqs); return 0; - -out_free: - xprt_rdma_bc_destroy(xprt, reqs); - -out_err: - pr_err("RPC: %s: setup backchannel transport failed\n", __func__); - return -ENOMEM; } /** @@ -213,6 +160,43 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) spin_unlock(&xprt->bc_pa_lock); } +static struct rpc_rqst *rpcrdma_bc_rqst_get(struct rpcrdma_xprt *r_xprt) +{ + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpcrdma_req *req; + struct rpc_rqst *rqst; + size_t size; + + spin_lock(&xprt->bc_pa_lock); + rqst = list_first_entry_or_null(&xprt->bc_pa_list, struct rpc_rqst, + rq_bc_pa_list); + if (!rqst) + goto create_req; + list_del(&rqst->rq_bc_pa_list); + spin_unlock(&xprt->bc_pa_lock); + return rqst; + +create_req: + spin_unlock(&xprt->bc_pa_lock); + + /* Set a limit to prevent a remote from overrunning our resources. + */ + if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS) + return NULL; + + size = min_t(size_t, r_xprt->rx_data.inline_rsize, PAGE_SIZE); + req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); + if (!req) + return NULL; + + xprt->bc_alloc_count++; + rqst = &req->rl_slot; + rqst->rq_xprt = xprt; + __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); + xdr_buf_init(&rqst->rq_snd_buf, rdmab_data(req->rl_sendbuf), size); + return rqst; +} + /** * rpcrdma_bc_receive_call - Handle a backward direction call * @r_xprt: transport receiving the call @@ -244,18 +228,10 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, pr_info("RPC: %s: %*ph\n", __func__, size, p); #endif - /* Grab a free bc rqst */ - spin_lock(&xprt->bc_pa_lock); - if (list_empty(&xprt->bc_pa_list)) { - spin_unlock(&xprt->bc_pa_lock); + rqst = rpcrdma_bc_rqst_get(r_xprt); + if (!rqst) goto out_overflow; - } - rqst = list_first_entry(&xprt->bc_pa_list, - struct rpc_rqst, rq_bc_pa_list); - list_del(&rqst->rq_bc_pa_list); - spin_unlock(&xprt->bc_pa_lock); - /* Prepare rqst */ rqst->rq_reply_bytes_recvd = 0; rqst->rq_xid = *p; From 4ba02e8d0ea5461d0e55e76c91481f1153f63365 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:39:42 -0400 Subject: [PATCH 40/69] xprtrdma: Increase maximum number of backchannel requests Reflects the change introduced in commit 067c46967160 ("NFSv4.1: Bump the default callback session slot count to 16"). Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/xprt_rdma.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index b942c9d322a7..7621e2d0f107 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -103,12 +103,14 @@ struct rpcrdma_ep { /* Pre-allocate extra Work Requests for handling backward receives * and sends. This is a fixed value because the Work Queues are - * allocated when the forward channel is set up. + * allocated when the forward channel is set up, long before the + * backchannel is provisioned. This value is two times + * NFS4_DEF_CB_SLOT_TABLE_SIZE. */ #if defined(CONFIG_SUNRPC_BACKCHANNEL) -#define RPCRDMA_BACKWARD_WRS (8) +#define RPCRDMA_BACKWARD_WRS (32) #else -#define RPCRDMA_BACKWARD_WRS (0) +#define RPCRDMA_BACKWARD_WRS (0) #endif /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV From 17e4c443c0b433354016df60a7bd3f1c6aac759c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:39:48 -0400 Subject: [PATCH 41/69] xprtrdma: Trace marshaling failures Record an event when rpcrdma_marshal_req returns a non-zero return value to help track down why an xprt close might have occurred. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 27 +++++++++++++++++++++++++++ net/sunrpc/xprtrdma/rpc_rdma.c | 1 + 2 files changed, 28 insertions(+) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 962975b4313f..df9851cb82b2 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -511,6 +511,33 @@ TRACE_EVENT(xprtrdma_marshal, ) ); +TRACE_EVENT(xprtrdma_marshal_failed, + TP_PROTO(const struct rpc_rqst *rqst, + int ret + ), + + TP_ARGS(rqst, ret), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + __field(int, ret) + ), + + TP_fast_assign( + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqst->rq_xid); + __entry->ret = ret; + ), + + TP_printk("task:%u@%u xid=0x%08x: ret=%d", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->ret + ) +); + TRACE_EVENT(xprtrdma_post_send, TP_PROTO( const struct rpcrdma_req *req, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 231a44b9c152..45cba06655ea 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -875,6 +875,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) return 0; out_err: + trace_xprtrdma_marshal_failed(rqst, ret); switch (ret) { case -EAGAIN: xprt_wait_for_buffer_space(rqst->rq_xprt); From dbcc53a52df256880c2ffa4ee45661419435998a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:39:53 -0400 Subject: [PATCH 42/69] xprtrdma: Clean up sendctx functions Minor clean-ups I've stumbled on since sendctx was merged last year. In particular, making Send completion processing more efficient appears to have a measurable impact on IOPS throughput. Note: test_and_clear_bit() returns a value, thus an explicit memory barrier is not necessary. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 27 ++++++++++++--------------- net/sunrpc/xprtrdma/verbs.c | 17 ++++++++--------- net/sunrpc/xprtrdma/xprt_rdma.h | 5 +++-- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 45cba06655ea..5cb060c87543 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -508,30 +508,26 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, } /** - * rpcrdma_unmap_sendctx - DMA-unmap Send buffers + * rpcrdma_sendctx_unmap - DMA-unmap Send buffer * @sc: sendctx containing SGEs to unmap * */ -void -rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc) +void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc) { - struct rpcrdma_ia *ia = &sc->sc_xprt->rx_ia; struct ib_sge *sge; - unsigned int count; /* The first two SGEs contain the transport header and * the inline buffer. These are always left mapped so * they can be cheaply re-used. */ - sge = &sc->sc_sges[2]; - for (count = sc->sc_unmap_count; count; ++sge, --count) - ib_dma_unmap_page(ia->ri_device, - sge->addr, sge->length, DMA_TO_DEVICE); + for (sge = &sc->sc_sges[2]; sc->sc_unmap_count; + ++sge, --sc->sc_unmap_count) + ib_dma_unmap_page(sc->sc_device, sge->addr, sge->length, + DMA_TO_DEVICE); - if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &sc->sc_req->rl_flags)) { - smp_mb__after_atomic(); + if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, + &sc->sc_req->rl_flags)) wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES); - } } /* Prepare an SGE for the RPC-over-RDMA transport header. @@ -578,6 +574,7 @@ static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt, */ if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) goto out_regbuf; + sc->sc_device = rdmab_device(rb); sge_no = 1; sge[sge_no].addr = rdmab_addr(rb); sge[sge_no].length = xdr->head[0].iov_len; @@ -673,12 +670,12 @@ static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt, return false; out_mapping_overflow: - rpcrdma_unmap_sendctx(sc); + rpcrdma_sendctx_unmap(sc); pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no); return false; out_mapping_err: - rpcrdma_unmap_sendctx(sc); + rpcrdma_sendctx_unmap(sc); trace_xprtrdma_dma_maperr(sge[sge_no].addr); return false; } @@ -698,7 +695,7 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, u32 hdrlen, struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) { - req->rl_sendctx = rpcrdma_sendctx_get_locked(&r_xprt->rx_buf); + req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt); if (!req->rl_sendctx) return -EAGAIN; req->rl_sendctx->sc_wr.num_sge = 0; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 81548fc2ee7f..1ad25190f2d1 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -870,20 +870,20 @@ static unsigned long rpcrdma_sendctx_next(struct rpcrdma_buffer *buf, /** * rpcrdma_sendctx_get_locked - Acquire a send context - * @buf: transport buffers from which to acquire an unused context + * @r_xprt: controlling transport instance * * Returns pointer to a free send completion context; or NULL if * the queue is empty. * * Usage: Called to acquire an SGE array before preparing a Send WR. * - * The caller serializes calls to this function (per rpcrdma_buffer), - * and provides an effective memory barrier that flushes the new value + * The caller serializes calls to this function (per transport), and + * provides an effective memory barrier that flushes the new value * of rb_sc_head. */ -struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf) +struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_xprt *r_xprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_sendctx *sc; unsigned long next_head; @@ -908,7 +908,6 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf) * backing up. Cause the caller to pause and try again. */ set_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags); - r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); r_xprt->rx_stats.empty_sendctx_q++; return NULL; } @@ -920,7 +919,7 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf) * Usage: Called from Send completion to return a sendctxt * to the queue. * - * The caller serializes calls to this function (per rpcrdma_buffer). + * The caller serializes calls to this function (per transport). */ static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) @@ -928,7 +927,7 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf; unsigned long next_tail; - /* Unmap SGEs of previously completed by unsignaled + /* Unmap SGEs of previously completed but unsignaled * Sends by walking up the queue until @sc is found. */ next_tail = buf->rb_sc_tail; @@ -936,7 +935,7 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) next_tail = rpcrdma_sendctx_next(buf, next_tail); /* ORDER: item must be accessed _before_ tail is updated */ - rpcrdma_unmap_sendctx(buf->rb_sc_ctxs[next_tail]); + rpcrdma_sendctx_unmap(buf->rb_sc_ctxs[next_tail]); } while (buf->rb_sc_ctxs[next_tail] != sc); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 7621e2d0f107..8afb5fc1de05 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -225,6 +225,7 @@ struct rpcrdma_xprt; struct rpcrdma_sendctx { struct ib_send_wr sc_wr; struct ib_cqe sc_cqe; + struct ib_device *sc_device; struct rpcrdma_xprt *sc_xprt; struct rpcrdma_req *sc_req; unsigned int sc_unmap_count; @@ -536,7 +537,7 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, void rpcrdma_req_destroy(struct rpcrdma_req *req); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); -struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); +struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); @@ -625,7 +626,7 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, u32 hdrlen, struct xdr_buf *xdr, enum rpcrdma_chunktype rtype); -void rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc); +void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc); int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); From c209e49ceac0ff479f79ac5cd2fbf8be80621203 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:39:59 -0400 Subject: [PATCH 43/69] xprtrdma: More Send completion batching Instead of using a fixed number, allow the amount of Send completion batching to vary based on the client's maximum credit limit. - A larger default gives a small boost to IOPS throughput - Reducing it based on max_requests gives a safe result when the max credit limit is cranked down (eg. when the device has a small max_qp_wr). Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 4 +--- net/sunrpc/xprtrdma/xprt_rdma.h | 10 ---------- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1ad25190f2d1..1ed761a12f86 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -521,9 +521,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.cap.max_send_sge, ep->rep_attr.cap.max_recv_sge); - /* set trigger for requesting send completion */ - ep->rep_send_batch = min_t(unsigned int, RPCRDMA_MAX_SEND_BATCH, - cdata->max_requests >> 2); + ep->rep_send_batch = cdata->max_requests >> 3; ep->rep_send_count = ep->rep_send_batch; init_waitqueue_head(&ep->rep_connect_wait); ep->rep_receive_count = 0; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 8afb5fc1de05..f8563937c8c6 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -232,16 +232,6 @@ struct rpcrdma_sendctx { struct ib_sge sc_sges[]; }; -/* Limit the number of SGEs that can be unmapped during one - * Send completion. This caps the amount of work a single - * completion can do before returning to the provider. - * - * Setting this to zero disables Send completion batching. - */ -enum { - RPCRDMA_MAX_SEND_BATCH = 7, -}; - /* * struct rpcrdma_mr - external memory region metadata * From f19bd0bbd363fb97756ed83f53f48413d3e601aa Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:40:04 -0400 Subject: [PATCH 44/69] xprtrdma: Eliminate rpcrdma_ia::ri_device Clean up. Since commit 54cbd6b0c6b9 ("xprtrdma: Delay DMA mapping Send and Receive buffers"), a pointer to the device is now saved in each regbuf when it is DMA mapped. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 17 +++++++++-------- net/sunrpc/xprtrdma/verbs.c | 29 +++++++++++++---------------- net/sunrpc/xprtrdma/xprt_rdma.h | 7 +++---- 3 files changed, 25 insertions(+), 28 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index a2a2e01cb5dd..7cd27184ecd1 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -82,13 +82,13 @@ /** * frwr_is_supported - Check if device supports FRWR - * @ia: interface adapter to check + * @device: interface adapter to check * * Returns true if device supports FRWR, otherwise false */ -bool frwr_is_supported(struct rpcrdma_ia *ia) +bool frwr_is_supported(struct ib_device *device) { - struct ib_device_attr *attrs = &ia->ri_device->attrs; + struct ib_device_attr *attrs = &device->attrs; if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) goto out_not_supported; @@ -98,7 +98,7 @@ bool frwr_is_supported(struct rpcrdma_ia *ia) out_not_supported: pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n", - ia->ri_device->name); + device->name); return false; } @@ -131,7 +131,7 @@ frwr_mr_recycle_worker(struct work_struct *work) if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } @@ -211,7 +211,7 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { - struct ib_device_attr *attrs = &ia->ri_device->attrs; + struct ib_device_attr *attrs = &ia->ri_id->device->attrs; int max_qp_wr, depth, delta; ia->ri_mrtype = IB_MR_TYPE_MEM_REG; @@ -253,7 +253,7 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, } while (delta > 0); } - max_qp_wr = ia->ri_device->attrs.max_qp_wr; + max_qp_wr = ia->ri_id->device->attrs.max_qp_wr; max_qp_wr -= RPCRDMA_BACKWARD_WRS; max_qp_wr -= 1; if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) @@ -436,7 +436,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, } mr->mr_dir = rpcrdma_data_dir(writing); - mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); + mr->mr_nents = + ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1ed761a12f86..672993cee70d 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -250,7 +250,7 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DEVICE_REMOVAL: #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) pr_info("rpcrdma: removing device %s for %s:%s\n", - ia->ri_device->name, + ia->ri_id->device->name, rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); #endif set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); @@ -259,7 +259,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) wait_for_completion(&ia->ri_remove_done); ia->ri_id = NULL; - ia->ri_device = NULL; /* Return 1 to ensure the core destroys the id. */ return 1; case RDMA_CM_EVENT_ESTABLISHED: @@ -294,7 +293,7 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__, rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), - ia->ri_device->name, rdma_event_msg(event->event)); + ia->ri_id->device->name, rdma_event_msg(event->event)); return 0; } @@ -373,9 +372,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt) rc = PTR_ERR(ia->ri_id); goto out_err; } - ia->ri_device = ia->ri_id->device; - ia->ri_pd = ib_alloc_pd(ia->ri_device, 0); + ia->ri_pd = ib_alloc_pd(ia->ri_id->device, 0); if (IS_ERR(ia->ri_pd)) { rc = PTR_ERR(ia->ri_pd); pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); @@ -384,12 +382,12 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt) switch (xprt_rdma_memreg_strategy) { case RPCRDMA_FRWR: - if (frwr_is_supported(ia)) + if (frwr_is_supported(ia->ri_id->device)) break; /*FALLTHROUGH*/ default: pr_err("rpcrdma: Device %s does not support memreg mode %d\n", - ia->ri_device->name, xprt_rdma_memreg_strategy); + ia->ri_id->device->name, xprt_rdma_memreg_strategy); rc = -EINVAL; goto out_err; } @@ -471,7 +469,6 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) rdma_destroy_id(ia->ri_id); } ia->ri_id = NULL; - ia->ri_device = NULL; /* If the pd is still busy, xprtrdma missed freeing a resource */ if (ia->ri_pd && !IS_ERR(ia->ri_pd)) @@ -491,7 +488,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, unsigned int max_sge; int rc; - max_sge = min_t(unsigned int, ia->ri_device->attrs.max_send_sge, + max_sge = min_t(unsigned int, ia->ri_id->device->attrs.max_send_sge, RPCRDMA_MAX_SEND_SGES); if (max_sge < RPCRDMA_MIN_SEND_SGES) { pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge); @@ -526,16 +523,16 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, init_waitqueue_head(&ep->rep_connect_wait); ep->rep_receive_count = 0; - sendcq = ib_alloc_cq(ia->ri_device, NULL, + sendcq = ib_alloc_cq(ia->ri_id->device, NULL, ep->rep_attr.cap.max_send_wr + 1, - ia->ri_device->num_comp_vectors > 1 ? 1 : 0, + ia->ri_id->device->num_comp_vectors > 1 ? 1 : 0, IB_POLL_WORKQUEUE); if (IS_ERR(sendcq)) { rc = PTR_ERR(sendcq); goto out1; } - recvcq = ib_alloc_cq(ia->ri_device, NULL, + recvcq = ib_alloc_cq(ia->ri_id->device, NULL, ep->rep_attr.cap.max_recv_wr + 1, 0, IB_POLL_WORKQUEUE); if (IS_ERR(recvcq)) { @@ -561,7 +558,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; ep->rep_remote_cma.responder_resources = - min_t(int, U8_MAX, ia->ri_device->attrs.max_qp_rd_atom); + min_t(int, U8_MAX, ia->ri_id->device->attrs.max_qp_rd_atom); /* Limit transport retries so client can detect server * GID changes quickly. RPC layer handles re-establishing @@ -673,7 +670,7 @@ rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, */ old = id; rc = -ENETUNREACH; - if (ia->ri_device != id->device) { + if (ia->ri_id->device != id->device) { pr_err("rpcrdma: can't reconnect on different device!\n"); goto out_destroy; } @@ -1296,7 +1293,7 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } @@ -1429,7 +1426,7 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags) bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_regbuf *rb) { - struct ib_device *device = r_xprt->rx_ia.ri_device; + struct ib_device *device = r_xprt->rx_ia.ri_id->device; if (rb->rg_direction == DMA_NONE) return false; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index f8563937c8c6..40912bb34b64 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -66,11 +66,8 @@ * Interface Adapter -- one per transport instance */ struct rpcrdma_ia { - struct ib_device *ri_device; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; - struct completion ri_done; - struct completion ri_remove_done; int ri_async_rc; unsigned int ri_max_segs; unsigned int ri_max_frwr_depth; @@ -80,6 +77,8 @@ struct rpcrdma_ia { bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; unsigned long ri_flags; + struct completion ri_done; + struct completion ri_remove_done; }; enum { @@ -585,7 +584,7 @@ rpcrdma_data_dir(bool writing) /* Memory registration calls xprtrdma/frwr_ops.c */ -bool frwr_is_supported(struct rpcrdma_ia *); +bool frwr_is_supported(struct ib_device *device); int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata); int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr); From 1f7d1c73c58a3d07a951ce23acfb4ec91a31d1e9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:40:09 -0400 Subject: [PATCH 45/69] SUNRPC: Update comments based on recent changes Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index bc1c8247750d..a9d40bc7ebed 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -955,7 +955,7 @@ xprt_is_pinned_rqst(struct rpc_rqst *req) * @req: Request to pin * * Caller must ensure this is atomic with the call to xprt_lookup_rqst() - * so should be holding the xprt receive lock. + * so should be holding xprt->queue_lock. */ void xprt_pin_rqst(struct rpc_rqst *req) { @@ -967,7 +967,7 @@ EXPORT_SYMBOL_GPL(xprt_pin_rqst); * xprt_unpin_rqst - Unpin a request on the transport receive list * @req: Request to pin * - * Caller should be holding the xprt receive lock. + * Caller should be holding xprt->queue_lock. */ void xprt_unpin_rqst(struct rpc_rqst *req) { From fd5951742dbc8c3695151e3f46b2fe2c4dac3559 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:40:15 -0400 Subject: [PATCH 46/69] xprtrdma: Remove rpcrdma_create_data_internal::rsize and wsize Clean up. xprt_rdma_max_inline_{read,write} cannot be set to large values by virtue of proc_dointvec_minmax. The current maximum is RPCRDMA_MAX_INLINE, which is much smaller than RPCRDMA_MAX_SEGS * PAGE_SIZE. The .rsize and .wsize fields are otherwise unused in the current code base, and thus can be removed. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 9 --------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 -- 2 files changed, 11 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8cf4fa36ed66..6b7c84166d13 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -350,17 +350,8 @@ xprt_setup_rdma(struct xprt_create *args) xprt_rdma_format_addresses(xprt, sap); cdata.max_requests = xprt_rdma_slot_table_entries; - - cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */ - cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */ - cdata.inline_wsize = xprt_rdma_max_inline_write; - if (cdata.inline_wsize > cdata.wsize) - cdata.inline_wsize = cdata.wsize; - cdata.inline_rsize = xprt_rdma_max_inline_read; - if (cdata.inline_rsize > cdata.rsize) - cdata.inline_rsize = cdata.rsize; /* * Create new transport instance, which includes initialized diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 40912bb34b64..d34371d0d0f8 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -419,8 +419,6 @@ enum { */ struct rpcrdma_create_data_internal { unsigned int max_requests; /* max requests (slots) in flight */ - unsigned int rsize; /* mount rsize - max read hdr+data */ - unsigned int wsize; /* mount wsize - max write hdr+data */ unsigned int inline_rsize; /* max non-rdma read data payload */ unsigned int inline_wsize; /* max non-rdma write data payload */ }; From 94087e978e9b645e07cc0fbdcf4140dda02f3d81 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:40:20 -0400 Subject: [PATCH 47/69] xprtrdma: Aggregate the inline settings in struct rpcrdma_ep Clean up. The inline settings are actually a characteristic of the endpoint, and not related to the device. They are also modified after the transport instance is created, so they do not belong in the cdata structure either. Lastly, let's use names that are more natural to RDMA than to NFS: inline_write -> inline_send and inline_read -> inline_recv. The /proc files retain their names to avoid breaking user space. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 6 +++--- net/sunrpc/xprtrdma/rpc_rdma.c | 32 +++++++++++++++++-------------- net/sunrpc/xprtrdma/transport.c | 4 +--- net/sunrpc/xprtrdma/verbs.c | 24 ++++++++++++----------- net/sunrpc/xprtrdma/xprt_rdma.h | 9 +++++---- 5 files changed, 40 insertions(+), 35 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index ae51ef6a897a..ce986591f213 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -44,10 +44,10 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; size_t maxmsg; - maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize); + maxmsg = min_t(unsigned int, ep->rep_inline_send, ep->rep_inline_recv); maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE); return maxmsg - RPCRDMA_HDRLEN_MIN; } @@ -184,7 +184,7 @@ static struct rpc_rqst *rpcrdma_bc_rqst_get(struct rpcrdma_xprt *r_xprt) if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS) return NULL; - size = min_t(size_t, r_xprt->rx_data.inline_rsize, PAGE_SIZE); + size = min_t(size_t, r_xprt->rx_ep.rep_inline_recv, PAGE_SIZE); req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); if (!req) return NULL; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 5cb060c87543..85115a2e2639 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -105,16 +105,23 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) return size; } +/** + * rpcrdma_set_max_header_sizes - Initialize inline payload sizes + * @r_xprt: transport instance to initialize + * + * The max_inline fields contain the maximum size of an RPC message + * so the marshaling code doesn't have to repeat this calculation + * for every RPC. + */ void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - unsigned int maxsegs = ia->ri_max_segs; + unsigned int maxsegs = r_xprt->rx_ia.ri_max_segs; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; - ia->ri_max_inline_write = cdata->inline_wsize - - rpcrdma_max_call_header_size(maxsegs); - ia->ri_max_inline_read = cdata->inline_rsize - - rpcrdma_max_reply_header_size(maxsegs); + ep->rep_max_inline_send = + ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs); + ep->rep_max_inline_recv = + ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs); } /* The client can send a request inline as long as the RPCRDMA header @@ -131,7 +138,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdr = &rqst->rq_snd_buf; unsigned int count, remaining, offset; - if (xdr->len > r_xprt->rx_ia.ri_max_inline_write) + if (xdr->len > r_xprt->rx_ep.rep_max_inline_send) return false; if (xdr->page_len) { @@ -159,9 +166,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - - return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; + return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv; } /* The client is required to provide a Reply chunk if the maximum @@ -173,10 +178,9 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, const struct rpc_rqst *rqst) { const struct xdr_buf *buf = &rqst->rq_rcv_buf; - const struct rpcrdma_ia *ia = &r_xprt->rx_ia; - return buf->head[0].iov_len + buf->tail[0].iov_len < - ia->ri_max_inline_read; + return (buf->head[0].iov_len + buf->tail[0].iov_len) < + r_xprt->rx_ep.rep_max_inline_recv; } /* Split @vec on page boundaries into SGEs. FMR registers pages, not diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 6b7c84166d13..b37a3e0f6728 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -70,7 +70,7 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; -static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; +unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR; int xprt_rdma_pad_optimize; @@ -350,8 +350,6 @@ xprt_setup_rdma(struct xprt_create *args) xprt_rdma_format_addresses(xprt, sap); cdata.max_requests = xprt_rdma_slot_table_entries; - cdata.inline_wsize = xprt_rdma_max_inline_write; - cdata.inline_rsize = xprt_rdma_max_inline_read; /* * Create new transport instance, which includes initialized diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 672993cee70d..9e24ca502430 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -188,7 +188,6 @@ static void rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, struct rdma_conn_param *param) { - struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; const struct rpcrdma_connect_private *pmsg = param->private_data; unsigned int rsize, wsize; @@ -205,12 +204,13 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); } - if (rsize < cdata->inline_rsize) - cdata->inline_rsize = rsize; - if (wsize < cdata->inline_wsize) - cdata->inline_wsize = wsize; - dprintk("RPC: %s: max send %u, max recv %u\n", - __func__, cdata->inline_wsize, cdata->inline_rsize); + if (rsize < r_xprt->rx_ep.rep_inline_recv) + r_xprt->rx_ep.rep_inline_recv = rsize; + if (wsize < r_xprt->rx_ep.rep_inline_send) + r_xprt->rx_ep.rep_inline_send = wsize; + dprintk("RPC: %s: max send %u, max recv %u\n", __func__, + r_xprt->rx_ep.rep_inline_send, + r_xprt->rx_ep.rep_inline_recv); rpcrdma_set_max_header_sizes(r_xprt); } @@ -488,6 +488,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, unsigned int max_sge; int rc; + ep->rep_inline_send = xprt_rdma_max_inline_write; + ep->rep_inline_recv = xprt_rdma_max_inline_read; + max_sge = min_t(unsigned int, ia->ri_id->device->attrs.max_send_sge, RPCRDMA_MAX_SEND_SGES); if (max_sge < RPCRDMA_MIN_SEND_SGES) { @@ -550,8 +553,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, pmsg->cp_magic = rpcrdma_cmp_magic; pmsg->cp_version = RPCRDMA_CMP_VERSION; pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK; - pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize); - pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize); + pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->rep_inline_send); + pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->rep_inline_recv); ep->rep_remote_cma.private_data = pmsg; ep->rep_remote_cma.private_data_len = sizeof(*pmsg); @@ -1045,7 +1048,6 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, static bool rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp) { - struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_rep *rep; @@ -1053,7 +1055,7 @@ static bool rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp) if (rep == NULL) goto out; - rep->rr_rdmabuf = rpcrdma_regbuf_alloc(cdata->inline_rsize, + rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.rep_inline_recv, DMA_FROM_DEVICE, GFP_KERNEL); if (!rep->rr_rdmabuf) goto out_free; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index d34371d0d0f8..edf602afb08a 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -71,8 +71,6 @@ struct rpcrdma_ia { int ri_async_rc; unsigned int ri_max_segs; unsigned int ri_max_frwr_depth; - unsigned int ri_max_inline_write; - unsigned int ri_max_inline_read; unsigned int ri_max_send_sges; bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; @@ -92,11 +90,15 @@ enum { struct rpcrdma_ep { unsigned int rep_send_count; unsigned int rep_send_batch; + unsigned int rep_max_inline_send; + unsigned int rep_max_inline_recv; int rep_connected; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; + unsigned int rep_inline_send; /* negotiated */ + unsigned int rep_inline_recv; /* negotiated */ int rep_receive_count; }; @@ -419,8 +421,6 @@ enum { */ struct rpcrdma_create_data_internal { unsigned int max_requests; /* max requests (slots) in flight */ - unsigned int inline_rsize; /* max non-rdma read data payload */ - unsigned int inline_wsize; /* max non-rdma write data payload */ }; /* @@ -631,6 +631,7 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) /* RPC/RDMA module init - xprtrdma/transport.c */ extern unsigned int xprt_rdma_max_inline_read; +extern unsigned int xprt_rdma_max_inline_write; void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); void xprt_rdma_free_addresses(struct rpc_xprt *xprt); void xprt_rdma_close(struct rpc_xprt *xprt); From 86c4ccd9b92ba6541fc4734e82f87139deea0470 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:40:25 -0400 Subject: [PATCH 48/69] xprtrdma: Eliminate struct rpcrdma_create_data_internal Clean up. Move the remaining field in rpcrdma_create_data_internal so the structure can be removed. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 21 +++++++-------- net/sunrpc/xprtrdma/transport.c | 27 +++---------------- net/sunrpc/xprtrdma/verbs.c | 47 ++++++++++++++++++++------------- net/sunrpc/xprtrdma/xprt_rdma.h | 22 ++++----------- 4 files changed, 46 insertions(+), 71 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 7cd27184ecd1..1d369b65e845 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -194,12 +194,11 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) * frwr_open - Prepare an endpoint for use with FRWR * @ia: interface adapter this endpoint will use * @ep: endpoint to prepare - * @cdata: transport parameters * * On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr - * cdata->max_requests + * ep->rep_max_requests * ia->ri_max_segs * * And these FRWR-related fields: @@ -208,8 +207,7 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) * * On failure, a negative errno is returned. */ -int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, - struct rpcrdma_create_data_internal *cdata) +int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep) { struct ib_device_attr *attrs = &ia->ri_id->device->attrs; int max_qp_wr, depth, delta; @@ -258,19 +256,18 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, max_qp_wr -= 1; if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) return -ENOMEM; - if (cdata->max_requests > max_qp_wr) - cdata->max_requests = max_qp_wr; - ep->rep_attr.cap.max_send_wr = cdata->max_requests * depth; + if (ep->rep_max_requests > max_qp_wr) + ep->rep_max_requests = max_qp_wr; + ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth; if (ep->rep_attr.cap.max_send_wr > max_qp_wr) { - cdata->max_requests = max_qp_wr / depth; - if (!cdata->max_requests) + ep->rep_max_requests = max_qp_wr / depth; + if (!ep->rep_max_requests) return -EINVAL; - ep->rep_attr.cap.max_send_wr = cdata->max_requests * - depth; + ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth; } ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ - ep->rep_attr.cap.max_recv_wr = cdata->max_requests; + ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests; ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index b37a3e0f6728..1f73a6a7e43c 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -68,7 +68,7 @@ * tunables */ -static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; +unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR; @@ -288,7 +288,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) cancel_delayed_work_sync(&r_xprt->rx_connect_worker); - rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_ep_destroy(r_xprt); rpcrdma_buffer_destroy(&r_xprt->rx_buf); rpcrdma_ia_close(&r_xprt->rx_ia); @@ -311,10 +311,8 @@ static const struct rpc_timeout xprt_rdma_default_timeout = { static struct rpc_xprt * xprt_setup_rdma(struct xprt_create *args) { - struct rpcrdma_create_data_internal cdata; struct rpc_xprt *xprt; struct rpcrdma_xprt *new_xprt; - struct rpcrdma_ep *new_ep; struct sockaddr *sap; int rc; @@ -349,29 +347,12 @@ xprt_setup_rdma(struct xprt_create *args) xprt_set_bound(xprt); xprt_rdma_format_addresses(xprt, sap); - cdata.max_requests = xprt_rdma_slot_table_entries; - - /* - * Create new transport instance, which includes initialized - * o ia - * o endpoint - * o buffers - */ - new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt); if (rc) goto out1; - /* - * initialize and create ep - */ - new_xprt->rx_data = cdata; - new_ep = &new_xprt->rx_ep; - - rc = rpcrdma_ep_create(&new_xprt->rx_ep, - &new_xprt->rx_ia, &new_xprt->rx_data); + rc = rpcrdma_ep_create(new_xprt); if (rc) goto out2; @@ -402,7 +383,7 @@ xprt_setup_rdma(struct xprt_create *args) rpcrdma_buffer_destroy(&new_xprt->rx_buf); rc = -ENODEV; out3: - rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); + rpcrdma_ep_destroy(new_xprt); out2: rpcrdma_ia_close(&new_xprt->rx_ia); out1: diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 9e24ca502430..0d0c3356f34e 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -476,18 +476,22 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) ia->ri_pd = NULL; } -/* - * Create unconnected endpoint. +/** + * rpcrdma_ep_create - Create unconnected endpoint + * @r_xprt: transport to instantiate + * + * Returns zero on success, or a negative errno. */ -int -rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, - struct rpcrdma_create_data_internal *cdata) +int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) { + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; struct ib_cq *sendcq, *recvcq; unsigned int max_sge; int rc; + ep->rep_max_requests = xprt_rdma_slot_table_entries; ep->rep_inline_send = xprt_rdma_max_inline_write; ep->rep_inline_recv = xprt_rdma_max_inline_read; @@ -499,7 +503,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, } ia->ri_max_send_sges = max_sge; - rc = frwr_open(ia, ep, cdata); + rc = frwr_open(ia, ep); if (rc) return rc; @@ -521,7 +525,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.cap.max_send_sge, ep->rep_attr.cap.max_recv_sge); - ep->rep_send_batch = cdata->max_requests >> 3; + ep->rep_send_batch = ep->rep_max_requests >> 3; ep->rep_send_count = ep->rep_send_batch; init_waitqueue_head(&ep->rep_connect_wait); ep->rep_receive_count = 0; @@ -584,16 +588,16 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, return rc; } -/* - * rpcrdma_ep_destroy +/** + * rpcrdma_ep_destroy - Disconnect and destroy endpoint. + * @r_xprt: transport instance to shut down * - * Disconnect and destroy endpoint. After this, the only - * valid operations on the ep are to free it (if dynamically - * allocated) or re-create it. */ -void -rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) { + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + if (ia->ri_id && ia->ri_id->qp) { rpcrdma_ep_disconnect(ep, ia); rdma_destroy_qp(ia->ri_id); @@ -623,7 +627,7 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, goto out1; rc = -ENOMEM; - err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data); + err = rpcrdma_ep_create(r_xprt); if (err) { pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err); goto out2; @@ -640,7 +644,7 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, return 0; out3: - rpcrdma_ep_destroy(ep, ia); + rpcrdma_ep_destroy(r_xprt); out2: rpcrdma_ia_close(ia); out1: @@ -1082,14 +1086,19 @@ static bool rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp) return false; } -int -rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) +/** + * rpcrdma_buffer_create - Create initial set of req/rep objects + * @r_xprt: transport instance to (re)initialize + * + * Returns zero on success, otherwise a negative errno. + */ +int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; int i, rc; buf->rb_flags = 0; - buf->rb_max_requests = r_xprt->rx_data.max_requests; + buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests; buf->rb_bc_srv_max_requests = 0; spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index edf602afb08a..9e98ee0cd937 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -97,6 +97,7 @@ struct rpcrdma_ep { wait_queue_head_t rep_connect_wait; struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; + unsigned int rep_max_requests; /* set by /proc */ unsigned int rep_inline_send; /* negotiated */ unsigned int rep_inline_recv; /* negotiated */ int rep_receive_count; @@ -413,16 +414,6 @@ enum { RPCRDMA_BUF_F_EMPTY_SCQ = 0, }; -/* - * Internal structure for transport instance creation. This - * exists primarily for modularity. - * - * This data should be set with mount options - */ -struct rpcrdma_create_data_internal { - unsigned int max_requests; /* max requests (slots) in flight */ -}; - /* * Statistics for RPCRDMA */ @@ -467,13 +458,11 @@ struct rpcrdma_xprt { struct rpcrdma_ia rx_ia; struct rpcrdma_ep rx_ep; struct rpcrdma_buffer rx_buf; - struct rpcrdma_create_data_internal rx_data; struct delayed_work rx_connect_worker; struct rpcrdma_stats rx_stats; }; #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt) -#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) static inline const char * rpcrdma_addrstr(const struct rpcrdma_xprt *r_xprt) @@ -507,9 +496,8 @@ void rpcrdma_ia_close(struct rpcrdma_ia *); /* * Endpoint calls - xprtrdma/verbs.c */ -int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, - struct rpcrdma_create_data_internal *); -void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); +int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt); +void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); @@ -583,8 +571,7 @@ rpcrdma_data_dir(bool writing) /* Memory registration calls xprtrdma/frwr_ops.c */ bool frwr_is_supported(struct ib_device *device); -int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, - struct rpcrdma_create_data_internal *cdata); +int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep); int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr); void frwr_release_mr(struct rpcrdma_mr *mr); size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt); @@ -630,6 +617,7 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) /* RPC/RDMA module init - xprtrdma/transport.c */ +extern unsigned int xprt_rdma_slot_table_entries; extern unsigned int xprt_rdma_max_inline_read; extern unsigned int xprt_rdma_max_inline_write; void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); From 5f2311f5bd359d6d810922bf25c238053a449f2a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:40:31 -0400 Subject: [PATCH 49/69] xprtrdma: Remove pr_err() call sites from completion handlers Clean up: rely on the trace points instead. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 23 ++++------------------- net/sunrpc/xprtrdma/verbs.c | 9 --------- 2 files changed, 4 insertions(+), 28 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 1d369b65e845..794ba4ca0994 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -297,15 +297,6 @@ size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt) (ia->ri_max_segs - 2) * ia->ri_max_frwr_depth); } -static void -__frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr) -{ - if (wc->status != IB_WC_WR_FLUSH_ERR) - pr_err("rpcrdma: %s: %s (%u/0x%x)\n", - wr, ib_wc_status_msg(wc->status), - wc->status, wc->vendor_err); -} - /** * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC * @cq: completion queue (ignored) @@ -320,10 +311,8 @@ frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) container_of(cqe, struct rpcrdma_frwr, fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - if (wc->status != IB_WC_SUCCESS) { + if (wc->status != IB_WC_SUCCESS) frwr->fr_state = FRWR_FLUSHED_FR; - __frwr_sendcompletion_flush(wc, "fastreg"); - } trace_xprtrdma_wc_fastreg(wc, frwr); } @@ -341,10 +330,8 @@ frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - if (wc->status != IB_WC_SUCCESS) { + if (wc->status != IB_WC_SUCCESS) frwr->fr_state = FRWR_FLUSHED_LI; - __frwr_sendcompletion_flush(wc, "localinv"); - } trace_xprtrdma_wc_li(wc, frwr); } @@ -363,12 +350,10 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - if (wc->status != IB_WC_SUCCESS) { + if (wc->status != IB_WC_SUCCESS) frwr->fr_state = FRWR_FLUSHED_LI; - __frwr_sendcompletion_flush(wc, "localinv"); - } - complete(&frwr->fr_linv_done); trace_xprtrdma_wc_li_wake(wc, frwr); + complete(&frwr->fr_linv_done); } /** diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 0d0c3356f34e..fcbcd4afaa5a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -135,11 +135,6 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_send(sc, wc); - if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) - pr_err("rpcrdma: Send: %s (%u/0x%x)\n", - ib_wc_status_msg(wc->status), - wc->status, wc->vendor_err); - rpcrdma_sendctx_put_locked(sc); } @@ -177,10 +172,6 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) return; out_flushed: - if (wc->status != IB_WC_WR_FLUSH_ERR) - pr_err("rpcrdma: Recv: %s (%u/0x%x)\n", - ib_wc_status_msg(wc->status), - wc->status, wc->vendor_err); rpcrdma_recv_buffer_put(rep); } From b8fe677fd059deadb2f7f71c4dea747be84d75e0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:40:36 -0400 Subject: [PATCH 50/69] xprtrdma: Update comments that reference ib_drain_qp Commit e1ede312f17e ("xprtrdma: Fix helper that drains the transport") replaced the ib_drain_qp() call, so update documenting comments to reflect current operation. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index fcbcd4afaa5a..bef5eac8ab38 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -83,7 +83,9 @@ static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb); static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); -/* Wait for outstanding transport work to finish. +/* Wait for outstanding transport work to finish. ib_drain_qp + * handles the drains in the wrong order for us, so open code + * them here. */ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) { @@ -792,8 +794,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) */ /* rpcrdma_sendctxs_destroy() assumes caller has already quiesced - * queue activity, and ib_drain_qp has flushed all remaining Send - * requests. + * queue activity, and rpcrdma_xprt_drain has flushed all remaining + * Send requests. */ static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf) { @@ -1194,7 +1196,7 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) * rpcrdma_buffer_destroy - Release all hw resources * @buf: root control block for resources * - * ORDERING: relies on a prior ib_drain_qp : + * ORDERING: relies on a prior rpcrdma_xprt_drain : * - No more Send or Receive completions can occur * - All MRs, reps, and reqs are returned to their free lists */ From 2cfd11f16f01c0ee8f83bb07027c9d2f43565473 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 24 Apr 2019 09:40:41 -0400 Subject: [PATCH 51/69] xprtrdma: Remove stale comment The comment hasn't been accurate for several years. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/xprt_rdma.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9e98ee0cd937..d1e0749bcbc4 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -239,13 +239,6 @@ struct rpcrdma_sendctx { * * An external memory region is any buffer or page that is registered * on the fly (ie, not pre-registered). - * - * Each rpcrdma_buffer has a list of free MWs anchored in rb_mrs. During - * call_allocate, rpcrdma_buffer_get() assigns one to each segment in - * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep - * track of registration metadata while each RPC is pending. - * rpcrdma_deregister_external() uses this metadata to unmap and - * release these resources when an RPC is complete. */ enum rpcrdma_frwr_state { FRWR_IS_INVALID, /* ready to be used */ From 79caa5fad47c69874f9efc4ac3128cc3f6d36f6e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 Apr 2019 17:46:42 -0400 Subject: [PATCH 52/69] SUNRPC: Cache cred of process creating the rpc_client When converting kuids to AUTH_UNIX creds, etc we will want to use the same user namespace as the process that created the rpc client. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/lockd/host.c | 1 + fs/lockd/mon.c | 1 + fs/nfs/client.c | 1 + fs/nfs/mount_clnt.c | 2 ++ fs/nfsd/nfs4callback.c | 1 + include/linux/sunrpc/clnt.h | 2 ++ net/sunrpc/clnt.c | 7 +++++++ net/sunrpc/rpcb_clnt.c | 9 +++++++-- 8 files changed, 22 insertions(+), 2 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index f0b5c987d6ae..d46081123f7c 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -458,6 +458,7 @@ nlm_bind_host(struct nlm_host *host) .authflavor = RPC_AUTH_UNIX, .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_AUTOBIND), + .cred = current_cred(), }; /* diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 654594ef4f94..1eabd91870e6 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -82,6 +82,7 @@ static struct rpc_clnt *nsm_create(struct net *net, const char *nodename) .version = NSM_VERSION, .authflavor = RPC_AUTH_NULL, .flags = RPC_CLNT_CREATE_NOPING, + .cred = current_cred(), }; return rpc_create(&args); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f74638c5e5b4..a843cf3f6340 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -500,6 +500,7 @@ int nfs_create_rpc_client(struct nfs_client *clp, .program = &nfs_program, .version = clp->rpc_ops->version, .authflavor = flavor, + .cred = current_cred(), }; if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index d979ff4fee7e..cb7c10e9721e 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -163,6 +163,7 @@ int nfs_mount(struct nfs_mount_request *info) .program = &mnt_program, .version = info->version, .authflavor = RPC_AUTH_UNIX, + .cred = current_cred(), }; struct rpc_clnt *mnt_clnt; int status; @@ -249,6 +250,7 @@ void nfs_umount(const struct nfs_mount_request *info) .version = info->version, .authflavor = RPC_AUTH_UNIX, .flags = RPC_CLNT_CREATE_NOPING, + .cred = current_cred(), }; struct rpc_message msg = { .rpc_argp = info->dirpath, diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index f7494be8dbe2..3a10399a0ef1 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -868,6 +868,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c .program = &cb_program, .version = 1, .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), + .cred = current_cred(), }; struct rpc_clnt *client; const struct cred *cred; diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 943762acfcd4..6e8073140a5d 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -72,6 +72,7 @@ struct rpc_clnt { struct dentry *cl_debugfs; /* debugfs directory */ #endif struct rpc_xprt_iter cl_xpi; + const struct cred *cl_cred; }; /* @@ -126,6 +127,7 @@ struct rpc_create_args { unsigned long flags; char *client_name; struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ + const struct cred *cred; }; struct rpc_add_xprt_test { diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index e933f1185317..369a2648dafc 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -394,6 +394,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, if (err) goto out_no_clid; + clnt->cl_cred = get_cred(args->cred); clnt->cl_procinfo = version->procs; clnt->cl_maxproc = version->nrprocs; clnt->cl_prog = args->prognumber ? : program->number; @@ -439,6 +440,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, out_no_path: rpc_free_iostats(clnt->cl_metrics); out_no_stats: + put_cred(clnt->cl_cred); rpc_free_clid(clnt); out_no_clid: kfree(clnt); @@ -631,6 +633,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, new->cl_discrtry = clnt->cl_discrtry; new->cl_chatty = clnt->cl_chatty; new->cl_principal = clnt->cl_principal; + new->cl_cred = get_cred(clnt->cl_cred); return new; out_err: @@ -652,6 +655,7 @@ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt) .prognumber = clnt->cl_prog, .version = clnt->cl_vers, .authflavor = clnt->cl_auth->au_flavor, + .cred = clnt->cl_cred, }; return __rpc_clone_client(&args, clnt); } @@ -673,6 +677,7 @@ rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor) .prognumber = clnt->cl_prog, .version = clnt->cl_vers, .authflavor = flavor, + .cred = clnt->cl_cred, }; return __rpc_clone_client(&args, clnt); } @@ -880,6 +885,7 @@ rpc_free_client(struct rpc_clnt *clnt) xprt_put(rcu_dereference_raw(clnt->cl_xprt)); xprt_iter_destroy(&clnt->cl_xpi); rpciod_down(); + put_cred(clnt->cl_cred); rpc_free_clid(clnt); kfree(clnt); return parent; @@ -944,6 +950,7 @@ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old, .prognumber = program->number, .version = vers, .authflavor = old->cl_auth->au_flavor, + .cred = old->cl_cred, }; struct rpc_clnt *clnt; int err; diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 18b0cf2a923f..2277b7cdad27 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -240,6 +240,7 @@ static int rpcb_create_local_unix(struct net *net) .program = &rpcb_program, .version = RPCBVERS_2, .authflavor = RPC_AUTH_NULL, + .cred = current_cred(), /* * We turn off the idle timeout to prevent the kernel * from automatically disconnecting the socket. @@ -299,6 +300,7 @@ static int rpcb_create_local_net(struct net *net) .program = &rpcb_program, .version = RPCBVERS_2, .authflavor = RPC_AUTH_UNIX, + .cred = current_cred(), .flags = RPC_CLNT_CREATE_NOPING, }; struct rpc_clnt *clnt, *clnt4; @@ -358,7 +360,8 @@ int rpcb_create_local(struct net *net) static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename, const char *hostname, struct sockaddr *srvaddr, size_t salen, - int proto, u32 version) + int proto, u32 version, + const struct cred *cred) { struct rpc_create_args args = { .net = net, @@ -370,6 +373,7 @@ static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename, .program = &rpcb_program, .version = version, .authflavor = RPC_AUTH_UNIX, + .cred = cred, .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_NONPRIVPORT), }; @@ -745,7 +749,8 @@ void rpcb_getport_async(struct rpc_task *task) rpcb_clnt = rpcb_create(xprt->xprt_net, clnt->cl_nodename, xprt->servername, sap, salen, - xprt->prot, bind_version); + xprt->prot, bind_version, + clnt->cl_cred); if (IS_ERR(rpcb_clnt)) { status = PTR_ERR(rpcb_clnt); dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n", From 1a58e8a0e5c1f188a80eb9e505bc77d78a31a4ec Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 Apr 2019 17:46:43 -0400 Subject: [PATCH 53/69] NFS: Store the credential of the mount process in the nfs_server Store the credential of the mount process so that we can determine information such as the user namespace. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 8 +++++++- fs/nfs/internal.h | 1 + fs/nfs/nfs3client.c | 1 + fs/nfs/nfs4client.c | 6 ++++++ include/linux/nfs_fs_sb.h | 3 +++ 5 files changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index a843cf3f6340..e3baa9f1da76 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -500,7 +500,7 @@ int nfs_create_rpc_client(struct nfs_client *clp, .program = &nfs_program, .version = clp->rpc_ops->version, .authflavor = flavor, - .cred = current_cred(), + .cred = cl_init->cred, }; if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) @@ -655,6 +655,7 @@ static int nfs_init_server(struct nfs_server *server, .proto = data->nfs_server.protocol, .net = data->net, .timeparms = &timeparms, + .cred = server->cred, }; struct nfs_client *clp; int error; @@ -923,6 +924,7 @@ void nfs_free_server(struct nfs_server *server) ida_destroy(&server->lockowner_id); ida_destroy(&server->openowner_id); nfs_free_iostats(server->io_stats); + put_cred(server->cred); kfree(server); nfs_release_automount_timer(); } @@ -943,6 +945,8 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info, if (!server) return ERR_PTR(-ENOMEM); + server->cred = get_cred(current_cred()); + error = -ENOMEM; fattr = nfs_alloc_fattr(); if (fattr == NULL) @@ -1009,6 +1013,8 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, if (!server) return ERR_PTR(-ENOMEM); + server->cred = get_cred(source->cred); + error = -ENOMEM; fattr_fsinfo = nfs_alloc_fattr(); if (fattr_fsinfo == NULL) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 196534634c3a..22232e76df47 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -84,6 +84,7 @@ struct nfs_client_initdata { u32 minorversion; struct net *net; const struct rpc_timeout *timeparms; + const struct cred *cred; }; /* diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 7879f2a0fcfd..1afdb0f7473f 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -91,6 +91,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, .proto = ds_proto, .net = mds_clp->cl_net, .timeparms = &ds_timeout, + .cred = mds_srv->cred, }; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 1339ede979af..3ce246346f02 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -870,6 +870,7 @@ static int nfs4_set_client(struct nfs_server *server, .minorversion = minorversion, .net = net, .timeparms = timeparms, + .cred = server->cred, }; struct nfs_client *clp; @@ -931,6 +932,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, .minorversion = minor_version, .net = mds_clp->cl_net, .timeparms = &ds_timeout, + .cred = mds_srv->cred, }; char buf[INET6_ADDRSTRLEN + 1]; @@ -1107,6 +1109,8 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, if (!server) return ERR_PTR(-ENOMEM); + server->cred = get_cred(current_cred()); + auth_probe = mount_info->parsed->auth_info.flavor_len < 1; /* set up the general RPC client */ @@ -1143,6 +1147,8 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, parent_server = NFS_SB(data->sb); parent_client = parent_server->nfs_client; + server->cred = get_cred(parent_server->cred); + /* Initialise the client representation from the parent server */ nfs_server_copy_userdata(server, parent_server); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 0fbc5d3c5e53..1e78032a174b 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -241,6 +241,9 @@ struct nfs_server { /* XDR related information */ unsigned int read_hdrsize; + + /* User namespace info */ + const struct cred *cred; }; /* Server capabilities */ From 283ebe3ec4157c5cdc2581ed7e5c3764137f8fe5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 Apr 2019 17:46:44 -0400 Subject: [PATCH 54/69] SUNRPC: Use the client user namespace when encoding creds When encoding AUTH_UNIX creds and AUTH_GSS upcalls, use the user namespace of the process that created the rpc client. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/auth_gss/auth_gss.c | 11 ++++++++--- net/sunrpc/auth_unix.c | 9 +++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index c055edfec55e..5dc2aef2232a 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -412,7 +412,10 @@ gss_upcall_callback(struct rpc_task *task) static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg) { - uid_t uid = from_kuid(&init_user_ns, gss_msg->uid); + struct user_namespace *userns = gss_msg->auth->client->cl_cred ? + gss_msg->auth->client->cl_cred->user_ns : &init_user_ns; + + uid_t uid = from_kuid_munged(userns, gss_msg->uid); memcpy(gss_msg->databuf, &uid, sizeof(uid)); gss_msg->msg.data = gss_msg->databuf; gss_msg->msg.len = sizeof(uid); @@ -424,13 +427,15 @@ static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, const char *service_name, const char *target_name) { + struct user_namespace *userns = gss_msg->auth->client->cl_cred ? + gss_msg->auth->client->cl_cred->user_ns : &init_user_ns; struct gss_api_mech *mech = gss_msg->auth->mech; char *p = gss_msg->databuf; size_t buflen = sizeof(gss_msg->databuf); int len; len = scnprintf(p, buflen, "mech=%s uid=%d", mech->gm_name, - from_kuid(&init_user_ns, gss_msg->uid)); + from_kuid_munged(userns, gss_msg->uid)); buflen -= len; p += len; gss_msg->msg.len = len; @@ -706,7 +711,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) goto err; } - uid = make_kuid(&init_user_ns, id); + uid = make_kuid(current_user_ns(), id); if (!uid_valid(uid)) { err = -EINVAL; goto err; diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index d4018e5a24c5..e7df1f782b2e 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -107,6 +107,8 @@ unx_marshal(struct rpc_task *task, struct xdr_stream *xdr) __be32 *p, *cred_len, *gidarr_len; int i; struct group_info *gi = cred->cr_cred->group_info; + struct user_namespace *userns = clnt->cl_cred ? + clnt->cl_cred->user_ns : &init_user_ns; /* Credential */ @@ -122,14 +124,13 @@ unx_marshal(struct rpc_task *task, struct xdr_stream *xdr) p = xdr_reserve_space(xdr, 3 * sizeof(*p)); if (!p) goto marshal_failed; - *p++ = cpu_to_be32(from_kuid(&init_user_ns, cred->cr_cred->fsuid)); - *p++ = cpu_to_be32(from_kgid(&init_user_ns, cred->cr_cred->fsgid)); + *p++ = cpu_to_be32(from_kuid_munged(userns, cred->cr_cred->fsuid)); + *p++ = cpu_to_be32(from_kgid_munged(userns, cred->cr_cred->fsgid)); gidarr_len = p++; if (gi) for (i = 0; i < UNX_NGROUPS && i < gi->ngroups; i++) - *p++ = cpu_to_be32(from_kgid(&init_user_ns, - gi->gid[i])); + *p++ = cpu_to_be32(from_kgid_munged(userns, gi->gid[i])); *gidarr_len = cpu_to_be32(p - gidarr_len - 1); *cred_len = cpu_to_be32((p - cred_len - 1) << 2); p = xdr_reserve_space(xdr, (p - gidarr_len - 1) << 2); From ac83228a7101e655ba5a7fa61ae10b058ada15db Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 Apr 2019 17:46:45 -0400 Subject: [PATCH 55/69] SUNRPC: Use namespace of listening daemon in the client AUTH_GSS upcall When the client needs to talk to rpc.gssd, we should ensure that the uid argument is encoded to match the user namespace of the daemon. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/auth_gss/auth_gss.c | 60 +++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 5dc2aef2232a..b2cbc83d39c7 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -269,6 +269,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct struct gss_upcall_msg { refcount_t count; kuid_t uid; + const char *service_name; struct rpc_pipe_msg msg; struct list_head list; struct gss_auth *auth; @@ -316,6 +317,7 @@ gss_release_msg(struct gss_upcall_msg *gss_msg) gss_put_ctx(gss_msg->ctx); rpc_destroy_wait_queue(&gss_msg->rpc_waitqueue); gss_put_auth(gss_msg->auth); + kfree_const(gss_msg->service_name); kfree(gss_msg); } @@ -410,10 +412,10 @@ gss_upcall_callback(struct rpc_task *task) gss_release_msg(gss_msg); } -static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg) +static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg, + const struct cred *cred) { - struct user_namespace *userns = gss_msg->auth->client->cl_cred ? - gss_msg->auth->client->cl_cred->user_ns : &init_user_ns; + struct user_namespace *userns = cred->user_ns; uid_t uid = from_kuid_munged(userns, gss_msg->uid); memcpy(gss_msg->databuf, &uid, sizeof(uid)); @@ -423,12 +425,24 @@ static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg) BUILD_BUG_ON(sizeof(uid) > sizeof(gss_msg->databuf)); } +static ssize_t +gss_v0_upcall(struct file *file, struct rpc_pipe_msg *msg, + char __user *buf, size_t buflen) +{ + struct gss_upcall_msg *gss_msg = container_of(msg, + struct gss_upcall_msg, + msg); + if (msg->copied == 0) + gss_encode_v0_msg(gss_msg, file->f_cred); + return rpc_pipe_generic_upcall(file, msg, buf, buflen); +} + static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, const char *service_name, - const char *target_name) + const char *target_name, + const struct cred *cred) { - struct user_namespace *userns = gss_msg->auth->client->cl_cred ? - gss_msg->auth->client->cl_cred->user_ns : &init_user_ns; + struct user_namespace *userns = cred->user_ns; struct gss_api_mech *mech = gss_msg->auth->mech; char *p = gss_msg->databuf; size_t buflen = sizeof(gss_msg->databuf); @@ -496,6 +510,25 @@ static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, return -ENOMEM; } +static ssize_t +gss_v1_upcall(struct file *file, struct rpc_pipe_msg *msg, + char __user *buf, size_t buflen) +{ + struct gss_upcall_msg *gss_msg = container_of(msg, + struct gss_upcall_msg, + msg); + int err; + if (msg->copied == 0) { + err = gss_encode_v1_msg(gss_msg, + gss_msg->service_name, + gss_msg->auth->target_name, + file->f_cred); + if (err) + return err; + } + return rpc_pipe_generic_upcall(file, msg, buf, buflen); +} + static struct gss_upcall_msg * gss_alloc_msg(struct gss_auth *gss_auth, kuid_t uid, const char *service_name) @@ -518,16 +551,11 @@ gss_alloc_msg(struct gss_auth *gss_auth, refcount_set(&gss_msg->count, 1); gss_msg->uid = uid; gss_msg->auth = gss_auth; - switch (vers) { - case 0: - gss_encode_v0_msg(gss_msg); - break; - default: - err = gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name); - if (err) + if (service_name) { + gss_msg->service_name = kstrdup_const(service_name, GFP_NOFS); + if (!gss_msg->service_name) goto err_put_pipe_version; } - kref_get(&gss_auth->kref); return gss_msg; err_put_pipe_version: put_pipe_version(gss_auth->net); @@ -2120,7 +2148,7 @@ static const struct rpc_credops gss_nullops = { }; static const struct rpc_pipe_ops gss_upcall_ops_v0 = { - .upcall = rpc_pipe_generic_upcall, + .upcall = gss_v0_upcall, .downcall = gss_pipe_downcall, .destroy_msg = gss_pipe_destroy_msg, .open_pipe = gss_pipe_open_v0, @@ -2128,7 +2156,7 @@ static const struct rpc_pipe_ops gss_upcall_ops_v0 = { }; static const struct rpc_pipe_ops gss_upcall_ops_v1 = { - .upcall = rpc_pipe_generic_upcall, + .upcall = gss_v1_upcall, .downcall = gss_pipe_downcall, .destroy_msg = gss_pipe_destroy_msg, .open_pipe = gss_pipe_open_v1, From 264d948ce7d0b99cbe1aae704bbef8d951602eec Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 Apr 2019 17:46:46 -0400 Subject: [PATCH 56/69] NFS: Convert NFSv3 to use the container user namespace When mapping NFS identities, we want to substitute for the uids and gids on the wire as we would for the AUTH_UNIX creds. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs3xdr.c | 142 ++++++++++++++++++++++++++++------------------- 1 file changed, 86 insertions(+), 56 deletions(-) diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 110358f4986d..abbbdde97e31 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -104,6 +104,20 @@ static const umode_t nfs_type2fmt[] = { [NF3FIFO] = S_IFIFO, }; +static struct user_namespace *rpc_userns(const struct rpc_clnt *clnt) +{ + if (clnt && clnt->cl_cred) + return clnt->cl_cred->user_ns; + return &init_user_ns; +} + +static struct user_namespace *rpc_rqst_userns(const struct rpc_rqst *rqstp) +{ + if (rqstp->rq_task) + return rpc_userns(rqstp->rq_task->tk_client); + return &init_user_ns; +} + /* * Encode/decode NFSv3 basic data types * @@ -516,7 +530,8 @@ static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) * set_mtime mtime; * }; */ -static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) +static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr, + struct user_namespace *userns) { struct timespec ts; u32 nbytes; @@ -551,13 +566,13 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) if (attr->ia_valid & ATTR_UID) { *p++ = xdr_one; - *p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid)); + *p++ = cpu_to_be32(from_kuid_munged(userns, attr->ia_uid)); } else *p++ = xdr_zero; if (attr->ia_valid & ATTR_GID) { *p++ = xdr_one; - *p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid)); + *p++ = cpu_to_be32(from_kgid_munged(userns, attr->ia_gid)); } else *p++ = xdr_zero; @@ -606,7 +621,8 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) * nfstime3 ctime; * }; */ -static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) +static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct user_namespace *userns) { umode_t fmode; __be32 *p; @@ -619,10 +635,10 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode; fattr->nlink = be32_to_cpup(p++); - fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); + fattr->uid = make_kuid(userns, be32_to_cpup(p++)); if (!uid_valid(fattr->uid)) goto out_uid; - fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); + fattr->gid = make_kgid(userns, be32_to_cpup(p++)); if (!gid_valid(fattr->gid)) goto out_gid; @@ -659,7 +675,8 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) * void; * }; */ -static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct user_namespace *userns) { __be32 *p; @@ -667,7 +684,7 @@ static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) if (unlikely(!p)) return -EIO; if (*p != xdr_zero) - return decode_fattr3(xdr, fattr); + return decode_fattr3(xdr, fattr, userns); return 0; } @@ -728,14 +745,15 @@ static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) return 0; } -static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr) +static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct user_namespace *userns) { int error; error = decode_pre_op_attr(xdr, fattr); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, fattr); + error = decode_post_op_attr(xdr, fattr, userns); out: return error; } @@ -837,7 +855,7 @@ static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req, { const struct nfs3_sattrargs *args = data; encode_nfs_fh3(xdr, args->fh); - encode_sattr3(xdr, args->sattr); + encode_sattr3(xdr, args->sattr, rpc_rqst_userns(req)); encode_sattrguard3(xdr, args); } @@ -998,13 +1016,14 @@ static void nfs3_xdr_enc_write3args(struct rpc_rqst *req, * }; */ static void encode_createhow3(struct xdr_stream *xdr, - const struct nfs3_createargs *args) + const struct nfs3_createargs *args, + struct user_namespace *userns) { encode_uint32(xdr, args->createmode); switch (args->createmode) { case NFS3_CREATE_UNCHECKED: case NFS3_CREATE_GUARDED: - encode_sattr3(xdr, args->sattr); + encode_sattr3(xdr, args->sattr, userns); break; case NFS3_CREATE_EXCLUSIVE: encode_createverf3(xdr, args->verifier); @@ -1021,7 +1040,7 @@ static void nfs3_xdr_enc_create3args(struct rpc_rqst *req, const struct nfs3_createargs *args = data; encode_diropargs3(xdr, args->fh, args->name, args->len); - encode_createhow3(xdr, args); + encode_createhow3(xdr, args, rpc_rqst_userns(req)); } /* @@ -1039,7 +1058,7 @@ static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req, const struct nfs3_mkdirargs *args = data; encode_diropargs3(xdr, args->fh, args->name, args->len); - encode_sattr3(xdr, args->sattr); + encode_sattr3(xdr, args->sattr, rpc_rqst_userns(req)); } /* @@ -1056,11 +1075,12 @@ static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req, * }; */ static void encode_symlinkdata3(struct xdr_stream *xdr, - const void *data) + const void *data, + struct user_namespace *userns) { const struct nfs3_symlinkargs *args = data; - encode_sattr3(xdr, args->sattr); + encode_sattr3(xdr, args->sattr, userns); encode_nfspath3(xdr, args->pages, args->pathlen); } @@ -1071,7 +1091,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, const struct nfs3_symlinkargs *args = data; encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); - encode_symlinkdata3(xdr, args); + encode_symlinkdata3(xdr, args, rpc_rqst_userns(req)); xdr->buf->flags |= XDRBUF_WRITE; } @@ -1100,24 +1120,26 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, * }; */ static void encode_devicedata3(struct xdr_stream *xdr, - const struct nfs3_mknodargs *args) + const struct nfs3_mknodargs *args, + struct user_namespace *userns) { - encode_sattr3(xdr, args->sattr); + encode_sattr3(xdr, args->sattr, userns); encode_specdata3(xdr, args->rdev); } static void encode_mknoddata3(struct xdr_stream *xdr, - const struct nfs3_mknodargs *args) + const struct nfs3_mknodargs *args, + struct user_namespace *userns) { encode_ftype3(xdr, args->type); switch (args->type) { case NF3CHR: case NF3BLK: - encode_devicedata3(xdr, args); + encode_devicedata3(xdr, args, userns); break; case NF3SOCK: case NF3FIFO: - encode_sattr3(xdr, args->sattr); + encode_sattr3(xdr, args->sattr, userns); break; case NF3REG: case NF3DIR: @@ -1134,7 +1156,7 @@ static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req, const struct nfs3_mknodargs *args = data; encode_diropargs3(xdr, args->fh, args->name, args->len); - encode_mknoddata3(xdr, args); + encode_mknoddata3(xdr, args, rpc_rqst_userns(req)); } /* @@ -1379,7 +1401,7 @@ static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req, goto out; if (status != NFS3_OK) goto out_default; - error = decode_fattr3(xdr, result); + error = decode_fattr3(xdr, result, rpc_rqst_userns(req)); out: return error; out_default: @@ -1414,7 +1436,7 @@ static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_wcc_data(xdr, result); + error = decode_wcc_data(xdr, result, rpc_rqst_userns(req)); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -1449,6 +1471,7 @@ static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { + struct user_namespace *userns = rpc_rqst_userns(req); struct nfs3_diropres *result = data; enum nfs_stat status; int error; @@ -1461,14 +1484,14 @@ static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req, error = decode_nfs_fh3(xdr, result->fh); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, userns); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->dir_attr); + error = decode_post_op_attr(xdr, result->dir_attr, userns); out: return error; out_default: - error = decode_post_op_attr(xdr, result->dir_attr); + error = decode_post_op_attr(xdr, result->dir_attr, userns); if (unlikely(error)) goto out; return nfs3_stat_to_errno(status); @@ -1504,7 +1527,7 @@ static int nfs3_xdr_dec_access3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -1545,7 +1568,7 @@ static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result); + error = decode_post_op_attr(xdr, result, rpc_rqst_userns(req)); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -1623,7 +1646,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; result->op_status = status; @@ -1694,7 +1717,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_wcc_data(xdr, result->fattr); + error = decode_wcc_data(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; result->op_status = status; @@ -1728,14 +1751,15 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, * }; */ static int decode_create3resok(struct xdr_stream *xdr, - struct nfs3_diropres *result) + struct nfs3_diropres *result, + struct user_namespace *userns) { int error; error = decode_post_op_fh3(xdr, result->fh); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, userns); if (unlikely(error)) goto out; /* The server isn't required to return a file handle. @@ -1744,7 +1768,7 @@ static int decode_create3resok(struct xdr_stream *xdr, * values for the new object. */ if (result->fh->size == 0) result->fattr->valid = 0; - error = decode_wcc_data(xdr, result->dir_attr); + error = decode_wcc_data(xdr, result->dir_attr, userns); out: return error; } @@ -1753,6 +1777,7 @@ static int nfs3_xdr_dec_create3res(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { + struct user_namespace *userns = rpc_rqst_userns(req); struct nfs3_diropres *result = data; enum nfs_stat status; int error; @@ -1762,11 +1787,11 @@ static int nfs3_xdr_dec_create3res(struct rpc_rqst *req, goto out; if (status != NFS3_OK) goto out_default; - error = decode_create3resok(xdr, result); + error = decode_create3resok(xdr, result, userns); out: return error; out_default: - error = decode_wcc_data(xdr, result->dir_attr); + error = decode_wcc_data(xdr, result->dir_attr, userns); if (unlikely(error)) goto out; return nfs3_stat_to_errno(status); @@ -1801,7 +1826,7 @@ static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_wcc_data(xdr, result->dir_attr); + error = decode_wcc_data(xdr, result->dir_attr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -1836,6 +1861,7 @@ static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { + struct user_namespace *userns = rpc_rqst_userns(req); struct nfs_renameres *result = data; enum nfs_stat status; int error; @@ -1843,10 +1869,10 @@ static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_wcc_data(xdr, result->old_fattr); + error = decode_wcc_data(xdr, result->old_fattr, userns); if (unlikely(error)) goto out; - error = decode_wcc_data(xdr, result->new_fattr); + error = decode_wcc_data(xdr, result->new_fattr, userns); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -1880,6 +1906,7 @@ static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { + struct user_namespace *userns = rpc_rqst_userns(req); struct nfs3_linkres *result = data; enum nfs_stat status; int error; @@ -1887,10 +1914,10 @@ static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, userns); if (unlikely(error)) goto out; - error = decode_wcc_data(xdr, result->dir_attr); + error = decode_wcc_data(xdr, result->dir_attr, userns); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -1939,6 +1966,7 @@ static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, bool plus) { + struct user_namespace *userns = rpc_userns(entry->server->client); struct nfs_entry old = *entry; __be32 *p; int error; @@ -1973,7 +2001,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (plus) { entry->fattr->valid = 0; - error = decode_post_op_attr(xdr, entry->fattr); + error = decode_post_op_attr(xdr, entry->fattr, userns); if (unlikely(error)) return error; if (entry->fattr->valid & NFS_ATTR_FATTR_V3) @@ -2045,11 +2073,12 @@ static int decode_dirlist3(struct xdr_stream *xdr) } static int decode_readdir3resok(struct xdr_stream *xdr, - struct nfs3_readdirres *result) + struct nfs3_readdirres *result, + struct user_namespace *userns) { int error; - error = decode_post_op_attr(xdr, result->dir_attr); + error = decode_post_op_attr(xdr, result->dir_attr, userns); if (unlikely(error)) goto out; /* XXX: do we need to check if result->verf != NULL ? */ @@ -2074,11 +2103,11 @@ static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req, goto out; if (status != NFS3_OK) goto out_default; - error = decode_readdir3resok(xdr, result); + error = decode_readdir3resok(xdr, result, rpc_rqst_userns(req)); out: return error; out_default: - error = decode_post_op_attr(xdr, result->dir_attr); + error = decode_post_op_attr(xdr, result->dir_attr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; return nfs3_stat_to_errno(status); @@ -2138,7 +2167,7 @@ static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -2212,7 +2241,7 @@ static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -2273,7 +2302,7 @@ static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -2315,7 +2344,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_wcc_data(xdr, result->fattr); + error = decode_wcc_data(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; result->op_status = status; @@ -2331,14 +2360,15 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, #ifdef CONFIG_NFS_V3_ACL static inline int decode_getacl3resok(struct xdr_stream *xdr, - struct nfs3_getaclres *result) + struct nfs3_getaclres *result, + struct user_namespace *userns) { struct posix_acl **acl; unsigned int *aclcnt; size_t hdrlen; int error; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, userns); if (unlikely(error)) goto out; error = decode_uint32(xdr, &result->mask); @@ -2386,7 +2416,7 @@ static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req, goto out; if (status != NFS3_OK) goto out_default; - error = decode_getacl3resok(xdr, result); + error = decode_getacl3resok(xdr, result, rpc_rqst_userns(req)); out: return error; out_default: @@ -2405,7 +2435,7 @@ static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, goto out; if (status != NFS3_OK) goto out_default; - error = decode_post_op_attr(xdr, result); + error = decode_post_op_attr(xdr, result, rpc_rqst_userns(req)); out: return error; out_default: From 58002399da65c53f00987623d9ff7c3c2773a0d9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 Apr 2019 17:46:47 -0400 Subject: [PATCH 57/69] NFSv4: Convert the NFS client idmapper to use the container user namespace When mapping NFS identities using the NFSv4 idmapper, we want to substitute for the uids and gids that would normally go on the wire as part of a NFSv3 request. So we use the same mapping in the NFSv4 upcall as we use in the NFSv3 RPC call (i.e. the mapping stored in the rpc_clnt cred). Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4idmap.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index bf34ddaa2ad7..4884fdae28fb 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -69,8 +69,16 @@ struct idmap { struct rpc_pipe *idmap_pipe; struct idmap_legacy_upcalldata *idmap_upcall_data; struct mutex idmap_mutex; + const struct cred *cred; }; +static struct user_namespace *idmap_userns(const struct idmap *idmap) +{ + if (idmap && idmap->cred) + return idmap->cred->user_ns; + return &init_user_ns; +} + /** * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields * @fattr: fully initialised struct nfs_fattr @@ -271,14 +279,15 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen, const char *type, struct idmap *idmap) { char *desc; - struct key *rkey; + struct key *rkey = ERR_PTR(-EAGAIN); ssize_t ret; ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); if (ret < 0) return ERR_PTR(ret); - rkey = request_key(&key_type_id_resolver, desc, ""); + if (!idmap->cred || idmap->cred->user_ns == &init_user_ns) + rkey = request_key(&key_type_id_resolver, desc, ""); if (IS_ERR(rkey)) { mutex_lock(&idmap->idmap_mutex); rkey = request_key_with_auxdata(&key_type_id_resolver_legacy, @@ -452,6 +461,9 @@ nfs_idmap_new(struct nfs_client *clp) if (idmap == NULL) return -ENOMEM; + mutex_init(&idmap->idmap_mutex); + idmap->cred = get_cred(clp->cl_rpcclient->cl_cred); + rpc_init_pipe_dir_object(&idmap->idmap_pdo, &nfs_idmap_pipe_dir_object_ops, idmap); @@ -462,7 +474,6 @@ nfs_idmap_new(struct nfs_client *clp) goto err; } idmap->idmap_pipe = pipe; - mutex_init(&idmap->idmap_mutex); error = rpc_add_pipe_dir_object(clp->cl_net, &clp->cl_rpcclient->cl_pipedir_objects, @@ -475,6 +486,7 @@ nfs_idmap_new(struct nfs_client *clp) err_destroy_pipe: rpc_destroy_pipe_data(idmap->idmap_pipe); err: + put_cred(idmap->cred); kfree(idmap); return error; } @@ -491,6 +503,7 @@ nfs_idmap_delete(struct nfs_client *clp) &clp->cl_rpcclient->cl_pipedir_objects, &idmap->idmap_pdo); rpc_destroy_pipe_data(idmap->idmap_pipe); + put_cred(idmap->cred); kfree(idmap); } @@ -735,7 +748,7 @@ int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_ if (!nfs_map_string_to_numeric(name, namelen, &id)) ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap); if (ret == 0) { - *uid = make_kuid(&init_user_ns, id); + *uid = make_kuid(idmap_userns(idmap), id); if (!uid_valid(*uid)) ret = -ERANGE; } @@ -752,7 +765,7 @@ int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size if (!nfs_map_string_to_numeric(name, namelen, &id)) ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap); if (ret == 0) { - *gid = make_kgid(&init_user_ns, id); + *gid = make_kgid(idmap_userns(idmap), id); if (!gid_valid(*gid)) ret = -ERANGE; } @@ -766,7 +779,7 @@ int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, int ret = -EINVAL; __u32 id; - id = from_kuid(&init_user_ns, uid); + id = from_kuid_munged(idmap_userns(idmap), uid); if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap); if (ret < 0) @@ -780,7 +793,7 @@ int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, int ret = -EINVAL; __u32 id; - id = from_kgid(&init_user_ns, gid); + id = from_kgid_munged(idmap_userns(idmap), gid); if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap); if (ret < 0) From c207db2f5da5e37e284d87d5196dcf967e84956c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 Apr 2019 17:46:48 -0400 Subject: [PATCH 58/69] NFS: Convert NFSv2 to use the container user namespace When mapping NFS identities, we want to substitute for the uids and gids on the wire as we would for the AUTH_UNIX creds. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs2xdr.c | 58 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index a7ed29de0a40..572794dab4b1 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -76,6 +76,20 @@ static int nfs_stat_to_errno(enum nfs_stat); * or decoded inline. */ +static struct user_namespace *rpc_userns(const struct rpc_clnt *clnt) +{ + if (clnt && clnt->cl_cred) + return clnt->cl_cred->user_ns; + return &init_user_ns; +} + +static struct user_namespace *rpc_rqst_userns(const struct rpc_rqst *rqstp) +{ + if (rqstp->rq_task) + return rpc_userns(rqstp->rq_task->tk_client); + return &init_user_ns; +} + /* * typedef opaque nfsdata<>; */ @@ -248,7 +262,8 @@ static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep) * }; * */ -static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct user_namespace *userns) { u32 rdev, type; __be32 *p; @@ -263,10 +278,10 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) fattr->mode = be32_to_cpup(p++); fattr->nlink = be32_to_cpup(p++); - fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); + fattr->uid = make_kuid(userns, be32_to_cpup(p++)); if (!uid_valid(fattr->uid)) goto out_uid; - fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); + fattr->gid = make_kgid(userns, be32_to_cpup(p++)); if (!gid_valid(fattr->gid)) goto out_gid; @@ -321,7 +336,8 @@ static __be32 *xdr_time_not_set(__be32 *p) return p; } -static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) +static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr, + struct user_namespace *userns) { struct timespec ts; __be32 *p; @@ -333,11 +349,11 @@ static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) else *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); if (attr->ia_valid & ATTR_UID) - *p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid)); + *p++ = cpu_to_be32(from_kuid_munged(userns, attr->ia_uid)); else *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); if (attr->ia_valid & ATTR_GID) - *p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid)); + *p++ = cpu_to_be32(from_kgid_munged(userns, attr->ia_gid)); else *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); if (attr->ia_valid & ATTR_SIZE) @@ -451,7 +467,8 @@ static int decode_path(struct xdr_stream *xdr) * }; */ static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result, - __u32 *op_status) + __u32 *op_status, + struct user_namespace *userns) { enum nfs_stat status; int error; @@ -463,7 +480,7 @@ static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result, *op_status = status; if (status != NFS_OK) goto out_default; - error = decode_fattr(xdr, result); + error = decode_fattr(xdr, result, userns); out: return error; out_default: @@ -498,19 +515,21 @@ static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh, * void; * }; */ -static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result) +static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result, + struct user_namespace *userns) { int error; error = decode_fhandle(xdr, result->fh); if (unlikely(error)) goto out; - error = decode_fattr(xdr, result->fattr); + error = decode_fattr(xdr, result->fattr, userns); out: return error; } -static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result) +static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result, + struct user_namespace *userns) { enum nfs_stat status; int error; @@ -520,7 +539,7 @@ static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result) goto out; if (status != NFS_OK) goto out_default; - error = decode_diropok(xdr, result); + error = decode_diropok(xdr, result, userns); out: return error; out_default: @@ -559,7 +578,7 @@ static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req, const struct nfs_sattrargs *args = data; encode_fhandle(xdr, args->fh); - encode_sattr(xdr, args->sattr); + encode_sattr(xdr, args->sattr, rpc_rqst_userns(req)); } static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req, @@ -674,7 +693,7 @@ static void nfs2_xdr_enc_createargs(struct rpc_rqst *req, const struct nfs_createargs *args = data; encode_diropargs(xdr, args->fh, args->name, args->len); - encode_sattr(xdr, args->sattr); + encode_sattr(xdr, args->sattr, rpc_rqst_userns(req)); } static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req, @@ -741,7 +760,7 @@ static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req, encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen); encode_path(xdr, args->pages, args->pathlen); - encode_sattr(xdr, args->sattr); + encode_sattr(xdr, args->sattr, rpc_rqst_userns(req)); } /* @@ -803,13 +822,13 @@ static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr, static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr, void *result) { - return decode_attrstat(xdr, result, NULL); + return decode_attrstat(xdr, result, NULL, rpc_rqst_userns(req)); } static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr, void *result) { - return decode_diropres(xdr, result); + return decode_diropres(xdr, result, rpc_rqst_userns(req)); } /* @@ -864,7 +883,7 @@ static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, result->op_status = status; if (status != NFS_OK) goto out_default; - error = decode_fattr(xdr, result->fattr); + error = decode_fattr(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; error = decode_nfsdata(xdr, result); @@ -881,7 +900,8 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, /* All NFSv2 writes are "file sync" writes */ result->verf->committed = NFS_FILE_SYNC; - return decode_attrstat(xdr, result->fattr, &result->op_status); + return decode_attrstat(xdr, result->fattr, &result->op_status, + rpc_rqst_userns(req)); } /** From 3b7eb5e35d0f55541452e91ca66798c16a81ad4a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 Apr 2019 17:46:49 -0400 Subject: [PATCH 59/69] NFS: When mounting, don't share filesystems between different user namespaces If two different containers that share the same network namespace attempt to mount the same filesystem, we should not allow them to share the same super block if they do not share the same user namespace, since the user mappings on the wire will need to differ. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/super.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 19783e8ba9fb..4f014c4c7bc1 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2485,6 +2485,21 @@ static int nfs_compare_super_address(struct nfs_server *server1, return 1; } +static int nfs_compare_userns(const struct nfs_server *old, + const struct nfs_server *new) +{ + const struct user_namespace *oldns = &init_user_ns; + const struct user_namespace *newns = &init_user_ns; + + if (old->client && old->client->cl_cred) + oldns = old->client->cl_cred->user_ns; + if (new->client && new->client->cl_cred) + newns = new->client->cl_cred->user_ns; + if (oldns != newns) + return 0; + return 1; +} + static int nfs_compare_super(struct super_block *sb, void *data) { struct nfs_sb_mountdata *sb_mntdata = data; @@ -2498,6 +2513,8 @@ static int nfs_compare_super(struct super_block *sb, void *data) return 0; if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0) return 0; + if (!nfs_compare_userns(old, server)) + return 0; return nfs_compare_mount_options(sb, server, mntflags); } From b422df915cef80333d7a1732e6ed81f41db12b79 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 Apr 2019 17:46:50 -0400 Subject: [PATCH 60/69] lockd: Store the lockd client credential in struct nlm_host When we create a new lockd client, we want to be able to pass the correct credential of the process that created the struct nlm_host. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/lockd/clntlock.c | 2 +- fs/lockd/host.c | 11 +++++++++-- include/linux/lockd/bind.h | 1 + include/linux/lockd/lockd.h | 4 +++- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index c2a128678e6e..70f520b41a19 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -63,7 +63,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, nlm_init->protocol, nlm_version, nlm_init->hostname, nlm_init->noresvport, - nlm_init->net); + nlm_init->net, nlm_init->cred); if (host == NULL) goto out_nohost; if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index d46081123f7c..7d46fafdbbe5 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -60,6 +60,7 @@ struct nlm_lookup_host_info { const size_t hostname_len; /* it's length */ const int noresvport; /* use non-priv port */ struct net *net; /* network namespace to bind */ + const struct cred *cred; }; /* @@ -162,6 +163,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, host->h_nsmhandle = nsm; host->h_addrbuf = nsm->sm_addrbuf; host->net = ni->net; + host->h_cred = get_cred(ni->cred), strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename)); out: @@ -188,6 +190,7 @@ static void nlm_destroy_host_locked(struct nlm_host *host) clnt = host->h_rpcclnt; if (clnt != NULL) rpc_shutdown_client(clnt); + put_cred(host->h_cred); kfree(host); ln->nrhosts--; @@ -202,6 +205,8 @@ static void nlm_destroy_host_locked(struct nlm_host *host) * @version: NLM protocol version * @hostname: '\0'-terminated hostname of server * @noresvport: 1 if non-privileged port should be used + * @net: pointer to net namespace + * @cred: pointer to cred * * Returns an nlm_host structure that matches the passed-in * [server address, transport protocol, NLM version, server hostname]. @@ -214,7 +219,8 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, const u32 version, const char *hostname, int noresvport, - struct net *net) + struct net *net, + const struct cred *cred) { struct nlm_lookup_host_info ni = { .server = 0, @@ -226,6 +232,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, .hostname_len = strlen(hostname), .noresvport = noresvport, .net = net, + .cred = cred, }; struct hlist_head *chain; struct nlm_host *host; @@ -458,7 +465,7 @@ nlm_bind_host(struct nlm_host *host) .authflavor = RPC_AUTH_UNIX, .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_AUTOBIND), - .cred = current_cred(), + .cred = host->h_cred, }; /* diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 053a4ef3d431..8c0cf1059443 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -46,6 +46,7 @@ struct nlmclnt_initdata { int noresvport; struct net *net; const struct nlmclnt_operations *nlmclnt_ops; + const struct cred *cred; }; /* diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index b065ef406770..c9b422dde542 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -70,6 +70,7 @@ struct nlm_host { struct nsm_handle *h_nsmhandle; /* NSM status handle */ char *h_addrbuf; /* address eyecatcher */ struct net *net; /* host net */ + const struct cred *h_cred; char nodename[UNX_MAXNODENAME + 1]; const struct nlmclnt_operations *h_nlmclnt_ops; /* Callback ops for NLM users */ }; @@ -229,7 +230,8 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, const u32 version, const char *hostname, int noresvport, - struct net *net); + struct net *net, + const struct cred *cred); void nlmclnt_release_host(struct nlm_host *); struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, const char *hostname, From 950a578c6128c2886e295b9c7ecb0b6b22fcc92b Mon Sep 17 00:00:00 2001 From: Roberto Bergantinos Corpas Date: Thu, 25 Apr 2019 15:36:51 +0200 Subject: [PATCH 61/69] NFS: make nfs_match_client killable Actually we don't do anything with return value from nfs_wait_client_init_complete in nfs_match_client, as a consequence if we get a fatal signal and client is not fully initialised, we'll loop to "again" label This has been proven to cause soft lockups on some scenarios (no-carrier but configured network interfaces) Signed-off-by: Roberto Bergantinos Corpas Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index e3baa9f1da76..06e8719655f0 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -284,6 +284,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat struct nfs_client *clp; const struct sockaddr *sap = data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); + int error; again: list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { @@ -296,8 +297,10 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat if (clp->cl_cons_state > NFS_CS_READY) { refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); - nfs_wait_client_init_complete(clp); + error = nfs_wait_client_init_complete(clp); nfs_put_client(clp); + if (error < 0) + return ERR_PTR(error); spin_lock(&nn->nfs_client_lock); goto again; } @@ -407,6 +410,8 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) clp = nfs_match_client(cl_init); if (clp) { spin_unlock(&nn->nfs_client_lock); + if (IS_ERR(clp)) + return clp; if (new) new->rpc_ops->free_client(new); return nfs_found_client(cl_init, clp); From b1029c9bc078a6f1515f55dd993b507dcc7e3440 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 7 May 2019 13:41:49 -0400 Subject: [PATCH 62/69] PNFS fallback to MDS if no deviceid found If we fail to find a good deviceid while trying to pnfs instead of propogating an error back fallback to doing IO to the MDS. Currently, code with fals the IO with EINVAL. Signed-off-by: Olga Kornievskaia Fixes: 8d40b0f14846f ("NFS filelayout:call GETDEVICEINFO after pnfs_layout_process completes" Cc: stable@vger.kernel.org # v4.11+ Signed-off-by: Anna Schumaker --- fs/nfs/filelayout/filelayout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 21d9f3bfbc81..3cb073c50fa6 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -904,7 +904,7 @@ fl_pnfs_update_layout(struct inode *ino, status = filelayout_check_deviceid(lo, fl, gfp_flags); if (status) { pnfs_put_lseg(lseg); - lseg = ERR_PTR(status); + lseg = NULL; } out: return lseg; From f02f3755dbd14fb935d24b14650fff9ba92243b8 Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Mon, 6 May 2019 11:57:03 +0800 Subject: [PATCH 63/69] NFS4: Fix v4.0 client state corruption when mount stat command with soft mount never return after server is stopped. When alloc a new client, the state of the client will be set to NFS4CLNT_LEASE_EXPIRED. When the server is stopped, the state manager will work, and accord the state to recover. But the state is NFS4CLNT_LEASE_EXPIRED, it will drain the slot table and lead other task to wait queue, until the client recovered. Then the stat command is hung. When discover server trunking, the client will renew the lease, but check the client state, it lead the client state corruption. So, we need to call state manager to recover it when detect server ip trunking. Signed-off-by: ZhangXiaoxu Cc: stable@vger.kernel.org Signed-off-by: Anna Schumaker --- fs/nfs/nfs4state.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 3de36479ed7a..f502f1c054cf 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -159,6 +159,10 @@ int nfs40_discover_server_trunking(struct nfs_client *clp, /* Sustain the lease, even if it's empty. If the clientid4 * goes stale it's of no use for trunking discovery. */ nfs4_schedule_state_renewal(*result); + + /* If the client state need to recover, do it. */ + if (clp->cl_state) + nfs4_schedule_state_manager(clp); } out: return status; From 9c5948c248696ca60c56ec5a608e225c4ab8a854 Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Mon, 29 Apr 2019 17:32:31 +0800 Subject: [PATCH 64/69] SUNRPC: task should be exit if encode return EKEYEXPIRED more times If the rpc.gssd always return cred success, but now the cred is expired, then the task will loop in call_refresh and call_transmit. Exit the rpc task after retry. Signed-off-by: ZhangXiaoxu Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 369a2648dafc..c1f1afabd024 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1814,7 +1814,14 @@ call_encode(struct rpc_task *task) rpc_delay(task, HZ >> 4); break; case -EKEYEXPIRED: - task->tk_action = call_refresh; + if (!task->tk_cred_retry) { + rpc_exit(task, task->tk_status); + } else { + task->tk_action = call_refresh; + task->tk_cred_retry--; + dprintk("RPC: %5u %s: retry refresh creds\n", + task->tk_pid, __func__); + } break; default: rpc_call_rpcerror(task, task->tk_status); From fe31ce83cbac7adcaa629b59179f502981be5f8b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 3 May 2019 15:30:09 +0300 Subject: [PATCH 65/69] SUNRPC: Fix an error code in gss_alloc_msg() If kstrdup_const() then this function returns zero (success) but it should return -ENOMEM. Fixes: ac83228a7101 ("SUNRPC: Use namespace of listening daemon in the client AUTH_GSS upcall") Signed-off-by: Dan Carpenter Signed-off-by: Anna Schumaker --- net/sunrpc/auth_gss/auth_gss.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index b2cbc83d39c7..06fe17c2aea1 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -553,8 +553,10 @@ gss_alloc_msg(struct gss_auth *gss_auth, gss_msg->auth = gss_auth; if (service_name) { gss_msg->service_name = kstrdup_const(service_name, GFP_NOFS); - if (!gss_msg->service_name) + if (!gss_msg->service_name) { + err = -ENOMEM; goto err_put_pipe_version; + } } return gss_msg; err_put_pipe_version: From 8ca017c8cee3aa6a37ddf1db7fd04c54536a0ef0 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Mon, 6 May 2019 11:59:05 -0400 Subject: [PATCH 66/69] NFSv4: don't mark all open state for recovery when handling recallable state revoked flag Only delegations and layouts can be recalled, so it shouldn't be necessary to recover all opens when handling the status bit SEQ4_STATUS_RECALLABLE_STATE_REVOKED. We'll still wind up calling nfs41_open_expired() when a TEST_STATEID returns NFS4ERR_DELEG_REVOKED. Signed-off-by: Scott Mayhew Reviewed-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/delegation.c | 12 ++++++++++++ fs/nfs/delegation.h | 1 + fs/nfs/nfs4state.c | 3 +-- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 2f6b447cdd82..8b78274e3e56 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -1033,6 +1033,18 @@ void nfs_mark_test_expired_all_delegations(struct nfs_client *clp) rcu_read_unlock(); } +/** + * nfs_test_expired_all_delegations - test all delegations for a client + * @clp: nfs_client to process + * + * Helper for handling "recallable state revoked" status from server. + */ +void nfs_test_expired_all_delegations(struct nfs_client *clp) +{ + nfs_mark_test_expired_all_delegations(clp); + nfs4_schedule_state_manager(clp); +} + /** * nfs_reap_expired_delegations - reap expired delegations * @clp: nfs_client to process diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 35b4b02c1ae0..5799777df5ec 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -58,6 +58,7 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); void nfs_mark_test_expired_all_delegations(struct nfs_client *clp); +void nfs_test_expired_all_delegations(struct nfs_client *clp); void nfs_reap_expired_delegations(struct nfs_client *clp); /* NFSv4 delegation-related procedures */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f502f1c054cf..e2e3c4f04d3e 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2350,8 +2350,7 @@ static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp) { /* FIXME: For now, we destroy all layouts. */ pnfs_destroy_all_layouts(clp); - /* FIXME: For now, we test all delegations+open state+locks. */ - nfs41_handle_some_state_revoked(clp); + nfs_test_expired_all_delegations(clp); dprintk("%s: Recallable state revoked on server %s!\n", __func__, clp->cl_hostname); } From a46126ccc77e764429d63bf958d117f607f4b6c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 May 2019 12:06:35 -0400 Subject: [PATCH 67/69] nfs: pass the correct prototype to read_cache_page Fix the callbacks NFS passes to read_cache_page to actually have the proper type expected. Casting around function pointers can easily hide typing bugs, and defeats control flow protection. Signed-off-by: Christoph Hellwig Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 7 ++++--- fs/nfs/symlink.c | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a71d0b42d160..47d445bec8c9 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -714,8 +714,9 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, * We only need to convert from xdr once so future lookups are much simpler */ static -int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) +int nfs_readdir_filler(void *data, struct page* page) { + nfs_readdir_descriptor_t *desc = data; struct inode *inode = file_inode(desc->file); int ret; @@ -762,8 +763,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc) static struct page *get_cache_page(nfs_readdir_descriptor_t *desc) { - return read_cache_page(desc->file->f_mapping, - desc->page_index, (filler_t *)nfs_readdir_filler, desc); + return read_cache_page(desc->file->f_mapping, desc->page_index, + nfs_readdir_filler, desc); } /* diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 06eb44b47885..25ba299fdac2 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -26,8 +26,9 @@ * and straight-forward than readdir caching. */ -static int nfs_symlink_filler(struct inode *inode, struct page *page) +static int nfs_symlink_filler(void *data, struct page *page) { + struct inode *inode = data; int error; error = NFS_PROTO(inode)->readlink(inode, page, 0, PAGE_SIZE); @@ -65,8 +66,8 @@ static const char *nfs_get_link(struct dentry *dentry, err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); if (err) return err; - page = read_cache_page(&inode->i_data, 0, - (filler_t *)nfs_symlink_filler, inode); + page = read_cache_page(&inode->i_data, 0, nfs_symlink_filler, + inode); if (IS_ERR(page)) return ERR_CAST(page); } From c260121a97a3e4df6536edbc2f26e166eff370ce Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 9 May 2019 07:25:21 -0400 Subject: [PATCH 68/69] NFS: Fix a double unlock from nfs_match,get_client Now that nfs_match_client drops the nfs_client_lock, we should be careful to always return it in the same condition: locked. Fixes: 950a578c6128 ("NFS: make nfs_match_client killable") Reported-by: syzbot+228a82b263b5da91883d@syzkaller.appspotmail.com Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 06e8719655f0..da74c4c4a244 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -299,9 +299,9 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat spin_unlock(&nn->nfs_client_lock); error = nfs_wait_client_init_complete(clp); nfs_put_client(clp); + spin_lock(&nn->nfs_client_lock); if (error < 0) return ERR_PTR(error); - spin_lock(&nn->nfs_client_lock); goto again; } From 5940d1cf9f42f67e9cc3f7df9eda39f5888d6e9e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 9 May 2019 11:00:07 -0400 Subject: [PATCH 69/69] SUNRPC: Rebalance a kref in auth_gss.c Restore the kref_get that matches the gss_put_auth(gss_msg->auth) done by gss_release_msg(). Fixes: ac83228a7101 ("SUNRPC: Use namespace of listening daemon ...") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/auth_gss/auth_gss.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 06fe17c2aea1..4ce42c62458e 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -551,6 +551,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, refcount_set(&gss_msg->count, 1); gss_msg->uid = uid; gss_msg->auth = gss_auth; + kref_get(&gss_auth->kref); if (service_name) { gss_msg->service_name = kstrdup_const(service_name, GFP_NOFS); if (!gss_msg->service_name) {