From 9eb190fca8f9056ea4502526dc55fe52318d9afc Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia
Date: Fri, 20 Jul 2018 18:19:17 -0400
Subject: [PATCH 01/30] NFSD CB_OFFLOAD xdr

Signed-off-by: Olga Kornievskaia
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/nfs4callback.c | 98 ++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/state.h        |  1 +
 fs/nfsd/xdr4.h         |  6 +++
 fs/nfsd/xdr4cb.h       | 10 +++++
 4 files changed, 115 insertions(+)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 601bf33c26a0..25987bcdf96f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -39,6 +39,7 @@
 #include "state.h"
 #include "netns.h"
 #include "xdr4cb.h"
+#include "xdr4.h"
 
 #define NFSDDBG_FACILITY NFSDDBG_PROC
 
@@ -105,6 +106,7 @@ enum nfs_cb_opnum4 {
     OP_CB_WANTS_CANCELLED       = 12,
     OP_CB_NOTIFY_LOCK           = 13,
     OP_CB_NOTIFY_DEVICEID       = 14,
+    OP_CB_OFFLOAD               = 15,
     OP_CB_ILLEGAL               = 10044
 };
 
@@ -682,6 +684,101 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp,
     return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status);
 }
 
+/*
+ * struct write_response4 {
+ *     stateid4        wr_callback_id<1>;
+ *     length4         wr_count;
+ *     stable_how4     wr_committed;
+ *     verifier4       wr_writeverf;
+ * };
+ * union offload_info4 switch (nfsstat4 coa_status) {
+ *     case NFS4_OK:
+ *         write_response4 coa_resok4;
+ *     default:
+ *         length4 coa_bytes_copied;
+ * };
+ * struct CB_OFFLOAD4args {
+ *     nfs_fh4         coa_fh;
+ *     stateid4        coa_stateid;
+ *     offload_info4   coa_offload_info;
+ * };
+ */
+static void encode_offload_info4(struct xdr_stream *xdr,
+                                 __be32 nfserr,
+                                 const struct nfsd4_copy *cp)
+{
+    __be32 *p;
+
+    p = xdr_reserve_space(xdr, 4);
+    *p++ = nfserr;
+    if (!nfserr) {
+        p = xdr_reserve_space(xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
+        p = xdr_encode_empty_array(p);
+        p = xdr_encode_hyper(p, cp->cp_res.wr_bytes_written);
+        *p++ = cpu_to_be32(cp->cp_res.wr_stable_how);
+        p = xdr_encode_opaque_fixed(p, cp->cp_res.wr_verifier.data,
+                                    NFS4_VERIFIER_SIZE);
+    } else {
+        p = xdr_reserve_space(xdr, 8);
+        /* We always return success if bytes were written */
+        p = xdr_encode_hyper(p, 0);
+    }
+}
+
+static void encode_cb_offload4args(struct xdr_stream *xdr,
+                                   __be32 nfserr,
+                                   const struct knfsd_fh *fh,
+                                   const struct nfsd4_copy *cp,
+                                   struct nfs4_cb_compound_hdr *hdr)
+{
+    __be32 *p;
+
+    p = xdr_reserve_space(xdr, 4);
+    *p++ = cpu_to_be32(OP_CB_OFFLOAD);
+    encode_nfs_fh4(xdr, fh);
+    encode_stateid4(xdr, &cp->cp_res.cb_stateid);
+    encode_offload_info4(xdr, nfserr, cp);
+
+    hdr->nops++;
+}
+
+static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req,
+                                    struct xdr_stream *xdr,
+                                    const void *data)
+{
+    const struct nfsd4_callback *cb = data;
+    const struct nfsd4_copy *cp =
+        container_of(cb, struct nfsd4_copy, cp_cb);
+    struct nfs4_cb_compound_hdr hdr = {
+        .ident = 0,
+        .minorversion = cb->cb_clp->cl_minorversion,
+    };
+
+    encode_cb_compound4args(xdr, &hdr);
+    encode_cb_sequence4args(xdr, cb, &hdr);
+    encode_cb_offload4args(xdr, cp->nfserr, &cp->fh, cp, &hdr);
+    encode_cb_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp,
+                                   struct xdr_stream *xdr,
+                                   void *data)
+{
+    struct nfsd4_callback *cb = data;
+    struct nfs4_cb_compound_hdr hdr;
+    int status;
+
+    status = decode_cb_compound4res(xdr, &hdr);
+    if (unlikely(status))
+        return status;
+
+    if (cb) {
+        status = decode_cb_sequence4res(xdr, cb);
+        if (unlikely(status || cb->cb_seq_status))
+            return status;
+    }
+    return decode_cb_op_status(xdr, OP_CB_OFFLOAD, &cb->cb_status);
+}
 /*
  * RPC procedure tables
  */
@@ -703,6 +800,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = {
     PROC(CB_LAYOUT,     COMPOUND,   cb_layout,      cb_layout),
 #endif
     PROC(CB_NOTIFY_LOCK,    COMPOUND,   cb_notify_lock, cb_notify_lock),
+    PROC(CB_OFFLOAD,    COMPOUND,   cb_offload,     cb_offload),
 };
 
 static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)];
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 0b15dac7e609..6e38d9927448 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -573,6 +573,7 @@ enum nfsd4_cb_op {
     NFSPROC4_CLNT_CB_NULL = 0,
     NFSPROC4_CLNT_CB_RECALL,
     NFSPROC4_CLNT_CB_LAYOUT,
+    NFSPROC4_CLNT_CB_OFFLOAD,
     NFSPROC4_CLNT_CB_SEQUENCE,
     NFSPROC4_CLNT_CB_NOTIFY_LOCK,
 };
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 17c453a7999c..b7c34f4a1222 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -511,6 +511,7 @@ struct nfsd42_write_res {
     u64                 wr_bytes_written;
     u32                 wr_stable_how;
     nfs4_verifier       wr_verifier;
+    stateid_t           cb_stateid;
 };
 
 struct nfsd4_copy {
@@ -526,6 +527,11 @@ struct nfsd4_copy {
 
     /* response */
     struct nfsd42_write_res cp_res;
+
+    /* for cb_offload */
+    struct nfsd4_callback   cp_cb;
+    __be32                  nfserr;
+    struct knfsd_fh         fh;
 };
 
 struct nfsd4_seek {
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index 517239af0302..547cf07cf4e0 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -38,3 +38,13 @@
 #define NFS4_dec_cb_notify_lock_sz  (cb_compound_dec_hdr_sz  + \
                     cb_sequence_dec_sz +    \
                     op_dec_sz)
+#define enc_cb_offload_info_sz      (1 + 1 + 2 + 1 +    \
+                    XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define NFS4_enc_cb_offload_sz      (cb_compound_enc_hdr_sz +   \
+                    cb_sequence_enc_sz +    \
+                    enc_nfs4_fh_sz +    \
+                    enc_stateid_sz +    \
+                    enc_cb_offload_info_sz)
+#define NFS4_dec_cb_offload_sz      (cb_compound_dec_hdr_sz  +  \
+                    cb_sequence_dec_sz +    \
+                    op_dec_sz)
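The offload_info4 encoder above reserves 4 + 8 + 4 + NFS4_VERIFIER_SIZE bytes and emits, in order, an empty stateid array, the 64-bit byte count, the stable_how value, and the opaque verifier. A rough user-space sketch of that wire layout follows; the struct and helper names are invented for illustration and are not kernel or libc API:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    #define VERIFIER_SIZE 8

    /* Illustrative stand-in for the kernel's nfsd42_write_res. */
    struct write_res {
        uint64_t bytes_written;
        uint32_t stable_how;
        unsigned char verifier[VERIFIER_SIZE];
    };

    static unsigned char *enc32(unsigned char *p, uint32_t v)
    {
        p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = v;
        return p + 4;
    }

    static unsigned char *enc64(unsigned char *p, uint64_t v)
    {
        p = enc32(p, (uint32_t)(v >> 32));
        return enc32(p, (uint32_t)v);
    }

    /* Encode a successful offload_info4: status, empty wr_callback_id<1>,
     * wr_count, wr_committed, wr_writeverf -- the same field order
     * encode_offload_info4 reserves space for. */
    static size_t encode_offload_info(unsigned char *p, const struct write_res *r)
    {
        unsigned char *start = p;

        p = enc32(p, 0);                        /* coa_status == NFS4_OK */
        p = enc32(p, 0);                        /* empty stateid array */
        p = enc64(p, r->bytes_written);         /* wr_count */
        p = enc32(p, r->stable_how);            /* wr_committed */
        memcpy(p, r->verifier, VERIFIER_SIZE);  /* wr_writeverf */
        return (p + VERIFIER_SIZE) - start;
    }

    int main(void)
    {
        struct write_res r = { .bytes_written = 4096, .stable_how = 0 };
        unsigned char buf[64];

        printf("encoded %zu bytes\n", encode_offload_info(buf, &r));
        return 0;
    }

Every XDR item is 4-byte aligned and big-endian, which is why the 64-bit count is emitted as two 32-bit words.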
From 6308bc98e86ee8c7bbd56a39839a257a16c9378c Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia
Date: Fri, 20 Jul 2018 18:19:18 -0400
Subject: [PATCH 02/30] NFSD OFFLOAD_STATUS xdr

Signed-off-by: Olga Kornievskaia
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/nfs4proc.c | 20 ++++++++++++++++++++
 fs/nfsd/nfs4xdr.c  | 27 +++++++++++++++++++++++++--
 fs/nfsd/xdr4.h     | 10 ++++++++++
 3 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index b7bc6e1a85ac..aa05aa3947e0 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1144,6 +1144,13 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
     fput(file);
     return status;
 }
+
+static __be32
+nfsd4_offload_status(struct svc_rqst *rqstp,
+                     struct nfsd4_compound_state *cstate,
+                     union nfsd4_op_u *u)
+{
+    return nfserr_notsupp;
+}
 
 static __be32
 nfsd4_allocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
@@ -2047,6 +2054,14 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
         1 /* cr_synchronous */) * sizeof(__be32);
 }
 
+static inline u32 nfsd4_offload_status_rsize(struct svc_rqst *rqstp,
+                                             struct nfsd4_op *op)
+{
+    return (op_encode_hdr_size +
+        2 /* osr_count */ +
+        1 /* osr_complete<1> optional 0 for now */) * sizeof(__be32);
+}
+
 #ifdef CONFIG_NFSD_PNFS
 static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
@@ -2460,6 +2475,11 @@ static const struct nfsd4_operation nfsd4_ops[] = {
         .op_name = "OP_SEEK",
         .op_rsize_bop = nfsd4_seek_rsize,
     },
+    [OP_OFFLOAD_STATUS] = {
+        .op_func = nfsd4_offload_status,
+        .op_name = "OP_OFFLOAD_STATUS",
+        .op_rsize_bop = nfsd4_offload_status_rsize,
+    },
 };
 
 /**
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 418fa9c78186..67b7e6f19ed6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1767,6 +1767,13 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
     DECODE_TAIL;
 }
 
+static __be32
+nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp,
+                            struct nfsd4_offload_status *os)
+{
+    return nfsd4_decode_stateid(argp, &os->stateid);
+}
+
 static __be32
 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
 {
@@ -1874,7 +1881,7 @@ static const nfsd4_dec nfsd4_dec_ops[] = {
     [OP_LAYOUTERROR]    = (nfsd4_dec)nfsd4_decode_notsupp,
     [OP_LAYOUTSTATS]    = (nfsd4_dec)nfsd4_decode_notsupp,
     [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp,
-    [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp,
+    [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status,
     [OP_READ_PLUS]      = (nfsd4_dec)nfsd4_decode_notsupp,
     [OP_SEEK]           = (nfsd4_dec)nfsd4_decode_seek,
     [OP_WRITE_SAME]     = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -4256,6 +4263,22 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
     return 0;
 }
 
+static __be32
+nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
+                            struct nfsd4_offload_status *os)
+{
+    struct xdr_stream *xdr = &resp->xdr;
+    __be32 *p;
+
+    p = xdr_reserve_space(xdr, 8 + 4);
+    if (!p)
+        return nfserr_resource;
+    p = xdr_encode_hyper(p, os->count);
+    *p++ = cpu_to_be32(0);
+
+    return nfserr;
+}
+
 static __be32
 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
                   struct nfsd4_seek *seek)
@@ -4359,7 +4382,7 @@ static const nfsd4_enc nfsd4_enc_ops[] = {
     [OP_LAYOUTERROR]    = (nfsd4_enc)nfsd4_encode_noop,
     [OP_LAYOUTSTATS]    = (nfsd4_enc)nfsd4_encode_noop,
     [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
-    [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop,
+    [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status,
     [OP_READ_PLUS]      = (nfsd4_enc)nfsd4_encode_noop,
     [OP_SEEK]           = (nfsd4_enc)nfsd4_encode_seek,
     [OP_WRITE_SAME]     = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index b7c34f4a1222..06cf218944c5 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -545,6 +545,15 @@ struct nfsd4_seek {
     loff_t      seek_pos;
 };
 
+struct nfsd4_offload_status {
+    /* request */
+    stateid_t   stateid;
+
+    /* response */
+    u64         count;
+    u32         status;
+};
+
 struct nfsd4_op {
     int                 opnum;
     const struct nfsd4_operation *  opdesc;
@@ -603,6 +612,7 @@ struct nfsd4_op {
         struct nfsd4_fallocate          deallocate;
         struct nfsd4_clone              clone;
         struct nfsd4_copy               copy;
+        struct nfsd4_offload_status     offload_status;
         struct nfsd4_seek               seek;
     } u;
     struct nfs4_replay *                replay;

From 885e2bf3ea5121975ade0d7866ab6226a8547dc9 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia
Date: Fri, 20 Jul 2018 18:19:19 -0400
Subject: [PATCH 03/30] NFSD OFFLOAD_CANCEL xdr

Signed-off-by: Olga Kornievskaia
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/nfs4proc.c | 14 ++++++++++++++
 fs/nfsd/nfs4xdr.c  |  2 +-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index aa05aa3947e0..0c7832321010 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1122,6 +1122,14 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
     return status;
 }
 
+static __be32
+nfsd4_offload_cancel(struct svc_rqst *rqstp,
+                     struct nfsd4_compound_state *cstate,
+                     union nfsd4_op_u *u)
+{
+    return 0;
+}
+
 static __be32
 nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                 struct nfsd4_fallocate *fallocate, int flags)
@@ -2480,6 +2488,12 @@ static const struct nfsd4_operation nfsd4_ops[] = {
         .op_name = "OP_OFFLOAD_STATUS",
         .op_rsize_bop = nfsd4_offload_status_rsize,
     },
+    [OP_OFFLOAD_CANCEL] = {
+        .op_func = nfsd4_offload_cancel,
+        .op_flags = OP_MODIFIES_SOMETHING,
+        .op_name = "OP_OFFLOAD_CANCEL",
+        .op_rsize_bop = nfsd4_only_status_rsize,
+    },
 };
 
 /**
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 67b7e6f19ed6..b78280a8af73 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1880,7 +1880,7 @@ static const nfsd4_dec nfsd4_dec_ops[] = {
     [OP_IO_ADVISE]      = (nfsd4_dec)nfsd4_decode_notsupp,
     [OP_LAYOUTERROR]    = (nfsd4_dec)nfsd4_decode_notsupp,
     [OP_LAYOUTSTATS]    = (nfsd4_dec)nfsd4_decode_notsupp,
-    [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp,
+    [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status,
     [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status,
     [OP_READ_PLUS]      = (nfsd4_dec)nfsd4_decode_notsupp,
     [OP_SEEK]           = (nfsd4_dec)nfsd4_decode_seek,
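The op_rsize_bop estimates above budget the reply in 4-byte XDR words: an operation header, two words for the 64-bit osr_count, and one word for the empty osr_complete<1> array. A toy check of that arithmetic, assuming op_encode_hdr_size is 2 words (an assumption about nfsd's xdr4.h, not stated in this series):

    #include <stdio.h>
    #include <stdint.h>

    /* Assumed stand-in for nfsd's op_encode_hdr_size. */
    #define OP_ENCODE_HDR_SIZE 2

    int main(void)
    {
        /* Mirrors nfsd4_offload_status_rsize: header + osr_count
         * (a u64 is 2 XDR words) + osr_complete<1> as an empty array. */
        uint32_t words = OP_ENCODE_HDR_SIZE + 2 + 1;

        printf("max OFFLOAD_STATUS reply: %u bytes\n", words * 4);
        return 0;
    }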
From e0639dc5805a9d4faaa2c07ad98fa853b9529dd3 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia
Date: Fri, 20 Jul 2018 18:19:20 -0400
Subject: [PATCH 04/30] NFSD introduce async copy feature

Upon receiving a request for async copy, create a new kthread. If we
get an asynchronous request, make sure to copy the needed
arguments/state from the stack before starting the copy. Then start
the thread and reply back to the client indicating copy is
asynchronous.

nfsd_copy_file_range() will copy in a loop over the total number of
bytes that need to be copied. In case a failure happens in the middle,
we ignore the error and return how much we copied so far. Once done,
create a work item for the callback workqueue and send CB_OFFLOAD with
the results.

The lifetime of the copy stateid is bound to the vfs copy. This way we
don't need to keep the nfsd_net structure for the callback. We could
keep it around longer so that an OFFLOAD_STATUS that came late would
still get results, but clients should be able to deal without that.

We handle OFFLOAD_CANCEL by sending a signal to the copy thread and
calling kthread_stop.

A client should cancel any ongoing copies before calling
DESTROY_CLIENT; if not, we return a CLIENT_BUSY error.

If the client is destroyed for some other reason (lease expiration, or
server shutdown), we must clean up any ongoing copies ourselves.

Signed-off-by: Olga Kornievskaia
[colin.king@canonical.com: fix leak in error case]
[bfields@fieldses.org: remove signalling, merge patches]
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/netns.h     |   8 ++
 fs/nfsd/nfs4proc.c  | 261 ++++++++++++++++++++++++++++++++++++++++----
 fs/nfsd/nfs4state.c |  38 ++++++-
 fs/nfsd/nfs4xdr.c   |  23 +++-
 fs/nfsd/nfsctl.c    |   1 +
 fs/nfsd/state.h     |   9 ++
 fs/nfsd/xdr4.h      |  12 ++
 7 files changed, 327 insertions(+), 25 deletions(-)

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 426f55005697..32cb8c027483 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -123,6 +123,14 @@ struct nfsd_net {
 
     wait_queue_head_t ntf_wq;
     atomic_t ntf_refcnt;
+
+    /*
+     * clientid and stateid data for construction of net unique COPY
+     * stateids.
+     */
+    u32             s2s_cp_cl_id;
+    struct idr      s2s_cp_stateids;
+    spinlock_t      s2s_cp_lock;
 };
 
 /* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 0c7832321010..edff074d38c7 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -36,6 +36,7 @@
 #include <linux/file.h>
 #include <linux/falloc.h>
 #include <linux/slab.h>
+#include <linux/kthread.h>
 
 #include "idmap.h"
 #include "cache.h"
@@ -1089,37 +1090,236 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
     return status;
 }
 
+void nfs4_put_copy(struct nfsd4_copy *copy)
+{
+    if (!refcount_dec_and_test(&copy->refcount))
+        return;
+    kfree(copy);
+}
+
+static bool
+check_and_set_stop_copy(struct nfsd4_copy *copy)
+{
+    bool value;
+
+    spin_lock(&copy->cp_clp->async_lock);
+    value = copy->stopped;
+    if (!copy->stopped)
+        copy->stopped = true;
+    spin_unlock(&copy->cp_clp->async_lock);
+    return value;
+}
+
+static void nfsd4_stop_copy(struct nfsd4_copy *copy)
+{
+    /* only 1 thread should stop the copy */
+    if (!check_and_set_stop_copy(copy))
+        kthread_stop(copy->copy_task);
+    nfs4_put_copy(copy);
+}
+
+static struct nfsd4_copy *nfsd4_get_copy(struct nfs4_client *clp)
+{
+    struct nfsd4_copy *copy = NULL;
+
+    spin_lock(&clp->async_lock);
+    if (!list_empty(&clp->async_copies)) {
+        copy = list_first_entry(&clp->async_copies, struct nfsd4_copy,
+                                copies);
+        refcount_inc(&copy->refcount);
+    }
+    spin_unlock(&clp->async_lock);
+    return copy;
+}
+
+void nfsd4_shutdown_copy(struct nfs4_client *clp)
+{
+    struct nfsd4_copy *copy;
+
+    while ((copy = nfsd4_get_copy(clp)) != NULL)
+        nfsd4_stop_copy(copy);
+}
+
+static void nfsd4_cb_offload_release(struct nfsd4_callback *cb)
+{
+    struct nfsd4_copy *copy = container_of(cb, struct nfsd4_copy, cp_cb);
+
+    nfs4_put_copy(copy);
+}
+
+static int nfsd4_cb_offload_done(struct nfsd4_callback *cb,
+                                 struct rpc_task *task)
+{
+    return 1;
+}
+
+static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = {
+    .release = nfsd4_cb_offload_release,
+    .done = nfsd4_cb_offload_done
+};
+
+static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync)
+{
+    copy->cp_res.wr_stable_how = NFS_UNSTABLE;
+    copy->cp_synchronous = sync;
+    gen_boot_verifier(&copy->cp_res.wr_verifier, copy->cp_clp->net);
+}
+
+static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
+{
+    ssize_t bytes_copied = 0;
+    size_t bytes_total = copy->cp_count;
+    u64 src_pos = copy->cp_src_pos;
+    u64 dst_pos = copy->cp_dst_pos;
+
+    do {
+        if (kthread_should_stop())
+            break;
+        bytes_copied = nfsd_copy_file_range(copy->file_src, src_pos,
+                copy->file_dst, dst_pos, bytes_total);
+        if (bytes_copied <= 0)
+            break;
+        bytes_total -= bytes_copied;
+        copy->cp_res.wr_bytes_written += bytes_copied;
+        src_pos += bytes_copied;
+        dst_pos += bytes_copied;
+    } while (bytes_total > 0 && !copy->cp_synchronous);
+    return bytes_copied;
+}
+
+static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync)
+{
+    __be32 status;
+    ssize_t bytes;
+
+    bytes = _nfsd_copy_file_range(copy);
+    /* for async copy, we ignore the error, client can always retry
+     * to get the error
+     */
+    if (bytes < 0 && !copy->cp_res.wr_bytes_written)
+        status = nfserrno(bytes);
+    else {
+        nfsd4_init_copy_res(copy, sync);
+        status = nfs_ok;
+    }
+
+    fput(copy->file_src);
+    fput(copy->file_dst);
+    return status;
+}
+
+static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst)
+{
+    dst->cp_src_pos = src->cp_src_pos;
+    dst->cp_dst_pos = src->cp_dst_pos;
+    dst->cp_count = src->cp_count;
+    dst->cp_synchronous = src->cp_synchronous;
+    memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res));
+    memcpy(&dst->fh, &src->fh, sizeof(src->fh));
+    dst->cp_clp = src->cp_clp;
+    dst->file_dst = get_file(src->file_dst);
+    dst->file_src = get_file(src->file_src);
+    memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid));
+}
+
+static void cleanup_async_copy(struct nfsd4_copy *copy)
+{
+    nfs4_free_cp_state(copy);
+    fput(copy->file_dst);
+    fput(copy->file_src);
+    spin_lock(&copy->cp_clp->async_lock);
+    list_del(&copy->copies);
+    spin_unlock(&copy->cp_clp->async_lock);
+    nfs4_put_copy(copy);
+}
+
+static int nfsd4_do_async_copy(void *data)
+{
+    struct nfsd4_copy *copy = (struct nfsd4_copy *)data;
+    struct nfsd4_copy *cb_copy;
+
+    copy->nfserr = nfsd4_do_copy(copy, 0);
+    cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
+    if (!cb_copy)
+        goto out;
+    memcpy(&cb_copy->cp_res, &copy->cp_res, sizeof(copy->cp_res));
+    cb_copy->cp_clp = copy->cp_clp;
+    cb_copy->nfserr = copy->nfserr;
+    memcpy(&cb_copy->fh, &copy->fh, sizeof(copy->fh));
+    nfsd4_init_cb(&cb_copy->cp_cb, cb_copy->cp_clp,
+                  &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD);
+    nfsd4_run_cb(&cb_copy->cp_cb);
+out:
+    cleanup_async_copy(copy);
+    return 0;
+}
+
 static __be32
 nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
            union nfsd4_op_u *u)
 {
     struct nfsd4_copy *copy = &u->copy;
-    struct file *src, *dst;
     __be32 status;
-    ssize_t bytes;
+    struct nfsd4_copy *async_copy = NULL;
 
-    status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid, &src,
-                               &copy->cp_dst_stateid, &dst);
+    status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid,
+                               &copy->file_src, &copy->cp_dst_stateid,
+                               &copy->file_dst);
     if (status)
         goto out;
 
-    bytes = nfsd_copy_file_range(src, copy->cp_src_pos,
-                                 dst, copy->cp_dst_pos, copy->cp_count);
+    copy->cp_clp = cstate->clp;
+    memcpy(&copy->fh, &cstate->current_fh.fh_handle,
+           sizeof(struct knfsd_fh));
+    if (!copy->cp_synchronous) {
+        struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
-    if (bytes < 0)
-        status = nfserrno(bytes);
-    else {
-        copy->cp_res.wr_bytes_written = bytes;
-        copy->cp_res.wr_stable_how = NFS_UNSTABLE;
-        copy->cp_synchronous = 1;
-        gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp));
+        status = nfserrno(-ENOMEM);
+        async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
+        if (!async_copy)
+            goto out;
+        if (!nfs4_init_cp_state(nn, copy)) {
+            kfree(async_copy);
+            goto out;
+        }
+        refcount_set(&async_copy->refcount, 1);
+        memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid,
+               sizeof(copy->cp_stateid));
+        dup_copy_fields(copy, async_copy);
+        async_copy->copy_task = kthread_create(nfsd4_do_async_copy,
+                async_copy, "%s", "copy thread");
+        if (IS_ERR(async_copy->copy_task))
+            goto out_err;
+        spin_lock(&async_copy->cp_clp->async_lock);
+        list_add(&async_copy->copies,
+                 &async_copy->cp_clp->async_copies);
+        spin_unlock(&async_copy->cp_clp->async_lock);
+        wake_up_process(async_copy->copy_task);
         status = nfs_ok;
-    }
-
-    fput(src);
-    fput(dst);
+    } else
+        status = nfsd4_do_copy(copy, 1);
 out:
     return status;
+out_err:
+    cleanup_async_copy(async_copy);
+    goto out;
+}
+
+struct nfsd4_copy *
+find_async_copy(struct nfs4_client *clp, stateid_t *stateid)
+{
+    struct nfsd4_copy *copy;
+
+    spin_lock(&clp->async_lock);
+    list_for_each_entry(copy, &clp->async_copies, copies) {
+        if (memcmp(&copy->cp_stateid, stateid, NFS4_STATEID_SIZE))
+            continue;
+        refcount_inc(&copy->refcount);
+        spin_unlock(&clp->async_lock);
+        return copy;
+    }
+    spin_unlock(&clp->async_lock);
+    return NULL;
 }
 
 static __be32
@@ -1127,7 +1327,18 @@ nfsd4_offload_cancel(struct svc_rqst *rqstp,
                      struct nfsd4_compound_state *cstate,
                      union nfsd4_op_u *u)
 {
-    return 0;
+    struct nfsd4_offload_status *os = &u->offload_status;
+    __be32 status = 0;
+    struct nfsd4_copy *copy;
+    struct nfs4_client *clp = cstate->clp;
+
+    copy = find_async_copy(clp, &os->stateid);
+    if (copy)
+        nfsd4_stop_copy(copy);
+    else
+        status = nfserr_bad_stateid;
+
+    return status;
 }
 
 static __be32
@@ -1157,7 +1368,19 @@ nfsd4_offload_status(struct svc_rqst *rqstp,
                      struct nfsd4_compound_state *cstate,
                      union nfsd4_op_u *u)
 {
-    return nfserr_notsupp;
+    struct nfsd4_offload_status *os = &u->offload_status;
+    __be32 status = 0;
+    struct nfsd4_copy *copy;
+    struct nfs4_client *clp = cstate->clp;
+
+    copy = find_async_copy(clp, &os->stateid);
+    if (copy) {
+        os->count = copy->cp_res.wr_bytes_written;
+        nfs4_put_copy(copy);
+    } else
+        status = nfserr_bad_stateid;
+
+    return status;
 }
 
 static __be32
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b0ca0efd2875..07a57d024f95 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -713,6 +713,36 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *sla
     return NULL;
 }
 
+/*
+ * Create a unique stateid_t to represent each COPY.
+ */
+int nfs4_init_cp_state(struct nfsd_net *nn, struct nfsd4_copy *copy)
+{
+    int new_id;
+
+    idr_preload(GFP_KERNEL);
+    spin_lock(&nn->s2s_cp_lock);
+    new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, copy, 0, 0, GFP_NOWAIT);
+    spin_unlock(&nn->s2s_cp_lock);
+    idr_preload_end();
+    if (new_id < 0)
+        return 0;
+    copy->cp_stateid.si_opaque.so_id = new_id;
+    copy->cp_stateid.si_opaque.so_clid.cl_boot = nn->boot_time;
+    copy->cp_stateid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id;
+    return 1;
+}
+
+void nfs4_free_cp_state(struct nfsd4_copy *copy)
+{
+    struct nfsd_net *nn;
+
+    nn = net_generic(copy->cp_clp->net, nfsd_net_id);
+    spin_lock(&nn->s2s_cp_lock);
+    idr_remove(&nn->s2s_cp_stateids, copy->cp_stateid.si_opaque.so_id);
+    spin_unlock(&nn->s2s_cp_lock);
+}
+
 static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
 {
     struct nfs4_stid *stid;
@@ -1827,6 +1857,8 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 #ifdef CONFIG_NFSD_PNFS
     INIT_LIST_HEAD(&clp->cl_lo_states);
 #endif
+    INIT_LIST_HEAD(&clp->async_copies);
+    spin_lock_init(&clp->async_lock);
     spin_lock_init(&clp->cl_lock);
     rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
     return clp;
@@ -1942,6 +1974,7 @@ __destroy_client(struct nfs4_client *clp)
         }
     }
     nfsd4_return_all_client_layouts(clp);
+    nfsd4_shutdown_copy(clp);
     nfsd4_shutdown_callback(clp);
     if (clp->cl_cb_conn.cb_xprt)
         svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -2475,7 +2508,8 @@ static bool client_has_state(struct nfs4_client *clp)
         || !list_empty(&clp->cl_lo_states)
 #endif
         || !list_empty(&clp->cl_delegations)
-        || !list_empty(&clp->cl_sessions);
+        || !list_empty(&clp->cl_sessions)
+        || !list_empty(&clp->async_copies);
 }
 
 __be32
@@ -7161,6 +7195,8 @@ static int nfs4_state_create_net(struct net *net)
     INIT_LIST_HEAD(&nn->close_lru);
     INIT_LIST_HEAD(&nn->del_recall_lru);
     spin_lock_init(&nn->client_lock);
+    spin_lock_init(&nn->s2s_cp_lock);
+    idr_init(&nn->s2s_cp_stateids);
 
     spin_lock_init(&nn->blocked_locks_lock);
     INIT_LIST_HEAD(&nn->blocked_locks_lru);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index b78280a8af73..3de42a729093 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4231,15 +4231,27 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
 #endif /* CONFIG_NFSD_PNFS */
 
 static __be32
-nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write)
+nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
+                        struct nfsd42_write_res *write, bool sync)
 {
     __be32 *p;
-
-    p = xdr_reserve_space(&resp->xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
+    p = xdr_reserve_space(&resp->xdr, 4);
+    if (!p)
+        return nfserr_resource;
+
+    if (sync)
+        *p++ = cpu_to_be32(0);
+    else {
+        __be32 nfserr;
+        *p++ = cpu_to_be32(1);
+        nfserr = nfsd4_encode_stateid(&resp->xdr, &write->cb_stateid);
+        if (nfserr)
+            return nfserr;
+    }
+    p = xdr_reserve_space(&resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE);
     if (!p)
         return nfserr_resource;
-    *p++ = cpu_to_be32(0);
     p = xdr_encode_hyper(p, write->wr_bytes_written);
     *p++ = cpu_to_be32(write->wr_stable_how);
     p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
@@ -4253,7 +4265,8 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
     __be32 *p;
 
-    nfserr = nfsd42_encode_write_res(resp, &copy->cp_res);
+    nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
+                                     copy->cp_synchronous);
     if (nfserr)
         return nfserr;
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7fb9f7c667b1..6384c9b94898 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1242,6 +1242,7 @@ static __net_init int nfsd_init_net(struct net *net)
     nn->somebody_reclaimed = false;
     nn->clverifier_counter = prandom_u32();
     nn->clientid_counter = prandom_u32();
+    nn->s2s_cp_cl_id = nn->clientid_counter++;
 
     atomic_set(&nn->ntf_refcnt, 0);
     init_waitqueue_head(&nn->ntf_wq);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 6e38d9927448..6aacb325b6a0 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -355,6 +355,8 @@ struct nfs4_client {
     struct rpc_wait_queue cl_cb_waitq;  /* backchannel callers may */
                                         /* wait here for slots */
     struct net *net;
+    struct list_head    async_copies;  /* list of async copies */
+    spinlock_t          async_lock;    /* lock for async copies */
 };
 
 /* struct nfs4_client_reset
@@ -600,6 +602,7 @@ struct nfsd4_blocked_lock {
 
 struct nfsd4_compound_state;
 struct nfsd_net;
+struct nfsd4_copy;
 
 extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
         struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
@@ -609,6 +612,8 @@ __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
         struct nfs4_stid **s, struct nfsd_net *nn);
 struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab,
         void (*sc_free)(struct nfs4_stid *));
+int nfs4_init_cp_state(struct nfsd_net *nn, struct nfsd4_copy *copy);
+void nfs4_free_cp_state(struct nfsd4_copy *copy);
 void nfs4_unhash_stid(struct nfs4_stid *s);
 void nfs4_put_stid(struct nfs4_stid *s);
 void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
@@ -627,6 +632,7 @@ extern void nfsd4_run_cb(struct nfsd4_callback *cb);
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
 extern void nfsd4_shutdown_callback(struct nfs4_client *);
+extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
 extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
 extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
         struct nfsd_net *nn);
@@ -634,6 +640,9 @@ extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
 
 struct nfs4_file *find_file(struct knfsd_fh *fh);
 void put_nfs4_file(struct nfs4_file *fi);
+extern void nfs4_put_copy(struct nfsd4_copy *copy);
+extern struct nfsd4_copy *
+find_async_copy(struct nfs4_client *clp, stateid_t *stateid);
 static inline void get_nfs4_file(struct nfs4_file *fi)
 {
     refcount_inc(&fi->fi_ref);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 06cf218944c5..feeb6d4bdffd 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -532,6 +532,18 @@ struct nfsd4_copy {
     struct nfsd4_callback   cp_cb;
     __be32                  nfserr;
     struct knfsd_fh         fh;
+
+    struct nfs4_client      *cp_clp;
+
+    struct file             *file_src;
+    struct file             *file_dst;
+
+    stateid_t               cp_stateid;
+
+    struct list_head        copies;
+    struct task_struct      *copy_task;
+    refcount_t              refcount;
+    bool                    stopped;
 };
 
 struct nfsd4_seek {
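The heart of the async copy is _nfsd_copy_file_range(): copy in bounded chunks, accumulate wr_bytes_written, and bail out between chunks when the kthread is asked to stop. A loose user-space analogue using copy_file_range(2) (glibc 2.27+); the atomic flag stands in for kthread_should_stop(), and this is an illustrative sketch, not the kernel code:

    #define _GNU_SOURCE
    #include <unistd.h>
    #include <stdatomic.h>
    #include <sys/types.h>

    static atomic_bool stop_requested;  /* stands in for kthread_should_stop() */

    /*
     * Copy up to 'count' bytes in chunks; on error or cancellation
     * partway through, report the bytes already copied instead of
     * failing the whole operation.
     */
    static size_t chunked_copy(int fd_in, int fd_out, loff_t src, loff_t dst,
                               size_t count)
    {
        size_t copied = 0;

        while (count > 0 && !atomic_load(&stop_requested)) {
            ssize_t n = copy_file_range(fd_in, &src, fd_out, &dst,
                                        count, 0);
            if (n <= 0)
                break;
            copied += n;
            count -= n;
        }
        return copied;
    }

As in the kernel loop, partial progress is the result, which is exactly the value a later OFFLOAD_STATUS reports as osr_count.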
From 7d20b6a2728fe3ea9fa8f4fc49cd438fcc781dd1 Mon Sep 17 00:00:00 2001
From: YueHaibing
Date: Tue, 25 Sep 2018 11:22:53 +0000
Subject: [PATCH 05/30] nfsd: remove set but not used variable 'dirp'

Fixes gcc '-Wunused-but-set-variable' warning:

fs/nfsd/vfs.c: In function 'nfsd_create':
fs/nfsd/vfs.c:1279:16: warning:
 variable 'dirp' set but not used [-Wunused-but-set-variable]

Signed-off-by: YueHaibing
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/vfs.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 55a099e47ba2..ddbac8776bd3 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1275,7 +1275,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
         int type, dev_t rdev, struct svc_fh *resfhp)
 {
     struct dentry *dentry, *dchild = NULL;
-    struct inode *dirp;
     __be32 err;
     int host_err;
 
@@ -1287,7 +1286,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
         return err;
 
     dentry = fhp->fh_dentry;
-    dirp = d_inode(dentry);
 
     host_err = fh_want_write(fhp);
     if (host_err)

From 30382d6ce593f863292cd7c026a79077c4e7280f Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Mon, 1 Oct 2018 10:41:43 -0400
Subject: [PATCH 06/30] SUNRPC: Remove the server 'authtab_lock' and just use RCU

Module removal is RCU safe by design, so we really have no need to
lock the 'authtab[]' array.

Signed-off-by: Trond Myklebust
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/svcauth.c | 52 +++++++++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 18 deletions(-)

diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index bb8db3cb8032..f83443856cd1 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -27,12 +27,32 @@
 extern struct auth_ops svcauth_null;
 extern struct auth_ops svcauth_unix;
 
-static DEFINE_SPINLOCK(authtab_lock);
-static struct auth_ops *authtab[RPC_AUTH_MAXFLAVOR] = {
-    [0] = &svcauth_null,
-    [1] = &svcauth_unix,
+static struct auth_ops __rcu *authtab[RPC_AUTH_MAXFLAVOR] = {
+    [RPC_AUTH_NULL] = (struct auth_ops __force __rcu *)&svcauth_null,
+    [RPC_AUTH_UNIX] = (struct auth_ops __force __rcu *)&svcauth_unix,
 };
 
+static struct auth_ops *
+svc_get_auth_ops(rpc_authflavor_t flavor)
+{
+    struct auth_ops *aops;
+
+    if (flavor >= RPC_AUTH_MAXFLAVOR)
+        return NULL;
+    rcu_read_lock();
+    aops = rcu_dereference(authtab[flavor]);
+    if (aops != NULL && !try_module_get(aops->owner))
+        aops = NULL;
+    rcu_read_unlock();
+    return aops;
+}
+
+static void
+svc_put_auth_ops(struct auth_ops *aops)
+{
+    module_put(aops->owner);
+}
+
 int
 svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
 {
@@ -45,14 +65,11 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
 
     dprintk("svc: svc_authenticate (%d)\n", flavor);
 
-    spin_lock(&authtab_lock);
-    if (flavor >= RPC_AUTH_MAXFLAVOR || !(aops = authtab[flavor]) ||
-        !try_module_get(aops->owner)) {
-        spin_unlock(&authtab_lock);
+    aops = svc_get_auth_ops(flavor);
+    if (aops == NULL) {
         *authp = rpc_autherr_badcred;
         return SVC_DENIED;
     }
-    spin_unlock(&authtab_lock);
 
     rqstp->rq_auth_slack = 0;
     init_svc_cred(&rqstp->rq_cred);
@@ -82,7 +99,7 @@ int svc_authorise(struct svc_rqst *rqstp)
 
     if (aops) {
         rv = aops->release(rqstp);
-        module_put(aops->owner);
+        svc_put_auth_ops(aops);
     }
     return rv;
 }
@@ -90,13 +107,14 @@ int svc_authorise(struct svc_rqst *rqstp)
 int
 svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops)
 {
+    struct auth_ops *old;
     int rv = -EINVAL;
-    spin_lock(&authtab_lock);
-    if (flavor < RPC_AUTH_MAXFLAVOR && authtab[flavor] == NULL) {
-        authtab[flavor] = aops;
-        rv = 0;
+
+    if (flavor < RPC_AUTH_MAXFLAVOR) {
+        old = cmpxchg((struct auth_ops ** __force)&authtab[flavor], NULL, aops);
+        if (old == NULL || old == aops)
+            rv = 0;
     }
-    spin_unlock(&authtab_lock);
     return rv;
 }
 EXPORT_SYMBOL_GPL(svc_auth_register);
@@ -104,10 +122,8 @@ EXPORT_SYMBOL_GPL(svc_auth_register);
 void
 svc_auth_unregister(rpc_authflavor_t flavor)
 {
-    spin_lock(&authtab_lock);
     if (flavor < RPC_AUTH_MAXFLAVOR)
-        authtab[flavor] = NULL;
-    spin_unlock(&authtab_lock);
+        rcu_assign_pointer(authtab[flavor], NULL);
 }
 EXPORT_SYMBOL_GPL(svc_auth_unregister);
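The registration side works without a lock because a slot only ever transitions from NULL to a value, and cmpxchg() makes that transition atomic. A simplified user-space model of the same idea with C11 atomics (the table and the ops struct are invented for illustration, not the kernel API):

    #include <stdatomic.h>
    #include <stddef.h>

    #define MAXFLAVOR 8

    struct auth_ops { const char *name; };

    static _Atomic(struct auth_ops *) authtab[MAXFLAVOR];

    /* Succeeds only if the slot was empty (or already holds 'aops'),
     * mirroring the cmpxchg() in svc_auth_register(). */
    static int auth_register(unsigned int flavor, struct auth_ops *aops)
    {
        struct auth_ops *expected = NULL;

        if (flavor >= MAXFLAVOR)
            return -1;
        if (atomic_compare_exchange_strong(&authtab[flavor], &expected, aops))
            return 0;
        return expected == aops ? 0 : -1;
    }

    /* Readers just do an atomic load; in the kernel version it is the
     * module refcount that keeps the ops structure itself alive. */
    static struct auth_ops *auth_get(unsigned int flavor)
    {
        return flavor < MAXFLAVOR ? atomic_load(&authtab[flavor]) : NULL;
    }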
From 608a0ab2f54ab0e301ad76a41aad979ea0d02670 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Mon, 1 Oct 2018 10:41:44 -0400
Subject: [PATCH 07/30] SUNRPC: Add lockless lookup of the server's auth domain

Avoid taking the global auth_domain_lock in most lookups of the auth
domain by adding an RCU protected lookup.

Signed-off-by: Trond Myklebust
Signed-off-by: J. Bruce Fields
---
 include/linux/sunrpc/svcauth.h    |  1 +
 net/sunrpc/auth_gss/svcauth_gss.c |  9 ++++++++-
 net/sunrpc/svcauth.c              | 22 +++++++++++++++++++---
 net/sunrpc/svcauth_unix.c         | 10 ++++++++--
 4 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h
index 04e404a07882..3e53a6e2ada7 100644
--- a/include/linux/sunrpc/svcauth.h
+++ b/include/linux/sunrpc/svcauth.h
@@ -82,6 +82,7 @@ struct auth_domain {
     struct hlist_node   hash;
     char                *name;
     struct auth_ops     *flavour;
+    struct rcu_head     rcu_head;
 };
 
 /*
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 860f2a1bbb67..87c71fb0f0ea 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1764,14 +1764,21 @@ svcauth_gss_release(struct svc_rqst *rqstp)
 }
 
 static void
-svcauth_gss_domain_release(struct auth_domain *dom)
+svcauth_gss_domain_release_rcu(struct rcu_head *head)
 {
+    struct auth_domain *dom = container_of(head, struct auth_domain, rcu_head);
     struct gss_domain *gd = container_of(dom, struct gss_domain, h);
 
     kfree(dom->name);
     kfree(gd);
 }
 
+static void
+svcauth_gss_domain_release(struct auth_domain *dom)
+{
+    call_rcu(&dom->rcu_head, svcauth_gss_domain_release_rcu);
+}
+
 static struct auth_ops svcauthops_gss = {
     .name   = "rpcsec_gss",
     .owner  = THIS_MODULE,
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index f83443856cd1..775b8c94265b 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -143,10 +143,11 @@ static struct hlist_head auth_domain_table[DN_HASHMAX];
 static DEFINE_SPINLOCK(auth_domain_lock);
 
 static void auth_domain_release(struct kref *kref)
+    __releases(&auth_domain_lock)
 {
     struct auth_domain *dom = container_of(kref, struct auth_domain, ref);
 
-    hlist_del(&dom->hash);
+    hlist_del_rcu(&dom->hash);
     dom->flavour->domain_release(dom);
     spin_unlock(&auth_domain_lock);
 }
@@ -175,7 +176,7 @@ auth_domain_lookup(char *name, struct auth_domain *new)
         }
     }
     if (new)
-        hlist_add_head(&new->hash, head);
+        hlist_add_head_rcu(&new->hash, head);
     spin_unlock(&auth_domain_lock);
     return new;
 }
@@ -183,6 +184,21 @@ EXPORT_SYMBOL_GPL(auth_domain_lookup);
 
 struct auth_domain *auth_domain_find(char *name)
 {
-    return auth_domain_lookup(name, NULL);
+    struct auth_domain *hp;
+    struct hlist_head *head;
+
+    head = &auth_domain_table[hash_str(name, DN_HASHBITS)];
+
+    rcu_read_lock();
+    hlist_for_each_entry_rcu(hp, head, hash) {
+        if (strcmp(hp->name, name)==0) {
+            if (!kref_get_unless_zero(&hp->ref))
+                hp = NULL;
+            rcu_read_unlock();
+            return hp;
+        }
+    }
+    rcu_read_unlock();
+    return NULL;
 }
 EXPORT_SYMBOL_GPL(auth_domain_find);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index af7f28fb8102..84cf39021a03 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -37,20 +37,26 @@ struct unix_domain {
 extern struct auth_ops svcauth_null;
 extern struct auth_ops svcauth_unix;
 
-static void svcauth_unix_domain_release(struct auth_domain *dom)
+static void svcauth_unix_domain_release_rcu(struct rcu_head *head)
 {
+    struct auth_domain *dom = container_of(head, struct auth_domain, rcu_head);
     struct unix_domain *ud = container_of(dom, struct unix_domain, h);
 
     kfree(dom->name);
     kfree(ud);
 }
 
+static void svcauth_unix_domain_release(struct auth_domain *dom)
+{
+    call_rcu(&dom->rcu_head, svcauth_unix_domain_release_rcu);
+}
+
 struct auth_domain *unix_domain_find(char *name)
 {
     struct auth_domain *rv;
     struct unix_domain *new = NULL;
 
-    rv = auth_domain_lookup(name, NULL);
+    rv = auth_domain_find(name);
     while(1) {
         if (rv) {
             if (new && rv != &new->h)

From b92a8fababa9d18910a54b957be55fef7f603531 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Mon, 1 Oct 2018 10:41:45 -0400
Subject: [PATCH 08/30] SUNRPC: Refactor sunrpc_cache_lookup

This is a trivial split into lookup and insert functions, no change
in behavior.

Signed-off-by: Trond Myklebust
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/cache.c | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 109fbe591e7b..aa8e62d61f4d 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -54,13 +54,11 @@ static void cache_init(struct cache_head *h, struct cache_detail *detail)
     h->last_refresh = now;
 }
 
-struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
-                                       struct cache_head *key, int hash)
+static struct cache_head *sunrpc_cache_find(struct cache_detail *detail,
+                                            struct cache_head *key, int hash)
 {
-    struct cache_head *new = NULL, *freeme = NULL, *tmp = NULL;
-    struct hlist_head *head;
-
-    head = &detail->hash_table[hash];
+    struct hlist_head *head = &detail->hash_table[hash];
+    struct cache_head *tmp;
 
     read_lock(&detail->hash_lock);
 
@@ -75,7 +73,15 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
         }
     }
     read_unlock(&detail->hash_lock);
-    /* Didn't find anything, insert an empty entry */
+    return NULL;
+}
+
+static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
+                                                 struct cache_head *key,
+                                                 int hash)
+{
+    struct cache_head *new, *tmp, *freeme = NULL;
+    struct hlist_head *head = &detail->hash_table[hash];
 
     new = detail->alloc();
     if (!new)
@@ -114,8 +120,19 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
     cache_put(freeme, detail);
     return new;
 }
-EXPORT_SYMBOL_GPL(sunrpc_cache_lookup);
 
+struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
+                                       struct cache_head *key, int hash)
+{
+    struct cache_head *ret;
+
+    ret = sunrpc_cache_find(detail, key, hash);
+    if (ret)
+        return ret;
+    /* Didn't find anything, insert an empty entry */
+    return sunrpc_cache_add_entry(detail, key, hash);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_lookup);
 
 static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);
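The value of this lookup/insert split shows up in the next patch: the find side can move to RCU while the insert side keeps a writer lock and re-checks for a racing insert. The shape of that pattern, sketched with a pthreads rwlock rather than the kernel's primitives (all names here are illustrative):

    #include <pthread.h>
    #include <stdlib.h>

    struct entry {
        int key;
        struct entry *next;
    };

    /* Toy model: one bucket, and entries are never freed, so returning
     * a pointer after dropping the lock is safe here. */
    static struct entry *bucket;
    static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

    static struct entry *cache_find(int key)
    {
        struct entry *e;

        pthread_rwlock_rdlock(&lock);
        for (e = bucket; e; e = e->next)
            if (e->key == key)
                break;
        pthread_rwlock_unlock(&lock);
        return e;
    }

    /* The insert path re-checks under the writer lock in case the entry
     * appeared while we were allocating -- the same double-check that
     * sunrpc_cache_add_entry() performs. */
    static struct entry *cache_add(int key)
    {
        struct entry *e, *new = malloc(sizeof(*new));

        if (!new)
            return NULL;
        new->key = key;
        pthread_rwlock_wrlock(&lock);
        for (e = bucket; e; e = e->next)
            if (e->key == key)
                break;
        if (e) {
            free(new);
        } else {
            new->next = bucket;
            bucket = new;
            e = new;
        }
        pthread_rwlock_unlock(&lock);
        return e;
    }

    struct entry *cache_lookup(int key)
    {
        struct entry *e = cache_find(key);

        return e ? e : cache_add(key);
    }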
From ae74136b4bb64440a55117e12065b8c282ab6c1a Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Wed, 3 Oct 2018 12:01:22 -0400
Subject: [PATCH 09/30] SUNRPC: Allow cache lookups to use RCU protection rather than the r/w spinlock

Instead of the reader/writer spinlock, allow cache lookups to use RCU
for looking up entries. This is more efficient since modifications can
occur while other entries are being looked up.

Note that for now, we keep the reader/writer spinlock until all users
have been converted to use RCU-safe freeing of their cache entries.

Signed-off-by: Trond Myklebust
Signed-off-by: J. Bruce Fields
---
 include/linux/sunrpc/cache.h | 12 +++++
 net/sunrpc/cache.c           | 93 ++++++++++++++++++++++++++++++------
 2 files changed, 91 insertions(+), 14 deletions(-)

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 40d2822f0e2f..cf3e17ee2786 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -167,6 +167,9 @@ extern const struct file_operations cache_file_operations_pipefs;
 extern const struct file_operations content_file_operations_pipefs;
 extern const struct file_operations cache_flush_operations_pipefs;
 
+extern struct cache_head *
+sunrpc_cache_lookup_rcu(struct cache_detail *detail,
+                        struct cache_head *key, int hash);
 extern struct cache_head *
 sunrpc_cache_lookup(struct cache_detail *detail,
                     struct cache_head *key, int hash);
@@ -186,6 +189,12 @@ static inline struct cache_head *cache_get(struct cache_head *h)
     return h;
 }
 
+static inline struct cache_head *cache_get_rcu(struct cache_head *h)
+{
+    if (kref_get_unless_zero(&h->ref))
+        return h;
+    return NULL;
+}
 
 static inline void cache_put(struct cache_head *h, struct cache_detail *cd)
 {
@@ -227,6 +236,9 @@ extern void sunrpc_cache_unhash(struct cache_detail *, struct cache_head *);
 extern void *cache_seq_start(struct seq_file *file, loff_t *pos);
 extern void *cache_seq_next(struct seq_file *file, void *p, loff_t *pos);
 extern void cache_seq_stop(struct seq_file *file, void *p);
+extern void *cache_seq_start_rcu(struct seq_file *file, loff_t *pos);
+extern void *cache_seq_next_rcu(struct seq_file *file, void *p, loff_t *pos);
+extern void cache_seq_stop_rcu(struct seq_file *file, void *p);
 
 extern void qword_add(char **bpp, int *lp, char *str);
 extern void qword_addhex(char **bpp, int *lp, char *buf, int blen);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index aa8e62d61f4d..7593afed9036 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -54,6 +54,27 @@ static void cache_init(struct cache_head *h, struct cache_detail *detail)
     h->last_refresh = now;
 }
 
+static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail,
+                                                struct cache_head *key,
+                                                int hash)
+{
+    struct hlist_head *head = &detail->hash_table[hash];
+    struct cache_head *tmp;
+
+    rcu_read_lock();
+    hlist_for_each_entry_rcu(tmp, head, cache_list) {
+        if (detail->match(tmp, key)) {
+            if (cache_is_expired(detail, tmp))
+                continue;
+            tmp = cache_get_rcu(tmp);
+            rcu_read_unlock();
+            return tmp;
+        }
+    }
+    rcu_read_unlock();
+    return NULL;
+}
+
 static struct cache_head *sunrpc_cache_find(struct cache_detail *detail,
                                             struct cache_head *key, int hash)
 {
@@ -61,7 +82,6 @@ static struct cache_head *sunrpc_cache_find(struct cache_detail *detail,
     struct cache_head *tmp;
 
     read_lock(&detail->hash_lock);
-
     hlist_for_each_entry(tmp, head, cache_list) {
         if (detail->match(tmp, key)) {
             if (cache_is_expired(detail, tmp))
@@ -96,10 +116,10 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
 
     /* check if entry appeared while we slept */
-    hlist_for_each_entry(tmp, head, cache_list) {
+    hlist_for_each_entry_rcu(tmp, head, cache_list) {
         if (detail->match(tmp, key)) {
             if (cache_is_expired(detail, tmp)) {
-                hlist_del_init(&tmp->cache_list);
+                hlist_del_init_rcu(&tmp->cache_list);
                 detail->entries --;
                 freeme = tmp;
                 break;
@@ -111,7 +131,7 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
         }
     }
 
-    hlist_add_head(&new->cache_list, head);
+    hlist_add_head_rcu(&new->cache_list, head);
     detail->entries++;
     cache_get(new);
     write_unlock(&detail->hash_lock);
@@ -121,6 +141,19 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
     return new;
 }
 
+struct cache_head *sunrpc_cache_lookup_rcu(struct cache_detail *detail,
+                                           struct cache_head *key, int hash)
+{
+    struct cache_head *ret;
+
+    ret = sunrpc_cache_find_rcu(detail, key, hash);
+    if (ret)
+        return ret;
+    /* Didn't find anything, insert an empty entry */
+    return sunrpc_cache_add_entry(detail, key, hash);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_lookup_rcu);
+
 struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
                                        struct cache_head *key, int hash)
 {
@@ -134,6 +167,7 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
 }
 EXPORT_SYMBOL_GPL(sunrpc_cache_lookup);
 
+
 static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);
 
 static void cache_fresh_locked(struct cache_head *head, time_t expiry,
@@ -450,7 +484,7 @@ static int cache_clean(void)
             if (!cache_is_expired(current_detail, ch))
                 continue;
 
-            hlist_del_init(&ch->cache_list);
+            hlist_del_init_rcu(&ch->cache_list);
             current_detail->entries--;
             rv = 1;
             break;
@@ -521,7 +555,7 @@ void cache_purge(struct cache_detail *detail)
     for (i = 0; i < detail->hash_size; i++) {
         head = &detail->hash_table[i];
         hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
-            hlist_del_init(&ch->cache_list);
+            hlist_del_init_rcu(&ch->cache_list);
             detail->entries--;
 
             set_bit(CACHE_CLEANED, &ch->flags);
@@ -1306,21 +1340,19 @@ EXPORT_SYMBOL_GPL(qword_get);
  * get a header, then pass each real item in the cache
  */
 
-void *cache_seq_start(struct seq_file *m, loff_t *pos)
-    __acquires(cd->hash_lock)
+static void *__cache_seq_start(struct seq_file *m, loff_t *pos)
 {
     loff_t n = *pos;
     unsigned int hash, entry;
     struct cache_head *ch;
     struct cache_detail *cd = m->private;
 
-    read_lock(&cd->hash_lock);
     if (!n--)
         return SEQ_START_TOKEN;
     hash = n >> 32;
     entry = n & ((1LL<<32) - 1);
 
-    hlist_for_each_entry(ch, &cd->hash_table[hash], cache_list)
+    hlist_for_each_entry_rcu(ch, &cd->hash_table[hash], cache_list)
         if (!entry--)
             return ch;
     n &= ~((1LL<<32) - 1);
@@ -1332,9 +1364,19 @@ void *cache_seq_start(struct seq_file *m, loff_t *pos)
     if (hash >= cd->hash_size)
         return NULL;
     *pos = n+1;
-    return hlist_entry_safe(cd->hash_table[hash].first,
+    return hlist_entry_safe(rcu_dereference_raw(
+                hlist_first_rcu(&cd->hash_table[hash])),
                 struct cache_head, cache_list);
 }
+
+void *cache_seq_start(struct seq_file *m, loff_t *pos)
+    __acquires(cd->hash_lock)
+{
+    struct cache_detail *cd = m->private;
+
+    read_lock(&cd->hash_lock);
+    return __cache_seq_start(m, pos);
+}
 EXPORT_SYMBOL_GPL(cache_seq_start);
 
 void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
@@ -1350,7 +1392,8 @@ void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
         *pos += 1LL<<32;
     } else {
         ++*pos;
-        return hlist_entry_safe(ch->cache_list.next,
+        return hlist_entry_safe(rcu_dereference_raw(
+                    hlist_next_rcu(&ch->cache_list)),
                     struct cache_head, cache_list);
     }
     *pos &= ~((1LL<<32) - 1);
@@ -1362,7 +1405,8 @@ void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
     if (hash >= cd->hash_size)
         return NULL;
     ++*pos;
-    return hlist_entry_safe(cd->hash_table[hash].first,
+    return hlist_entry_safe(rcu_dereference_raw(
+                hlist_first_rcu(&cd->hash_table[hash])),
                 struct cache_head, cache_list);
 }
 EXPORT_SYMBOL_GPL(cache_seq_next);
@@ -1375,6 +1419,27 @@ void cache_seq_stop(struct seq_file *m, void *p)
 }
 EXPORT_SYMBOL_GPL(cache_seq_stop);
 
+void *cache_seq_start_rcu(struct seq_file *m, loff_t *pos)
+    __acquires(RCU)
+{
+    rcu_read_lock();
+    return __cache_seq_start(m, pos);
+}
+EXPORT_SYMBOL_GPL(cache_seq_start_rcu);
+
+void *cache_seq_next_rcu(struct seq_file *file, void *p, loff_t *pos)
+{
+    return cache_seq_next(file, p, pos);
+}
+EXPORT_SYMBOL_GPL(cache_seq_next_rcu);
+
+void cache_seq_stop_rcu(struct seq_file *m, void *p)
+    __releases(RCU)
+{
+    rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(cache_seq_stop_rcu);
+
 static int c_show(struct seq_file *m, void *p)
 {
     struct cache_head *cp = p;
@@ -1863,7 +1928,7 @@ void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h)
 {
     write_lock(&cd->hash_lock);
     if (!hlist_unhashed(&h->cache_list)){
-        hlist_del_init(&h->cache_list);
+        hlist_del_init_rcu(&h->cache_list);
         cd->entries--;
         write_unlock(&cd->hash_lock);
         cache_put(h, cd);

From fd5d2f78261bedd3a40feb1474323fddb88398b8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Mon, 1 Oct 2018 10:41:46 -0400
Subject: [PATCH 10/30] SUNRPC: Make server side AUTH_UNIX use lockless lookups

Convert structs ip_map and unix_gid to use RCU protected lookups.

Signed-off-by: Trond Myklebust
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/svcauth_unix.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 84cf39021a03..fb9041b92f72 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -97,6 +97,7 @@ struct ip_map {
     char            m_class[8]; /* e.g. "nfsd" */
     struct in6_addr m_addr;
     struct unix_domain  *m_client;
+    struct rcu_head m_rcu;
 };
 
 static void ip_map_put(struct kref *kref)
@@ -107,7 +108,7 @@ static void ip_map_put(struct kref *kref)
     if (test_bit(CACHE_VALID, &item->flags) &&
         !test_bit(CACHE_NEGATIVE, &item->flags))
         auth_domain_put(&im->m_client->h);
-    kfree(im);
+    kfree_rcu(im, m_rcu);
 }
 
 static inline int hash_ip6(const struct in6_addr *ip)
@@ -286,9 +287,9 @@ static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class,
 
     strcpy(ip.m_class, class);
     ip.m_addr = *addr;
-    ch = sunrpc_cache_lookup(cd, &ip.h,
-                 hash_str(class, IP_HASHBITS) ^
-                 hash_ip6(addr));
+    ch = sunrpc_cache_lookup_rcu(cd, &ip.h,
+                     hash_str(class, IP_HASHBITS) ^
+                     hash_ip6(addr));
 
     if (ch)
         return container_of(ch, struct ip_map, h);
@@ -418,6 +419,7 @@ struct unix_gid {
     struct cache_head   h;
     kuid_t              uid;
     struct group_info   *gi;
+    struct rcu_head     rcu;
 };
 
 static int unix_gid_hash(kuid_t uid)
@@ -432,7 +434,7 @@ static void unix_gid_put(struct kref *kref)
     if (test_bit(CACHE_VALID, &item->flags) &&
         !test_bit(CACHE_NEGATIVE, &item->flags))
         put_group_info(ug->gi);
-    kfree(ug);
+    kfree_rcu(ug, rcu);
 }
 
 static int unix_gid_match(struct cache_head *corig, struct cache_head *cnew)
@@ -625,7 +627,7 @@ static struct unix_gid *unix_gid_lookup(struct cache_detail *cd, kuid_t uid)
     struct cache_head *ch;
 
     ug.uid = uid;
-    ch = sunrpc_cache_lookup(cd, &ug.h, unix_gid_hash(uid));
+    ch = sunrpc_cache_lookup_rcu(cd, &ug.h, unix_gid_hash(uid));
     if (ch)
         return container_of(ch, struct unix_gid, h);
     else
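These conversions lean on cache_get_rcu(), i.e. kref_get_unless_zero(): under RCU a reader can still find an object whose last reference has already been dropped, so it must take a reference only if the count has not yet hit zero. The core of that primitive, modeled with C11 atomics as an illustration:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Take a reference only if the count is still nonzero -- the object
     * may already be on its way to being freed after a grace period. */
    static bool ref_get_unless_zero(atomic_uint *refcount)
    {
        unsigned int old = atomic_load(refcount);

        while (old != 0) {
            if (atomic_compare_exchange_weak(refcount, &old, old + 1))
                return true;
            /* 'old' was reloaded by the failed CAS; retry */
        }
        return false;
    }

    int main(void)
    {
        atomic_uint ref = 1;

        printf("%d\n", ref_get_unless_zero(&ref));  /* 1: count was live */
        atomic_store(&ref, 0);
        printf("%d\n", ref_get_unless_zero(&ref));  /* 0: already dying */
        return 0;
    }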
From 9ceddd9da13434a5906255c0fc528c385aded283 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Mon, 1 Oct 2018 10:41:47 -0400
Subject: [PATCH 11/30] knfsd: Allow lockless lookups of the exports

Convert structs svc_expkey and svc_export to allow RCU protected
lookups.

Signed-off-by: Trond Myklebust
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/export.c | 14 +++++++-------
 fs/nfsd/export.h |  2 ++
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index a1143f7c2201..802993d8912f 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -46,7 +46,7 @@ static void expkey_put(struct kref *ref)
         !test_bit(CACHE_NEGATIVE, &key->h.flags))
         path_put(&key->ek_path);
     auth_domain_put(key->ek_client);
-    kfree(key);
+    kfree_rcu(key, ek_rcu);
 }
 
 static void expkey_request(struct cache_detail *cd,
@@ -265,7 +265,7 @@ svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *item)
     struct cache_head *ch;
     int hash = svc_expkey_hash(item);
 
-    ch = sunrpc_cache_lookup(cd, &item->h, hash);
+    ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash);
     if (ch)
         return container_of(ch, struct svc_expkey, h);
     else
@@ -314,7 +314,7 @@ static void svc_export_put(struct kref *ref)
     auth_domain_put(exp->ex_client);
     nfsd4_fslocs_free(&exp->ex_fslocs);
     kfree(exp->ex_uuid);
-    kfree(exp);
+    kfree_rcu(exp, ex_rcu);
 }
 
 static void svc_export_request(struct cache_detail *cd,
@@ -780,7 +780,7 @@ svc_export_lookup(struct svc_export *exp)
     struct cache_head *ch;
     int hash = svc_export_hash(exp);
 
-    ch = sunrpc_cache_lookup(exp->cd, &exp->h, hash);
+    ch = sunrpc_cache_lookup_rcu(exp->cd, &exp->h, hash);
     if (ch)
         return container_of(ch, struct svc_export, h);
     else
@@ -1216,9 +1216,9 @@ static int e_show(struct seq_file *m, void *p)
 }
 
 const struct seq_operations nfs_exports_op = {
-    .start  = cache_seq_start,
-    .next   = cache_seq_next,
-    .stop   = cache_seq_stop,
+    .start  = cache_seq_start_rcu,
+    .next   = cache_seq_next_rcu,
+    .stop   = cache_seq_stop_rcu,
     .show   = e_show,
 };
 
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index c8b74126ddaa..e7daa1f246f0 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -61,6 +61,7 @@ struct svc_export {
     u32                 ex_layout_types;
     struct nfsd4_deviceid_map *ex_devid_map;
     struct cache_detail *cd;
+    struct rcu_head     ex_rcu;
 };
 
 /* an "export key" (expkey) maps a filehandle fragment to an
@@ -75,6 +76,7 @@ struct svc_expkey {
     u32                 ek_fsid[6];
 
     struct path         ek_path;
+    struct rcu_head     ek_rcu;
 };
 
 #define EX_ISSYNC(exp)      (!((exp)->ex_flags & NFSEXP_ASYNC))
From 6d1616b26cd91f2502111d098cd9c288dbafe5c8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Mon, 1 Oct 2018 10:41:48 -0400
Subject: [PATCH 12/30] SUNRPC: Lockless server RPCSEC_GSS context lookup

Use RCU protection for looking up the RPCSEC_GSS context.

Signed-off-by: Trond Myklebust
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/auth_gss/svcauth_gss.c | 32 +++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 87c71fb0f0ea..1ece4bc3eb8d 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -76,6 +76,7 @@ struct rsi {
     struct xdr_netobj   in_handle, in_token;
     struct xdr_netobj   out_handle, out_token;
     int                 major_status, minor_status;
+    struct rcu_head     rcu_head;
 };
 
 static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct rsi *old);
@@ -89,11 +90,19 @@ static void rsi_free(struct rsi *rsii)
     kfree(rsii->out_token.data);
 }
 
-static void rsi_put(struct kref *ref)
+static void rsi_free_rcu(struct rcu_head *head)
 {
-    struct rsi *rsii = container_of(ref, struct rsi, h.ref);
+    struct rsi *rsii = container_of(head, struct rsi, rcu_head);
+
     rsi_free(rsii);
     kfree(rsii);
 }
+
+static void rsi_put(struct kref *ref)
+{
+    struct rsi *rsii = container_of(ref, struct rsi, h.ref);
+
+    call_rcu(&rsii->rcu_head, rsi_free_rcu);
+}
 
 static inline int rsi_hash(struct rsi *item)
@@ -282,7 +291,7 @@ static struct rsi *rsi_lookup(struct cache_detail *cd, struct rsi *item)
     struct cache_head *ch;
     int hash = rsi_hash(item);
 
-    ch = sunrpc_cache_lookup(cd, &item->h, hash);
+    ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash);
     if (ch)
         return container_of(ch, struct rsi, h);
     else
@@ -330,6 +339,7 @@ struct rsc {
     struct svc_cred         cred;
     struct gss_svc_seq_data seqdata;
     struct gss_ctx          *mechctx;
+    struct rcu_head         rcu_head;
 };
 
 static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old);
@@ -343,12 +353,22 @@ static void rsc_free(struct rsc *rsci)
     free_svc_cred(&rsci->cred);
 }
 
+static void rsc_free_rcu(struct rcu_head *head)
+{
+    struct rsc *rsci = container_of(head, struct rsc, rcu_head);
+
+    kfree(rsci->handle.data);
+    kfree(rsci);
+}
+
 static void rsc_put(struct kref *ref)
 {
     struct rsc *rsci = container_of(ref, struct rsc, h.ref);
 
-    rsc_free(rsci);
-    kfree(rsci);
+    if (rsci->mechctx)
+        gss_delete_sec_context(&rsci->mechctx);
+    free_svc_cred(&rsci->cred);
+    call_rcu(&rsci->rcu_head, rsc_free_rcu);
 }
 
 static inline int
@@ -542,7 +562,7 @@ static struct rsc *rsc_lookup(struct cache_detail *cd, struct rsc *item)
     struct cache_head *ch;
     int hash = rsc_hash(item);
 
-    ch = sunrpc_cache_lookup(cd, &item->h, hash);
+    ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash);
     if (ch)
         return container_of(ch, struct rsc, h);
     else
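Note the split teardown in rsc_put() above: state that only the reference holder touches (the GSS mech context, the credential) is torn down immediately, while memory the RCU lookup path may still dereference (handle.data and the object itself) is freed in the call_rcu() callback. A user-space sketch of that two-stage release; the "grace period" is faked here by running the callback immediately, which a real RCU implementation must not do:

    #include <stdlib.h>
    #include <stddef.h>
    #include <string.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct rcu_head_stub { void (*func)(struct rcu_head_stub *); };

    struct ctx {
        char *handle;               /* still visible to RCU readers */
        char *secret;               /* only touched by the ref holder */
        struct rcu_head_stub rcu_head;
    };

    static void ctx_free_rcu(struct rcu_head_stub *head)
    {
        struct ctx *c = container_of(head, struct ctx, rcu_head);

        free(c->handle);            /* lookup key: freed only after readers are done */
        free(c);
    }

    /* Stand-in for call_rcu(): a real implementation would defer the
     * callback until all current readers have finished. */
    static void fake_call_rcu(struct rcu_head_stub *head,
                              void (*func)(struct rcu_head_stub *))
    {
        head->func = func;
        func(head);                 /* demo only: runs immediately */
    }

    static void ctx_put(struct ctx *c)
    {
        free(c->secret);            /* safe now: no reader dereferences it */
        fake_call_rcu(&c->rcu_head, ctx_free_rcu);
    }

    int main(void)
    {
        struct ctx *c = calloc(1, sizeof(*c));

        if (!c)
            return 1;
        c->handle = strdup("ctx-handle");
        c->secret = strdup("session-key");
        ctx_put(c);
        return 0;
    }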
From 9d5afd9491c80779730686159aeec7fa06ead085 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Mon, 1 Oct 2018 10:41:49 -0400
Subject: [PATCH 13/30] knfsd: Lockless lookup of NFSv4 identities.

Enable RCU protected lookups of the NFSv4 idmap.

Signed-off-by: Trond Myklebust
Signed-off-by: J. Bruce Fields
---
 fs/nfsd/nfs4idmap.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index a5bb76593ce7..bf137fec33ff 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -65,6 +65,7 @@ struct ent {
     u32 id;
     char name[IDMAP_NAMESZ];
     char authname[IDMAP_NAMESZ];
+    struct rcu_head rcu_head;
 };
 
 /* Common entry handling */
@@ -89,7 +90,7 @@ static void
 ent_put(struct kref *ref)
 {
     struct ent *map = container_of(ref, struct ent, h.ref);
-    kfree(map);
+    kfree_rcu(map, rcu_head);
 }
 
 static struct cache_head *
@@ -264,8 +265,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
 static struct ent *
 idtoname_lookup(struct cache_detail *cd, struct ent *item)
 {
-    struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h,
-                            idtoname_hash(item));
+    struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h,
+                            idtoname_hash(item));
     if (ch)
         return container_of(ch, struct ent, h);
     else
@@ -422,8 +423,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
 static struct ent *
 nametoid_lookup(struct cache_detail *cd, struct ent *item)
 {
-    struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h,
-                            nametoid_hash(item));
+    struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h,
+                            nametoid_hash(item));
     if (ch)
         return container_of(ch, struct ent, h);
     else

From 437f91451349a94d581cb9a928fa3958153f0b71 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Mon, 1 Oct 2018 10:41:50 -0400
Subject: [PATCH 14/30] NFS: Lockless DNS lookups

Enable RCU protected lookup in the legacy DNS resolver.

Signed-off-by: Trond Myklebust
Signed-off-by: J. Bruce Fields
---
 fs/nfs/dns_resolve.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 060c658eab66..e93a5dc07c8c 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -65,6 +65,7 @@ struct nfs_dns_ent {
 
     struct sockaddr_storage addr;
     size_t addrlen;
+    struct rcu_head rcu_head;
 };
 
 
@@ -101,13 +102,21 @@ static void nfs_dns_ent_init(struct cache_head *cnew,
     }
 }
 
+static void nfs_dns_ent_free_rcu(struct rcu_head *head)
+{
+    struct nfs_dns_ent *item;
+
+    item = container_of(head, struct nfs_dns_ent, rcu_head);
+    kfree(item->hostname);
+    kfree(item);
+}
+
 static void nfs_dns_ent_put(struct kref *ref)
 {
     struct nfs_dns_ent *item;
 
     item = container_of(ref, struct nfs_dns_ent, h.ref);
-    kfree(item->hostname);
-    kfree(item);
+    call_rcu(item, nfs_dns_ent_free_rcu);
 }
 
 static struct cache_head *nfs_dns_ent_alloc(void)
@@ -195,7 +204,7 @@ static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
 {
     struct cache_head *ch;
 
-    ch = sunrpc_cache_lookup(cd,
+    ch = sunrpc_cache_lookup_rcu(cd,
             &key->h,
             nfs_dns_hash(key));
     if (!ch)
Bruce Fields --- fs/nfs/dns_resolve.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index e93a5dc07c8c..a7d3df85736d 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -116,7 +116,7 @@ static void nfs_dns_ent_put(struct kref *ref) struct nfs_dns_ent *item; item = container_of(ref, struct nfs_dns_ent, h.ref); - call_rcu(item, nfs_dns_ent_free_rcu); + call_rcu(&item->rcu_head, nfs_dns_ent_free_rcu); } static struct cache_head *nfs_dns_ent_alloc(void) From d48cf356a13073853f19be6ca5ebbecfc2762ebe Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 1 Oct 2018 10:41:51 -0400 Subject: [PATCH 16/30] SUNRPC: Remove non-RCU protected lookup Clean up the cache code by removing the non-RCU protected lookup. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- Documentation/filesystems/nfs/rpc-cache.txt | 6 +- include/linux/sunrpc/cache.h | 6 -- net/sunrpc/cache.c | 61 ++------------------- 3 files changed, 7 insertions(+), 66 deletions(-) diff --git a/Documentation/filesystems/nfs/rpc-cache.txt b/Documentation/filesystems/nfs/rpc-cache.txt index ebcaaee21616..c4dac829db0f 100644 --- a/Documentation/filesystems/nfs/rpc-cache.txt +++ b/Documentation/filesystems/nfs/rpc-cache.txt @@ -84,7 +84,7 @@ Creating a Cache A message from user space has arrived to fill out a cache entry. It is in 'buf' of length 'len'. cache_parse should parse this, find the item in the - cache with sunrpc_cache_lookup, and update the item + cache with sunrpc_cache_lookup_rcu, and update the item with sunrpc_cache_update. @@ -95,7 +95,7 @@ Creating a Cache Using a cache ------------- -To find a value in a cache, call sunrpc_cache_lookup passing a pointer +To find a value in a cache, call sunrpc_cache_lookup_rcu passing a pointer to the cache_head in a sample item with the 'key' fields filled in. This will be passed to ->match to identify the target entry. If no entry is found, a new entry will be create, added to the cache, and @@ -116,7 +116,7 @@ item does become valid, the deferred copy of the request will be revisited (->revisit). It is expected that this method will reschedule the request for processing. -The value returned by sunrpc_cache_lookup can also be passed to +The value returned by sunrpc_cache_lookup_rcu can also be passed to sunrpc_cache_update to set the content for the item. A second item is passed which should hold the content. If the item found by _lookup has valid data, then it is discarded and a new item is created. 
This diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index cf3e17ee2786..c3d67e893430 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -171,9 +171,6 @@ extern struct cache_head * sunrpc_cache_lookup_rcu(struct cache_detail *detail, struct cache_head *key, int hash); extern struct cache_head * -sunrpc_cache_lookup(struct cache_detail *detail, - struct cache_head *key, int hash); -extern struct cache_head * sunrpc_cache_update(struct cache_detail *detail, struct cache_head *new, struct cache_head *old, int hash); @@ -233,9 +230,6 @@ extern void sunrpc_cache_unregister_pipefs(struct cache_detail *); extern void sunrpc_cache_unhash(struct cache_detail *, struct cache_head *); /* Must store cache_detail in seq_file->private if using next three functions */ -extern void *cache_seq_start(struct seq_file *file, loff_t *pos); -extern void *cache_seq_next(struct seq_file *file, void *p, loff_t *pos); -extern void cache_seq_stop(struct seq_file *file, void *p); extern void *cache_seq_start_rcu(struct seq_file *file, loff_t *pos); extern void *cache_seq_next_rcu(struct seq_file *file, void *p, loff_t *pos); extern void cache_seq_stop_rcu(struct seq_file *file, void *p); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 7593afed9036..593cf8607414 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -75,27 +75,6 @@ static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail, return NULL; } -static struct cache_head *sunrpc_cache_find(struct cache_detail *detail, - struct cache_head *key, int hash) -{ - struct hlist_head *head = &detail->hash_table[hash]; - struct cache_head *tmp; - - read_lock(&detail->hash_lock); - hlist_for_each_entry(tmp, head, cache_list) { - if (detail->match(tmp, key)) { - if (cache_is_expired(detail, tmp)) - /* This entry is expired, we will discard it. 
*/ - break; - cache_get(tmp); - read_unlock(&detail->hash_lock); - return tmp; - } - } - read_unlock(&detail->hash_lock); - return NULL; -} - static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, struct cache_head *key, int hash) @@ -154,20 +133,6 @@ struct cache_head *sunrpc_cache_lookup_rcu(struct cache_detail *detail, } EXPORT_SYMBOL_GPL(sunrpc_cache_lookup_rcu); -struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, - struct cache_head *key, int hash) -{ - struct cache_head *ret; - - ret = sunrpc_cache_find(detail, key, hash); - if (ret) - return ret; - /* Didn't find anything, insert an empty entry */ - return sunrpc_cache_add_entry(detail, key, hash); -} -EXPORT_SYMBOL_GPL(sunrpc_cache_lookup); - - static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch); static void cache_fresh_locked(struct cache_head *head, time_t expiry, @@ -1369,17 +1334,7 @@ static void *__cache_seq_start(struct seq_file *m, loff_t *pos) struct cache_head, cache_list); } -void *cache_seq_start(struct seq_file *m, loff_t *pos) - __acquires(cd->hash_lock) -{ - struct cache_detail *cd = m->private; - - read_lock(&cd->hash_lock); - return __cache_seq_start(m, pos); -} -EXPORT_SYMBOL_GPL(cache_seq_start); - -void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) +static void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) { struct cache_head *ch = p; int hash = (*pos >> 32); @@ -1411,14 +1366,6 @@ void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) } EXPORT_SYMBOL_GPL(cache_seq_next); -void cache_seq_stop(struct seq_file *m, void *p) - __releases(cd->hash_lock) -{ - struct cache_detail *cd = m->private; - read_unlock(&cd->hash_lock); -} -EXPORT_SYMBOL_GPL(cache_seq_stop); - void *cache_seq_start_rcu(struct seq_file *m, loff_t *pos) __acquires(RCU) { @@ -1466,9 +1413,9 @@ static int c_show(struct seq_file *m, void *p) } static const struct seq_operations cache_content_op = { - .start = cache_seq_start, - .next = cache_seq_next, - .stop = cache_seq_stop, + .start = cache_seq_start_rcu, + .next = cache_seq_next_rcu, + .stop = cache_seq_stop_rcu, .show = c_show, }; From 1863d77f15da0addcd293a1719fa5d3ef8cde3ca Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 1 Oct 2018 10:41:52 -0400 Subject: [PATCH 17/30] SUNRPC: Replace the cache_detail->hash_lock with a regular spinlock Now that the reader functions are all RCU protected, use a regular spinlock rather than a reader/writer lock. Signed-off-by: Trond Myklebust Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/cache.h | 2 +- net/sunrpc/cache.c | 46 ++++++++++++++++++------------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index c3d67e893430..5a3e95017fc6 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -67,7 +67,7 @@ struct cache_detail { struct module * owner; int hash_size; struct hlist_head * hash_table; - rwlock_t hash_lock; + spinlock_t hash_lock; char *name; void (*cache_put)(struct kref *); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 593cf8607414..f96345b1180e 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -92,7 +92,7 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, cache_init(new, detail); detail->init(new, key); - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); /* check if entry appeared while we slept */ hlist_for_each_entry_rcu(tmp, head, cache_list) { @@ -104,7 +104,7 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, break; } cache_get(tmp); - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); cache_put(new, detail); return tmp; } @@ -113,7 +113,7 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, hlist_add_head_rcu(&new->cache_list, head); detail->entries++; cache_get(new); - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); if (freeme) cache_put(freeme, detail); @@ -167,18 +167,18 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, struct cache_head *tmp; if (!test_bit(CACHE_VALID, &old->flags)) { - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); if (!test_bit(CACHE_VALID, &old->flags)) { if (test_bit(CACHE_NEGATIVE, &new->flags)) set_bit(CACHE_NEGATIVE, &old->flags); else detail->update(old, new); cache_fresh_locked(old, new->expiry_time, detail); - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); cache_fresh_unlocked(old, detail); return old; } - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); } /* We need to insert a new entry */ tmp = detail->alloc(); @@ -189,7 +189,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, cache_init(tmp, detail); detail->init(tmp, old); - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); if (test_bit(CACHE_NEGATIVE, &new->flags)) set_bit(CACHE_NEGATIVE, &tmp->flags); else @@ -199,7 +199,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, cache_get(tmp); cache_fresh_locked(tmp, new->expiry_time, detail); cache_fresh_locked(old, 0, detail); - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); cache_fresh_unlocked(tmp, detail); cache_fresh_unlocked(old, detail); cache_put(old, detail); @@ -239,7 +239,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h { int rv; - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); rv = cache_is_valid(h); if (rv == -EAGAIN) { set_bit(CACHE_NEGATIVE, &h->flags); @@ -247,7 +247,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h detail); rv = -ENOENT; } - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); cache_fresh_unlocked(h, detail); return rv; } @@ -357,7 +357,7 @@ static struct delayed_work cache_cleaner; void sunrpc_init_cache_detail(struct cache_detail *cd) { - rwlock_init(&cd->hash_lock); + spin_lock_init(&cd->hash_lock); 
INIT_LIST_HEAD(&cd->queue); spin_lock(&cache_list_lock); cd->nextcheck = 0; @@ -377,11 +377,11 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) { cache_purge(cd); spin_lock(&cache_list_lock); - write_lock(&cd->hash_lock); + spin_lock(&cd->hash_lock); if (current_detail == cd) current_detail = NULL; list_del_init(&cd->others); - write_unlock(&cd->hash_lock); + spin_unlock(&cd->hash_lock); spin_unlock(&cache_list_lock); if (list_empty(&cache_list)) { /* module must be being unloaded so its safe to kill the worker */ @@ -438,7 +438,7 @@ static int cache_clean(void) struct hlist_head *head; struct hlist_node *tmp; - write_lock(¤t_detail->hash_lock); + spin_lock(¤t_detail->hash_lock); /* Ok, now to clean this strand */ @@ -455,7 +455,7 @@ static int cache_clean(void) break; } - write_unlock(¤t_detail->hash_lock); + spin_unlock(¤t_detail->hash_lock); d = current_detail; if (!ch) current_index ++; @@ -510,9 +510,9 @@ void cache_purge(struct cache_detail *detail) struct hlist_node *tmp = NULL; int i = 0; - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); if (!detail->entries) { - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); return; } @@ -524,13 +524,13 @@ void cache_purge(struct cache_detail *detail) detail->entries--; set_bit(CACHE_CLEANED, &ch->flags); - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); cache_fresh_unlocked(ch, detail); cache_put(ch, detail); - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); } } - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); } EXPORT_SYMBOL_GPL(cache_purge); @@ -1873,13 +1873,13 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs); void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h) { - write_lock(&cd->hash_lock); + spin_lock(&cd->hash_lock); if (!hlist_unhashed(&h->cache_list)){ hlist_del_init_rcu(&h->cache_list); cd->entries--; - write_unlock(&cd->hash_lock); + spin_unlock(&cd->hash_lock); cache_put(h, cd); } else - write_unlock(&cd->hash_lock); + spin_unlock(&cd->hash_lock); } EXPORT_SYMBOL_GPL(sunrpc_cache_unhash); From 4c8e5537bb28f0e4b2ca60ecb0ca1ea12b82adf6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 1 Oct 2018 10:41:53 -0400 Subject: [PATCH 18/30] SUNRPC: Simplify TCP receive code Use the fact that the iov iterators already have functionality for skipping a base offset. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 53 ++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 39 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5445145e639c..64765f08ae07 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -325,59 +325,34 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) /* * Generic recvfrom routine. 
*/ -static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, - int buflen) +static ssize_t svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, + unsigned int nr, size_t buflen, unsigned int base) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); - struct msghdr msg = { - .msg_flags = MSG_DONTWAIT, - }; - int len; + struct msghdr msg = { NULL }; + ssize_t len; rqstp->rq_xprt_hlen = 0; clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, nr, buflen); - len = sock_recvmsg(svsk->sk_sock, &msg, msg.msg_flags); + if (base != 0) { + iov_iter_advance(&msg.msg_iter, base); + buflen -= base; + } + len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT); /* If we read a full record, then assume there may be more * data to read (stream based sockets only!) */ if (len == buflen) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - dprintk("svc: socket %p recvfrom(%p, %zu) = %d\n", + dprintk("svc: socket %p recvfrom(%p, %zu) = %zd\n", svsk, iov[0].iov_base, iov[0].iov_len, len); return len; } -static int svc_partial_recvfrom(struct svc_rqst *rqstp, - struct kvec *iov, int nr, - int buflen, unsigned int base) -{ - size_t save_iovlen; - void *save_iovbase; - unsigned int i; - int ret; - - if (base == 0) - return svc_recvfrom(rqstp, iov, nr, buflen); - - for (i = 0; i < nr; i++) { - if (iov[i].iov_len > base) - break; - base -= iov[i].iov_len; - } - save_iovlen = iov[i].iov_len; - save_iovbase = iov[i].iov_base; - iov[i].iov_len -= base; - iov[i].iov_base += base; - ret = svc_recvfrom(rqstp, &iov[i], nr - i, buflen); - iov[i].iov_len = save_iovlen; - iov[i].iov_base = save_iovbase; - return ret; -} - /* * Set socket snd and rcv buffer lengths */ @@ -962,7 +937,8 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) want = sizeof(rpc_fraghdr) - svsk->sk_tcplen; iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen; iov.iov_len = want; - if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0) + len = svc_recvfrom(rqstp, &iov, 1, want, 0); + if (len < 0) goto error; svsk->sk_tcplen += len; @@ -1088,14 +1064,13 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) vec = rqstp->rq_vec; - pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], - svsk->sk_datalen + want); + pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], base + want); rqstp->rq_respages = &rqstp->rq_pages[pnum]; rqstp->rq_next_page = rqstp->rq_respages + 1; /* Now receive data */ - len = svc_partial_recvfrom(rqstp, vec, pnum, want, base); + len = svc_recvfrom(rqstp, vec, pnum, base + want, base); if (len >= 0) { svsk->sk_tcplen += len; svsk->sk_datalen += len; From 3e87da5145fc25e18fb934eb496f4e7c4d038e71 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 1 Oct 2018 10:41:54 -0400 Subject: [PATCH 19/30] knfsd: Remove dead code from nfsd_cache_lookup The preallocated cache entry is always set to type RC_NOCACHE, and that type isn't changed until we later call nfsd_cache_update(). Signed-off-by: Trond Myklebust Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfscache.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index dbdeb9d6af03..cef4686f87ef 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -446,14 +446,6 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) rp->c_csum = csum; lru_put_end(b, rp); - - /* release any buffer */ - if (rp->c_type == RC_REPLBUFF) { - drc_mem_usage -= rp->c_replvec.iov_len; - kfree(rp->c_replvec.iov_base); - rp->c_replvec.iov_base = NULL; - } - rp->c_type = RC_NOCACHE; out: spin_unlock(&b->cache_lock); return rtn; From 76ecec21197ab23bb821d8bf584949013efd0494 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 1 Oct 2018 10:41:55 -0400 Subject: [PATCH 20/30] knfsd: Simplify NFS duplicate replay cache Simplify the duplicate replay cache by initialising the preallocated cache entry, so that we can use it as a key for the cache lookup. Note that the 99.999% case we want to optimise for is still the one where the lookup fails, and we have to add this entry to the cache, so preinitialising should not cause a performance penalty. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfscache.c | 98 ++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 52 deletions(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index cef4686f87ef..527ce4c65765 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -121,7 +121,7 @@ nfsd_cache_hash(__be32 xid) } static struct svc_cacherep * -nfsd_reply_cache_alloc(void) +nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum) { struct svc_cacherep *rp; @@ -130,6 +130,16 @@ nfsd_reply_cache_alloc(void) rp->c_state = RC_UNUSED; rp->c_type = RC_NOCACHE; INIT_LIST_HEAD(&rp->c_lru); + + rp->c_xid = rqstp->rq_xid; + rp->c_proc = rqstp->rq_proc; + memset(&rp->c_addr, 0, sizeof(rp->c_addr)); + rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp)); + rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp))); + rp->c_prot = rqstp->rq_prot; + rp->c_vers = rqstp->rq_vers; + rp->c_len = rqstp->rq_arg.len; + rp->c_csum = csum; } return rp; } @@ -141,9 +151,11 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp) drc_mem_usage -= rp->c_replvec.iov_len; kfree(rp->c_replvec.iov_base); } - list_del(&rp->c_lru); - atomic_dec(&num_drc_entries); - drc_mem_usage -= sizeof(*rp); + if (rp->c_state != RC_UNUSED) { + list_del(&rp->c_lru); + atomic_dec(&num_drc_entries); + drc_mem_usage -= sizeof(*rp); + } kmem_cache_free(drc_slab, rp); } @@ -319,24 +331,23 @@ nfsd_cache_csum(struct svc_rqst *rqstp) } static bool -nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp) +nfsd_cache_match(const struct svc_cacherep *key, const struct svc_cacherep *rp) { /* Check RPC XID first */ - if (rqstp->rq_xid != rp->c_xid) + if (key->c_xid != rp->c_xid) return false; /* compare checksum of NFS data */ - if (csum != rp->c_csum) { + if (key->c_csum != rp->c_csum) { ++payload_misses; return false; } /* Other discriminators */ - if (rqstp->rq_proc != rp->c_proc || - rqstp->rq_prot != rp->c_prot || - rqstp->rq_vers != rp->c_vers || - rqstp->rq_arg.len != rp->c_len || - !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) || - rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr)) + if (key->c_proc != rp->c_proc || + key->c_prot != rp->c_prot || + key->c_vers != rp->c_vers || + key->c_len != rp->c_len || + memcmp(&key->c_addr, &rp->c_addr, sizeof(key->c_addr)) != 0) return false; return true; @@ -345,19 
+356,18 @@ nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp) /* * Search the request hash for an entry that matches the given rqstp. * Must be called with cache_lock held. Returns the found entry or - * NULL on failure. + * inserts an empty key on failure. */ static struct svc_cacherep * -nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp, - __wsum csum) +nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key) { - struct svc_cacherep *rp, *ret = NULL; + struct svc_cacherep *rp, *ret = key; struct list_head *rh = &b->lru_head; unsigned int entries = 0; list_for_each_entry(rp, rh, c_lru) { ++entries; - if (nfsd_cache_match(rqstp, csum, rp)) { + if (nfsd_cache_match(key, rp)) { ret = rp; break; } @@ -374,6 +384,7 @@ nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp, atomic_read(&num_drc_entries)); } + lru_put_end(b, ret); return ret; } @@ -389,9 +400,6 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) { struct svc_cacherep *rp, *found; __be32 xid = rqstp->rq_xid; - u32 proto = rqstp->rq_prot, - vers = rqstp->rq_vers, - proc = rqstp->rq_proc; __wsum csum; u32 hash = nfsd_cache_hash(xid); struct nfsd_drc_bucket *b = &drc_hashtbl[hash]; @@ -410,52 +418,38 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) * Since the common case is a cache miss followed by an insert, * preallocate an entry. */ - rp = nfsd_reply_cache_alloc(); - spin_lock(&b->cache_lock); - if (likely(rp)) { - atomic_inc(&num_drc_entries); - drc_mem_usage += sizeof(*rp); - } - - /* go ahead and prune the cache */ - prune_bucket(b); - - found = nfsd_cache_search(b, rqstp, csum); - if (found) { - if (likely(rp)) - nfsd_reply_cache_free_locked(rp); - rp = found; - goto found_entry; - } - + rp = nfsd_reply_cache_alloc(rqstp, csum); if (!rp) { dprintk("nfsd: unable to allocate DRC entry!\n"); - goto out; + return rtn; + } + + spin_lock(&b->cache_lock); + found = nfsd_cache_insert(b, rp); + if (found != rp) { + nfsd_reply_cache_free_locked(rp); + rp = found; + goto found_entry; } nfsdstats.rcmisses++; rqstp->rq_cacherep = rp; rp->c_state = RC_INPROG; - rp->c_xid = xid; - rp->c_proc = proc; - rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp)); - rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp))); - rp->c_prot = proto; - rp->c_vers = vers; - rp->c_len = rqstp->rq_arg.len; - rp->c_csum = csum; - lru_put_end(b, rp); + atomic_inc(&num_drc_entries); + drc_mem_usage += sizeof(*rp); + + /* go ahead and prune the cache */ + prune_bucket(b); out: spin_unlock(&b->cache_lock); return rtn; found_entry: - nfsdstats.rchits++; /* We found a matching entry which is either in progress or done. */ - lru_put_end(b, rp); - + nfsdstats.rchits++; rtn = RC_DROPIT; + /* Request being processed */ if (rp->c_state == RC_INPROG) goto out; From ed00c2f65267f3a5a8727ac74a90d32470f91679 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 3 Oct 2018 13:11:51 -0400 Subject: [PATCH 21/30] knfsd: Further simplify the cache lookup Order the structure so that the key can be compared using memcmp(). Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/cache.h | 19 +++++++++++-------- fs/nfsd/nfscache.c | 45 ++++++++++++++++----------------------------- 2 files changed, 27 insertions(+), 37 deletions(-) diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index b7559c6f2b97..745c861237ca 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -19,18 +19,21 @@ * is much larger than a sockaddr_in6. 
*/ struct svc_cacherep { - struct list_head c_lru; + struct { + /* Keep often-read xid, csum in the same cache line: */ + __be32 k_xid; + __wsum k_csum; + u32 k_proc; + u32 k_prot; + u32 k_vers; + unsigned int k_len; + struct sockaddr_in6 k_addr; + } c_key; + struct list_head c_lru; unsigned char c_state, /* unused, inprog, done */ c_type, /* status, buffer */ c_secure : 1; /* req came from port < 1024 */ - struct sockaddr_in6 c_addr; - __be32 c_xid; - u32 c_prot; - u32 c_proc; - u32 c_vers; - unsigned int c_len; - __wsum c_csum; unsigned long c_timestamp; union { struct kvec u_vec; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 527ce4c65765..230cc83921ad 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -131,15 +131,15 @@ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum) rp->c_type = RC_NOCACHE; INIT_LIST_HEAD(&rp->c_lru); - rp->c_xid = rqstp->rq_xid; - rp->c_proc = rqstp->rq_proc; - memset(&rp->c_addr, 0, sizeof(rp->c_addr)); - rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp)); - rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp))); - rp->c_prot = rqstp->rq_prot; - rp->c_vers = rqstp->rq_vers; - rp->c_len = rqstp->rq_arg.len; - rp->c_csum = csum; + memset(&rp->c_key, 0, sizeof(rp->c_key)); + rp->c_key.k_xid = rqstp->rq_xid; + rp->c_key.k_proc = rqstp->rq_proc; + rpc_copy_addr((struct sockaddr *)&rp->c_key.k_addr, svc_addr(rqstp)); + rpc_set_port((struct sockaddr *)&rp->c_key.k_addr, rpc_get_port(svc_addr(rqstp))); + rp->c_key.k_prot = rqstp->rq_prot; + rp->c_key.k_vers = rqstp->rq_vers; + rp->c_key.k_len = rqstp->rq_arg.len; + rp->c_key.k_csum = csum; } return rp; } @@ -330,27 +330,14 @@ nfsd_cache_csum(struct svc_rqst *rqstp) return csum; } -static bool -nfsd_cache_match(const struct svc_cacherep *key, const struct svc_cacherep *rp) +static int +nfsd_cache_key_cmp(const struct svc_cacherep *key, const struct svc_cacherep *rp) { - /* Check RPC XID first */ - if (key->c_xid != rp->c_xid) - return false; - /* compare checksum of NFS data */ - if (key->c_csum != rp->c_csum) { + if (key->c_key.k_xid == rp->c_key.k_xid && + key->c_key.k_csum != rp->c_key.k_csum) ++payload_misses; - return false; - } - /* Other discriminators */ - if (key->c_proc != rp->c_proc || - key->c_prot != rp->c_prot || - key->c_vers != rp->c_vers || - key->c_len != rp->c_len || - memcmp(&key->c_addr, &rp->c_addr, sizeof(key->c_addr)) != 0) - return false; - - return true; + return memcmp(&key->c_key, &rp->c_key, sizeof(key->c_key)); } /* @@ -367,7 +354,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key) list_for_each_entry(rp, rh, c_lru) { ++entries; - if (nfsd_cache_match(key, rp)) { + if (nfsd_cache_key_cmp(key, rp) == 0) { ret = rp; break; } @@ -510,7 +497,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) if (!rp) return; - hash = nfsd_cache_hash(rp->c_xid); + hash = nfsd_cache_hash(rp->c_key.k_xid); b = &drc_hashtbl[hash]; len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); From 736c6625de666f3fd0b47428f10568154033151a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 1 Oct 2018 10:41:57 -0400 Subject: [PATCH 22/30] knfsd: Improve lookup performance in the duplicate reply cache using an rbtree Use an rbtree to ensure the lookup/insert of an entry in a DRC bucket is O(log(N)). Signed-off-by: Trond Myklebust Signed-off-by: J. 
Bruce Fields --- fs/nfsd/cache.h | 1 + fs/nfsd/nfscache.c | 37 ++++++++++++++++++++++++++----------- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 745c861237ca..4a98537efb0f 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -30,6 +30,7 @@ struct svc_cacherep { struct sockaddr_in6 k_addr; } c_key; + struct rb_node c_node; struct list_head c_lru; unsigned char c_state, /* unused, inprog, done */ c_type, /* status, buffer */ diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 230cc83921ad..e2fe0e9ce0df 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -30,6 +30,7 @@ #define TARGET_BUCKET_SIZE 64 struct nfsd_drc_bucket { + struct rb_root rb_head; struct list_head lru_head; spinlock_t cache_lock; }; @@ -129,6 +130,7 @@ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum) if (rp) { rp->c_state = RC_UNUSED; rp->c_type = RC_NOCACHE; + RB_CLEAR_NODE(&rp->c_node); INIT_LIST_HEAD(&rp->c_lru); memset(&rp->c_key, 0, sizeof(rp->c_key)); @@ -145,13 +147,14 @@ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum) } static void -nfsd_reply_cache_free_locked(struct svc_cacherep *rp) +nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) { if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { drc_mem_usage -= rp->c_replvec.iov_len; kfree(rp->c_replvec.iov_base); } if (rp->c_state != RC_UNUSED) { + rb_erase(&rp->c_node, &b->rb_head); list_del(&rp->c_lru); atomic_dec(&num_drc_entries); drc_mem_usage -= sizeof(*rp); @@ -163,7 +166,7 @@ static void nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) { spin_lock(&b->cache_lock); - nfsd_reply_cache_free_locked(rp); + nfsd_reply_cache_free_locked(b, rp); spin_unlock(&b->cache_lock); } @@ -219,7 +222,7 @@ void nfsd_reply_cache_shutdown(void) struct list_head *head = &drc_hashtbl[i].lru_head; while (!list_empty(head)) { rp = list_first_entry(head, struct svc_cacherep, c_lru); - nfsd_reply_cache_free_locked(rp); + nfsd_reply_cache_free_locked(&drc_hashtbl[i], rp); } } @@ -258,7 +261,7 @@ prune_bucket(struct nfsd_drc_bucket *b) if (atomic_read(&num_drc_entries) <= max_drc_entries && time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) break; - nfsd_reply_cache_free_locked(rp); + nfsd_reply_cache_free_locked(b, rp); freed++; } return freed; @@ -349,17 +352,29 @@ static struct svc_cacherep * nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key) { struct svc_cacherep *rp, *ret = key; - struct list_head *rh = &b->lru_head; + struct rb_node **p = &b->rb_head.rb_node, + *parent = NULL; unsigned int entries = 0; + int cmp; - list_for_each_entry(rp, rh, c_lru) { + while (*p != NULL) { ++entries; - if (nfsd_cache_key_cmp(key, rp) == 0) { + parent = *p; + rp = rb_entry(parent, struct svc_cacherep, c_node); + + cmp = nfsd_cache_key_cmp(key, rp); + if (cmp < 0) + p = &parent->rb_left; + else if (cmp > 0) + p = &parent->rb_right; + else { ret = rp; - break; + goto out; } } - + rb_link_node(&key->c_node, parent, p); + rb_insert_color(&key->c_node, &b->rb_head); +out: /* tally hash chain length stats */ if (entries > longest_chain) { longest_chain = entries; @@ -414,7 +429,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) spin_lock(&b->cache_lock); found = nfsd_cache_insert(b, rp); if (found != rp) { - nfsd_reply_cache_free_locked(rp); + nfsd_reply_cache_free_locked(NULL, rp); rp = found; goto found_entry; } @@ -462,7 +477,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) break; default: printk(KERN_WARNING "nfsd: bad repcache type 
%d\n", rp->c_type); - nfsd_reply_cache_free_locked(rp); + nfsd_reply_cache_free_locked(b, rp); } goto out; From 0ac203cb1f03734606c0674eded43aaefb5a491a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 2 Oct 2018 12:08:48 +0200 Subject: [PATCH 23/30] nfsd: fix fall-through annotations Replace "fallthru" with a proper "fall through" annotation. Also, add an annotation were it is expected to fall through. These fixes are part of the ongoing efforts to enabling -Wimplicit-fallthrough Signed-off-by: Gustavo A. R. Silva Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ddbac8776bd3..94412768b782 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1406,6 +1406,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, *created = 1; break; } + /* fall through */ case NFS4_CREATE_EXCLUSIVE4_1: if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime && d_inode(dchild)->i_atime.tv_sec == v_atime @@ -1414,7 +1415,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, *created = 1; goto set_attr; } - /* fallthru */ + /* fall through */ case NFS3_CREATE_GUARDED: err = nfserr_exist; } From f3c1fd0ee294abd4367dfa72d89f016c682202f0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:15:56 -0400 Subject: [PATCH 24/30] svcrdma: Reduce max_send_sges There's no need to request a large number of send SGEs because the inline threshold already constrains the number of SGEs per Send. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 2848cafd4a17..2f7ec8912f49 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -475,10 +475,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* Qualify the transport resource defaults with the * capabilities of this particular device */ - newxprt->sc_max_send_sges = dev->attrs.max_send_sge; - /* transport hdr, head iovec, one page list entry, tail iovec */ - if (newxprt->sc_max_send_sges < 4) { - pr_err("svcrdma: too few Send SGEs available (%d)\n", + /* Transport header, head iovec, tail iovec */ + newxprt->sc_max_send_sges = 3; + /* Add one SGE per page list entry */ + newxprt->sc_max_send_sges += svcrdma_max_req_size / PAGE_SIZE; + if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) { + pr_err("svcrdma: too few Send SGEs available (%d needed)\n", newxprt->sc_max_send_sges); goto errout; } From 596f2a1950437095fee3062b75935c3bfdda0289 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:16:01 -0400 Subject: [PATCH 25/30] svcrdma: Remove ->release_rqst call in bc reply handler Similar to a change made in the client's forward channel reply handler: The xprt_release_rqst_cong() call is not necessary. Also, release xprt->recv_lock when taking xprt->transport_lock to avoid disabling and enabling BH's while holding another spin lock. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. 
Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index a68180090554..642f0310cc16 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -32,7 +32,6 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct kvec *dst, *src = &rcvbuf->head[0]; struct rpc_rqst *req; - unsigned long cwnd; u32 credits; size_t len; __be32 xid; @@ -66,6 +65,8 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, if (dst->iov_len < len) goto out_unlock; memcpy(dst->iov_base, p, len); + xprt_pin_rqst(req); + spin_unlock(&xprt->recv_lock); credits = be32_to_cpup(rdma_resp + 2); if (credits == 0) @@ -74,15 +75,13 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, credits = r_xprt->rx_buf.rb_bc_max_requests; spin_lock_bh(&xprt->transport_lock); - cwnd = xprt->cwnd; xprt->cwnd = credits << RPC_CWNDSHIFT; - if (xprt->cwnd > cwnd) - xprt_release_rqst_cong(req->rq_task); spin_unlock_bh(&xprt->transport_lock); - + spin_lock(&xprt->recv_lock); ret = 0; xprt_complete_rqst(req->rq_task, rcvbuf->len); + xprt_unpin_rqst(req); rcvbuf->len = 0; out_unlock: From 07880fa4968b07db9df0f126bb7fbde245ed12ff Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:16:06 -0400 Subject: [PATCH 26/30] svcrdma: Remove try_module_get from backchannel Since commit ffe1f0df5862 ("rpcrdma: Merge svcrdma and xprtrdma modules into one"), the forward and backchannel components are part of the same kernel module. A separate try_module_get() call in the backchannel code is no longer necessary. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 642f0310cc16..7e188bc793a2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -5,8 +5,6 @@ * Support for backward direction RPCs on RPC/RDMA (server-side). */ -#include - #include #include "xprt_rdma.h" @@ -255,7 +253,6 @@ xprt_rdma_bc_put(struct rpc_xprt *xprt) dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); xprt_free(xprt); - module_put(THIS_MODULE); } static const struct rpc_xprt_ops xprt_rdma_bc_procs = { @@ -327,20 +324,9 @@ xprt_setup_rdma_bc(struct xprt_create *args) args->bc_xprt->xpt_bc_xprt = xprt; xprt->bc_xprt = args->bc_xprt; - if (!try_module_get(THIS_MODULE)) - goto out_fail; - /* Final put for backchannel xprt is in __svc_rdma_free */ xprt_get(xprt); return xprt; - -out_fail: - xprt_rdma_free_addresses(xprt); - args->bc_xprt->xpt_bc_xprt = NULL; - args->bc_xprt->xpt_bc_xps = NULL; - xprt_put(xprt); - xprt_free(xprt); - return ERR_PTR(-EINVAL); } struct xprt_class xprt_rdma_bc = { From 3ae2cefb613b00d613677c05ffa384b4f660f468 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 1 Oct 2018 14:16:11 -0400 Subject: [PATCH 27/30] svcrdma: Increase the default connection credit limit Reduce queuing on clients by allowing more credits by default. 64 is the default NFSv4.1 slot table size on Linux clients. This size prevents the credit limit from putting RPC requests to sleep again after they have already slept waiting for a session slot. 
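To spell out the sizing behind this change (an illustrative sketch; the DEMO_* constants are hypothetical and not part of the patch):

    /*
     * A Linux client opens an NFSv4.1 session with 64 slots, so up
     * to 64 RPCs can be in flight once each has won a slot. With the
     * old grant of 32 credits, requests 33..64 slept a second time
     * waiting for a credit; granting 64 credits gives every slot
     * holder a credit as well.
     */
    enum {
        DEMO_CLIENT_SESSION_SLOTS = 64,
        DEMO_OLD_CREDIT_GRANT     = 32,
        DEMO_NEW_CREDIT_GRANT     = 64, /* == session slots */
    };
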
Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index fd78f78df5c6..e6e26918504c 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -113,13 +113,14 @@ struct svcxprt_rdma { /* sc_flags */ #define RDMAXPRT_CONN_PENDING 3 -#define RPCRDMA_LISTEN_BACKLOG 10 -#define RPCRDMA_MAX_REQUESTS 32 - -/* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our - * current NFSv4.1 implementation supports one backchannel slot. */ -#define RPCRDMA_MAX_BC_REQUESTS 2 +/* + * Default connection parameters */ +enum { + RPCRDMA_LISTEN_BACKLOG = 10, + RPCRDMA_MAX_REQUESTS = 64, + RPCRDMA_MAX_BC_REQUESTS = 2, +}; #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD From bd8d725078867cda250fe94b9c5a067b4a64ca74 Mon Sep 17 00:00:00 2001 From: Andrew Elble Date: Fri, 5 Oct 2018 09:32:08 -0400 Subject: [PATCH 28/30] nfsd: correctly decrement odstate refcount in error path alloc_init_deleg() both allocates an nfs4_delegation and bumps the refcount on odstate. So after this point, we need to put_clnt_odstate() and nfs4_put_stid() so as not to leave the odstate refcount inappropriately bumped. Signed-off-by: Andrew Elble Reviewed-by: Jeff Layton Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 07a57d024f95..f093fbe47133 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4398,7 +4398,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); if (!fl) - goto out_stid; + goto out_clnt_odstate; status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL); if (fl) @@ -4423,7 +4423,6 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, vfs_setlease(fp->fi_deleg_file, F_UNLCK, NULL, (void **)&dp); out_clnt_odstate: put_clnt_odstate(dp->dl_clnt_odstate); -out_stid: nfs4_put_stid(&dp->dl_stid); out_delegees: put_deleg_file(fp); From bb6ad5572c0022e17e846b382d7413cdcf8055be Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 9 Oct 2018 15:54:15 -0400 Subject: [PATCH 29/30] nfsd: Fix an Oops in free_session() In call_xpt_users(), we delete the entry from the list, but we do not reinitialise it. This triggers list poisoning when we later call unregister_xpt_user() in nfsd4_del_conns(). Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 5185efb9027b..83ccd0221c98 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -989,7 +989,7 @@ static void call_xpt_users(struct svc_xprt *xprt) spin_lock(&xprt->xpt_lock); while (!list_empty(&xprt->xpt_users)) { u = list_first_entry(&xprt->xpt_users, struct svc_xpt_user, list); - list_del(&u->list); + list_del_init(&u->list); u->callback(u); } spin_unlock(&xprt->xpt_lock); From 93f38b6fae0ea8987e22d9e6c38f8dfdccd867ee Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 28 Sep 2018 20:41:48 +0300 Subject: [PATCH 30/30] lockd: fix access beyond unterminated strings in prints The printk format used %*s instead of %.*s, so hostname_len does not limit the number of bytes accessed from hostname.
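For context, "%*s" consumes its int argument as a minimum field width, so the formatter still scans the string for a terminating NUL, whereas "%.*s" consumes it as a precision that caps the number of bytes read. A minimal user-space illustration (the hostname buffer here is hypothetical, not taken from lockd):

    #include <stdio.h>

    int main(void)
    {
        /* Five valid bytes with no NUL terminator among them. */
        const char hostname[5] = { 'h', 'o', 's', 't', '1' };
        int hostname_len = 5;

        /* Safe: the precision stops printf after hostname_len bytes. */
        printf("host='%.*s'\n", hostname_len, hostname);

        /*
         * Unsafe: a width only pads the output; printf would read
         * past hostname[4] looking for '\0':
         *
         *     printf("host='%*s'\n", hostname_len, hostname);
         */
        return 0;
    }
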
Signed-off-by: Amir Goldstein Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/lockd/host.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index d35cd6be0675..93fb7cf0b92b 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -341,7 +341,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, }; struct lockd_net *ln = net_generic(net, lockd_net_id); - dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, + dprintk("lockd: %s(host='%.*s', vers=%u, proto=%s)\n", __func__, (int)hostname_len, hostname, rqstp->rq_vers, (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
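
The RCU conversions earlier in this series (the svcauth_gss, idmap, and DNS-resolver patches, and the cache cleanups that follow them) share one shape: readers walk the hash chain under rcu_read_lock() and take a reference only via a get-unless-zero operation, while the kref release path defers the actual kfree() through call_rcu(), so lockless readers never dereference freed memory. A minimal self-contained sketch of that pattern, using a hypothetical struct demo_ent rather than the series' actual structures:

    #include <linux/kernel.h>
    #include <linux/kref.h>
    #include <linux/rculist.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    struct demo_ent {
        struct kref        ref;
        struct hlist_node  hash_link;
        struct rcu_head    rcu_head;
        char              *name;
    };

    static void demo_ent_free_rcu(struct rcu_head *head)
    {
        struct demo_ent *ent = container_of(head, struct demo_ent, rcu_head);

        kfree(ent->name);
        kfree(ent);
    }

    static void demo_ent_release(struct kref *ref)
    {
        struct demo_ent *ent = container_of(ref, struct demo_ent, ref);

        /* Defer the free: lockless readers may still be walking the
         * chain and dereferencing this entry. */
        call_rcu(&ent->rcu_head, demo_ent_free_rcu);
    }

    /* Reader side: no hash_lock taken. The entry cannot be freed under
     * us, and a refcount that already hit zero means it is dying. */
    static struct demo_ent *demo_ent_find(struct hlist_head *chain,
                                          const char *name)
    {
        struct demo_ent *ent;

        rcu_read_lock();
        hlist_for_each_entry_rcu(ent, chain, hash_link) {
            if (strcmp(ent->name, name) == 0 &&
                kref_get_unless_zero(&ent->ref)) {
                rcu_read_unlock();
                return ent;
            }
        }
        rcu_read_unlock();
        return NULL;
    }

A matching writer would unhash the entry with hlist_del_rcu() under the hash lock and then drop its reference with kref_put(&ent->ref, demo_ent_release); the memory survives until all pre-existing RCU readers have finished, which is what lets the hash_lock become a plain spinlock taken only by updaters.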