From 20509f1bc553ed7fafa88fa8d01c6212d1876d9f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Aug 2005 16:25:34 -0700 Subject: [PATCH 001/126] NFS: Drop inode after rename When doing a rename on top of an existing file that is not in use, the inode of the overwritten file will remain in the icache. The fix is to decrement i_nlink of the overwritten inode, like we do for unlink, rmdir etc already. Problem diagnosed by Olaf Kirch. This patch is a slight variation on his fix. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2df639f143e8..94a7fcee0624 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1539,7 +1539,8 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, #endif goto out; } - } + } else + new_inode->i_nlink--; go_ahead: /* From 449231d6ddf50ca46b7fb2f76ecf790135222913 Mon Sep 17 00:00:00 2001 From: Olaf Kirch Date: Thu, 25 Aug 2005 16:25:35 -0700 Subject: [PATCH 002/126] From: Olaf Kirch [PATCH] Fix miscompare in __posix_lock_file If an application requests the same lock twice, the kernel should just leave the existing lock in place. Currently, it will install a second lock of the same type. Signed-off-by: Olaf Kirch Signed-off-by: Trond Myklebust --- fs/locks.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/locks.c b/fs/locks.c index f7daa5f48949..7eb1d77b9204 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -829,12 +829,16 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request) /* Detect adjacent or overlapping regions (if same lock type) */ if (request->fl_type == fl->fl_type) { + /* In all comparisons of start vs end, use + * "start - 1" rather than "end + 1". If end + * is OFFSET_MAX, end + 1 will become negative. + */ if (fl->fl_end < request->fl_start - 1) goto next_lock; /* If the next lock in the list has entirely bigger * addresses than the new one, insert the lock here. */ - if (fl->fl_start > request->fl_end + 1) + if (fl->fl_start - 1 > request->fl_end) break; /* If we come here, the new and old lock are of the From 9aa48b7e270d13c8781414dce081a42cae20a80d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Aug 2005 16:25:35 -0700 Subject: [PATCH 003/126] NFS: Don't expose internal READDIR errors to userspace Fixes a condition whereby the kernel is returning the non-POSIX error EBADCOOKIE to userspace. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 94a7fcee0624..c70eabd6d179 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -565,8 +565,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) } } unlock_kernel(); - if (desc->error < 0) - return desc->error; if (res < 0) return res; return 0; From 23475d66bd8600e0c5353f86c1b74f68df27bdb5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:08 -0400 Subject: [PATCH 004/126] [PATCH] RPC: Report connection errors properly when mounting with "soft" Fix up xprt_connect_status: the soft timeout logic was clobbering tk_status, so TCP connect errors were not properly reported on soft mounts. Test-plan: Destructive testing (unplugging the network temporarily). Connectathon with UDP and TCP. Version: Thu, 11 Aug 2005 16:01:28 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 3c654e06b084..b28ea0cc0cb7 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -592,24 +592,33 @@ xprt_connect_status(struct rpc_task *task) return; } - /* if soft mounted, just cause this RPC to fail */ - if (RPC_IS_SOFT(task)) - task->tk_status = -EIO; - switch (task->tk_status) { case -ECONNREFUSED: case -ECONNRESET: + dprintk("RPC: %4d xprt_connect_status: server %s refused connection\n", + task->tk_pid, task->tk_client->cl_server); + break; case -ENOTCONN: - return; + dprintk("RPC: %4d xprt_connect_status: connection broken\n", + task->tk_pid); + break; case -ETIMEDOUT: - dprintk("RPC: %4d xprt_connect_status: timed out\n", + dprintk("RPC: %4d xprt_connect_status: connect attempt timed out\n", task->tk_pid); break; default: - printk(KERN_ERR "RPC: error %d connecting to server %s\n", - -task->tk_status, task->tk_client->cl_server); + dprintk("RPC: %4d xprt_connect_status: error %d connecting to server %s\n", + task->tk_pid, -task->tk_status, task->tk_client->cl_server); + xprt_release_write(xprt, task); + task->tk_status = -EIO; + return; + } + + /* if soft mounted, just cause this RPC to fail */ + if (RPC_IS_SOFT(task)) { + xprt_release_write(xprt, task); + task->tk_status = -EIO; } - xprt_release_write(xprt, task); } /* From da35187801732397a7e05fb9e77f3700cc35f5db Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:11 -0400 Subject: [PATCH 005/126] [PATCH] RPC: proper soft timeout behavior for rpcbind Implement a best practice: for soft mounts, an rpcbind timeout should cause an RPC request to fail. This also provides an FSM hook for retrying an rpcbind with a different rpcbind protocol version. We'll use this later to try multiple rpcbind protocol versions when binding. To enable this, expose the RPC error code returned during a portmap request to the FSM so it can make some decision about how to report, retry, or fail the request. Test-plan: Hundreds of passes with connectathon NFSv3 locking suite, on the client and server. Version: Thu, 11 Aug 2005 16:01:53 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 97 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 77 insertions(+), 20 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index f17e6153b688..2d3cf0a52d82 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -53,6 +53,7 @@ static void call_allocate(struct rpc_task *task); static void call_encode(struct rpc_task *task); static void call_decode(struct rpc_task *task); static void call_bind(struct rpc_task *task); +static void call_bind_status(struct rpc_task *task); static void call_transmit(struct rpc_task *task); static void call_status(struct rpc_task *task); static void call_refresh(struct rpc_task *task); @@ -734,43 +735,94 @@ static void call_bind(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_xprt *xprt = clnt->cl_xprt; - dprintk("RPC: %4d call_bind xprt %p %s connected\n", task->tk_pid, - xprt, (xprt_connected(xprt) ? "is" : "is not")); - - task->tk_action = (xprt_connected(xprt)) ? call_transmit : call_connect; + dprintk("RPC: %4d call_bind (status %d)\n", + task->tk_pid, task->tk_status); + task->tk_action = call_connect; if (!clnt->cl_port) { - task->tk_action = call_connect; + task->tk_action = call_bind_status; task->tk_timeout = RPC_CONNECT_TIMEOUT; rpc_getport(task, clnt); } } /* - * 4a. Connect to the RPC server (TCP case) + * 4a. Sort out bind result + */ +static void +call_bind_status(struct rpc_task *task) +{ + int status = -EACCES; + + if (task->tk_status >= 0) { + dprintk("RPC: %4d call_bind_status (status %d)\n", + task->tk_pid, task->tk_status); + task->tk_status = 0; + task->tk_action = call_connect; + return; + } + + switch (task->tk_status) { + case -EACCES: + dprintk("RPC: %4d remote rpcbind: RPC program/version unavailable\n", + task->tk_pid); + break; + case -ETIMEDOUT: + dprintk("RPC: %4d rpcbind request timed out\n", + task->tk_pid); + if (RPC_IS_SOFT(task)) { + status = -EIO; + break; + } + goto retry_bind; + case -EPFNOSUPPORT: + dprintk("RPC: %4d remote rpcbind service unavailable\n", + task->tk_pid); + break; + case -EPROTONOSUPPORT: + dprintk("RPC: %4d remote rpcbind version 2 unavailable\n", + task->tk_pid); + break; + default: + dprintk("RPC: %4d unrecognized rpcbind error (%d)\n", + task->tk_pid, -task->tk_status); + status = -EIO; + break; + } + + rpc_exit(task, status); + return; + +retry_bind: + task->tk_status = 0; + task->tk_action = call_bind; + return; +} + +/* + * 4b. Connect to the RPC server */ static void call_connect(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = task->tk_xprt; - dprintk("RPC: %4d call_connect status %d\n", - task->tk_pid, task->tk_status); + dprintk("RPC: %4d call_connect xprt %p %s connected\n", + task->tk_pid, xprt, + (xprt_connected(xprt) ? "is" : "is not")); - if (xprt_connected(clnt->cl_xprt)) { - task->tk_action = call_transmit; - return; + task->tk_action = call_transmit; + if (!xprt_connected(xprt)) { + task->tk_action = call_connect_status; + if (task->tk_status < 0) + return; + xprt_connect(task); } - task->tk_action = call_connect_status; - if (task->tk_status < 0) - return; - xprt_connect(task); } /* - * 4b. Sort out connect result + * 4c. Sort out connect result */ static void call_connect_status(struct rpc_task *task) @@ -778,6 +830,9 @@ call_connect_status(struct rpc_task *task) struct rpc_clnt *clnt = task->tk_client; int status = task->tk_status; + dprintk("RPC: %5u call_connect_status (status %d)\n", + task->tk_pid, task->tk_status); + task->tk_status = 0; if (status >= 0) { clnt->cl_stats->netreconn++; @@ -785,17 +840,19 @@ call_connect_status(struct rpc_task *task) return; } - /* Something failed: we may have to rebind */ + /* Something failed: remote service port may have changed */ if (clnt->cl_autobind) clnt->cl_port = 0; + switch (status) { case -ENOTCONN: case -ETIMEDOUT: case -EAGAIN: - task->tk_action = (clnt->cl_port == 0) ? call_bind : call_connect; + task->tk_action = call_bind; break; default: rpc_exit(task, -EIO); + break; } } From eab5c084b858fd95a873fc2b97de9a9ad937b4ed Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:14 -0400 Subject: [PATCH 006/126] [PATCH] NFS: use a constant value for TCP retransmit timeouts Implement a best practice: don't use exponential backoff when computing retransmit timeout values on TCP connections, but simply retransmit at regular intervals. This also fixes a bug introduced when xprt_reset_majortimeo() was added. Test-plan: Enable RPC debugging and watch timeout behavior on a NFS/TCP mount. Version: Thu, 11 Aug 2005 16:02:19 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 73 +++++++++++++++++++++++------------------------ net/sunrpc/xprt.c | 4 +-- 2 files changed, 37 insertions(+), 40 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6922469d6fc5..b6a1ca508e60 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -358,6 +358,35 @@ nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor) return no_root_error; } +static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans) +{ + to->to_initval = timeo * HZ / 10; + to->to_retries = retrans; + if (!to->to_retries) + to->to_retries = 2; + + switch (proto) { + case IPPROTO_TCP: + if (!to->to_initval) + to->to_initval = 60 * HZ; + if (to->to_initval > RPC_MAX_TCP_TIMEOUT) + to->to_initval = RPC_MAX_TCP_TIMEOUT; + to->to_increment = to->to_initval; + to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); + to->to_exponential = 0; + break; + case IPPROTO_UDP: + default: + if (!to->to_initval) + to->to_initval = 11 * HZ / 10; + if (to->to_initval > RPC_MAX_UDP_TIMEOUT) + to->to_initval = RPC_MAX_UDP_TIMEOUT; + to->to_maxval = RPC_MAX_UDP_TIMEOUT; + to->to_exponential = 1; + break; + } +} + /* * Create an RPC client handle. */ @@ -367,22 +396,12 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data) struct rpc_timeout timeparms; struct rpc_xprt *xprt = NULL; struct rpc_clnt *clnt = NULL; - int tcp = (data->flags & NFS_MOUNT_TCP); + int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP; - /* Initialize timeout values */ - timeparms.to_initval = data->timeo * HZ / 10; - timeparms.to_retries = data->retrans; - timeparms.to_maxval = tcp ? RPC_MAX_TCP_TIMEOUT : RPC_MAX_UDP_TIMEOUT; - timeparms.to_exponential = 1; - - if (!timeparms.to_initval) - timeparms.to_initval = (tcp ? 600 : 11) * HZ / 10; - if (!timeparms.to_retries) - timeparms.to_retries = 5; + nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans); /* create transport and client */ - xprt = xprt_create_proto(tcp ? IPPROTO_TCP : IPPROTO_UDP, - &server->addr, &timeparms); + xprt = xprt_create_proto(proto, &server->addr, &timeparms); if (IS_ERR(xprt)) { dprintk("%s: cannot create RPC transport. Error = %ld\n", __FUNCTION__, PTR_ERR(xprt)); @@ -1674,7 +1693,7 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, struct rpc_clnt *clnt = NULL; struct rpc_timeout timeparms; rpc_authflavor_t authflavour; - int proto, err = -EIO; + int err = -EIO; sb->s_blocksize_bits = 0; sb->s_blocksize = 0; @@ -1692,30 +1711,8 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, server->acdirmax = data->acdirmax*HZ; server->rpc_ops = &nfs_v4_clientops; - /* Initialize timeout values */ - timeparms.to_initval = data->timeo * HZ / 10; - timeparms.to_retries = data->retrans; - timeparms.to_exponential = 1; - if (!timeparms.to_retries) - timeparms.to_retries = 5; - - proto = data->proto; - /* Which IP protocol do we use? */ - switch (proto) { - case IPPROTO_TCP: - timeparms.to_maxval = RPC_MAX_TCP_TIMEOUT; - if (!timeparms.to_initval) - timeparms.to_initval = 600 * HZ / 10; - break; - case IPPROTO_UDP: - timeparms.to_maxval = RPC_MAX_UDP_TIMEOUT; - if (!timeparms.to_initval) - timeparms.to_initval = 11 * HZ / 10; - break; - default: - return -EINVAL; - } + nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans); clp = nfs4_get_client(&server->addr.sin_addr); if (!clp) { @@ -1740,7 +1737,7 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, down_write(&clp->cl_sem); if (IS_ERR(clp->cl_rpcclient)) { - xprt = xprt_create_proto(proto, &server->addr, &timeparms); + xprt = xprt_create_proto(data->proto, &server->addr, &timeparms); if (IS_ERR(xprt)) { up_write(&clp->cl_sem); err = PTR_ERR(xprt); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index b28ea0cc0cb7..0e4ffdaa0129 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1453,7 +1453,7 @@ xprt_default_timeout(struct rpc_timeout *to, int proto) if (proto == IPPROTO_UDP) xprt_set_timeout(to, 5, 5 * HZ); else - xprt_set_timeout(to, 5, 60 * HZ); + xprt_set_timeout(to, 2, 60 * HZ); } /* @@ -1464,7 +1464,7 @@ xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr) { to->to_initval = to->to_increment = incr; - to->to_maxval = incr * retr; + to->to_maxval = to->to_initval + (incr * retr); to->to_retries = retr; to->to_exponential = 0; } From 602f83273c89fdd25f24757564d8001cf723e740 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:17 -0400 Subject: [PATCH 007/126] [PATCH] RPC: portmapper doesn't need a reserved port The in-kernel portmapper does not require a reserved port for making bind queries. Test-plan: Tens of runs of the Connectathon locking suite with TCP and UDP against several other NFS server implementations using NFSv3, not NFSv4 (which doesn't require rpcbind). Version: Thu, 11 Aug 2005 16:02:43 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/pmap_clnt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c index 4e81f2766923..d8e3f220002b 100644 --- a/net/sunrpc/pmap_clnt.c +++ b/net/sunrpc/pmap_clnt.c @@ -208,6 +208,7 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto) if (IS_ERR(xprt)) return (struct rpc_clnt *)xprt; xprt->addr.sin_port = htons(RPC_PMAP_PORT); + xprt->resvport = 0; /* printk("pmap: create clnt\n"); */ clnt = rpc_new_client(xprt, hostname, From 094bb20b9fcab3a1652a77741caba6b78097d622 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:20 -0400 Subject: [PATCH 008/126] [PATCH] RPC: extract socket logic common to both client and server Clean-up: Move some code that is common to both RPC client- and server-side socket transports into its own source file, net/sunrpc/socklib.c. Test-plan: Compile kernel with CONFIG_NFS enabled. Millions of fsx operations over UDP, client and server. Connectathon over UDP. Version: Thu, 11 Aug 2005 16:03:09 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 1 + net/sunrpc/Makefile | 2 +- net/sunrpc/socklib.c | 175 +++++++++++++++++++++++++++++++++++++ net/sunrpc/svcsock.c | 3 - net/sunrpc/xdr.c | 75 ---------------- net/sunrpc/xprt.c | 64 -------------- 6 files changed, 177 insertions(+), 143 deletions(-) create mode 100644 net/sunrpc/socklib.c diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 23448d0fb5bc..d8b7656bca41 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -161,6 +161,7 @@ typedef struct { typedef size_t (*skb_read_actor_t)(skb_reader_t *desc, void *to, size_t len); +extern int csum_partial_copy_to_xdr(struct xdr_buf *, struct sk_buff *); extern ssize_t xdr_partial_copy_from_skb(struct xdr_buf *, unsigned int, skb_reader_t *, skb_read_actor_t); diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 46a2ce00a29b..f0a955627177 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_SUNRPC) += sunrpc.o obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ -sunrpc-y := clnt.o xprt.o sched.o \ +sunrpc-y := clnt.o xprt.o socklib.o sched.o \ auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ pmap_clnt.o timer.o xdr.o \ diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c new file mode 100644 index 000000000000..8f97e90f36c8 --- /dev/null +++ b/net/sunrpc/socklib.c @@ -0,0 +1,175 @@ +/* + * linux/net/sunrpc/socklib.c + * + * Common socket helper routines for RPC client and server + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include +#include + + +/** + * skb_read_bits - copy some data bits from skb to internal buffer + * @desc: sk_buff copy helper + * @to: copy destination + * @len: number of bytes to copy + * + * Possibly called several times to iterate over an sk_buff and copy + * data out of it. + */ +static size_t skb_read_bits(skb_reader_t *desc, void *to, size_t len) +{ + if (len > desc->count) + len = desc->count; + if (skb_copy_bits(desc->skb, desc->offset, to, len)) + return 0; + desc->count -= len; + desc->offset += len; + return len; +} + +/** + * skb_read_and_csum_bits - copy and checksum from skb to buffer + * @desc: sk_buff copy helper + * @to: copy destination + * @len: number of bytes to copy + * + * Same as skb_read_bits, but calculate a checksum at the same time. + */ +static size_t skb_read_and_csum_bits(skb_reader_t *desc, void *to, size_t len) +{ + unsigned int csum2, pos; + + if (len > desc->count) + len = desc->count; + pos = desc->offset; + csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0); + desc->csum = csum_block_add(desc->csum, csum2, pos); + desc->count -= len; + desc->offset += len; + return len; +} + +/** + * xdr_partial_copy_from_skb - copy data out of an skb + * @xdr: target XDR buffer + * @base: starting offset + * @desc: sk_buff copy helper + * @copy_actor: virtual method for copying data + * + */ +ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, skb_reader_t *desc, skb_read_actor_t copy_actor) +{ + struct page **ppage = xdr->pages; + unsigned int len, pglen = xdr->page_len; + ssize_t copied = 0; + int ret; + + len = xdr->head[0].iov_len; + if (base < len) { + len -= base; + ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len); + copied += ret; + if (ret != len || !desc->count) + goto out; + base = 0; + } else + base -= len; + + if (unlikely(pglen == 0)) + goto copy_tail; + if (unlikely(base >= pglen)) { + base -= pglen; + goto copy_tail; + } + if (base || xdr->page_base) { + pglen -= base; + base += xdr->page_base; + ppage += base >> PAGE_CACHE_SHIFT; + base &= ~PAGE_CACHE_MASK; + } + do { + char *kaddr; + + /* ACL likes to be lazy in allocating pages - ACLs + * are small by default but can get huge. */ + if (unlikely(*ppage == NULL)) { + *ppage = alloc_page(GFP_ATOMIC); + if (unlikely(*ppage == NULL)) { + if (copied == 0) + copied = -ENOMEM; + goto out; + } + } + + len = PAGE_CACHE_SIZE; + kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA); + if (base) { + len -= base; + if (pglen < len) + len = pglen; + ret = copy_actor(desc, kaddr + base, len); + base = 0; + } else { + if (pglen < len) + len = pglen; + ret = copy_actor(desc, kaddr, len); + } + flush_dcache_page(*ppage); + kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA); + copied += ret; + if (ret != len || !desc->count) + goto out; + ppage++; + } while ((pglen -= len) != 0); +copy_tail: + len = xdr->tail[0].iov_len; + if (base < len) + copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base); +out: + return copied; +} + +/** + * csum_partial_copy_to_xdr - checksum and copy data + * @xdr: target XDR buffer + * @skb: source skb + * + * We have set things up such that we perform the checksum of the UDP + * packet in parallel with the copies into the RPC client iovec. -DaveM + */ +int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) +{ + skb_reader_t desc; + + desc.skb = skb; + desc.offset = sizeof(struct udphdr); + desc.count = skb->len - desc.offset; + + if (skb->ip_summed == CHECKSUM_UNNECESSARY) + goto no_checksum; + + desc.csum = csum_partial(skb->data, desc.offset, skb->csum); + if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits) < 0) + return -1; + if (desc.offset != skb->len) { + unsigned int csum2; + csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0); + desc.csum = csum_block_add(desc.csum, csum2, desc.offset); + } + if (desc.count) + return -1; + if ((unsigned short)csum_fold(desc.csum)) + return -1; + return 0; +no_checksum: + if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0) + return -1; + if (desc.count) + return -1; + return 0; +} diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 30ec3efc48a6..130f2b5d93dd 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -548,9 +548,6 @@ svc_write_space(struct sock *sk) /* * Receive a datagram from a UDP socket. */ -extern int -csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb); - static int svc_udp_recvfrom(struct svc_rqst *rqstp) { diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index fde16f40a581..9cc12aeed22c 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -176,81 +176,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, xdr->buflen += len; } -ssize_t -xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, - skb_reader_t *desc, - skb_read_actor_t copy_actor) -{ - struct page **ppage = xdr->pages; - unsigned int len, pglen = xdr->page_len; - ssize_t copied = 0; - int ret; - - len = xdr->head[0].iov_len; - if (base < len) { - len -= base; - ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len); - copied += ret; - if (ret != len || !desc->count) - goto out; - base = 0; - } else - base -= len; - - if (pglen == 0) - goto copy_tail; - if (base >= pglen) { - base -= pglen; - goto copy_tail; - } - if (base || xdr->page_base) { - pglen -= base; - base += xdr->page_base; - ppage += base >> PAGE_CACHE_SHIFT; - base &= ~PAGE_CACHE_MASK; - } - do { - char *kaddr; - - /* ACL likes to be lazy in allocating pages - ACLs - * are small by default but can get huge. */ - if (unlikely(*ppage == NULL)) { - *ppage = alloc_page(GFP_ATOMIC); - if (unlikely(*ppage == NULL)) { - if (copied == 0) - copied = -ENOMEM; - goto out; - } - } - - len = PAGE_CACHE_SIZE; - kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA); - if (base) { - len -= base; - if (pglen < len) - len = pglen; - ret = copy_actor(desc, kaddr + base, len); - base = 0; - } else { - if (pglen < len) - len = pglen; - ret = copy_actor(desc, kaddr, len); - } - flush_dcache_page(*ppage); - kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA); - copied += ret; - if (ret != len || !desc->count) - goto out; - ppage++; - } while ((pglen -= len) != 0); -copy_tail: - len = xdr->tail[0].iov_len; - if (base < len) - copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base); -out: - return copied; -} - int xdr_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 0e4ffdaa0129..67444f494fea 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -691,70 +691,6 @@ xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) return; } -static size_t -skb_read_bits(skb_reader_t *desc, void *to, size_t len) -{ - if (len > desc->count) - len = desc->count; - if (skb_copy_bits(desc->skb, desc->offset, to, len)) - return 0; - desc->count -= len; - desc->offset += len; - return len; -} - -static size_t -skb_read_and_csum_bits(skb_reader_t *desc, void *to, size_t len) -{ - unsigned int csum2, pos; - - if (len > desc->count) - len = desc->count; - pos = desc->offset; - csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0); - desc->csum = csum_block_add(desc->csum, csum2, pos); - desc->count -= len; - desc->offset += len; - return len; -} - -/* - * We have set things up such that we perform the checksum of the UDP - * packet in parallel with the copies into the RPC client iovec. -DaveM - */ -int -csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) -{ - skb_reader_t desc; - - desc.skb = skb; - desc.offset = sizeof(struct udphdr); - desc.count = skb->len - desc.offset; - - if (skb->ip_summed == CHECKSUM_UNNECESSARY) - goto no_checksum; - - desc.csum = csum_partial(skb->data, desc.offset, skb->csum); - if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits) < 0) - return -1; - if (desc.offset != skb->len) { - unsigned int csum2; - csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0); - desc.csum = csum_block_add(desc.csum, csum2, desc.offset); - } - if (desc.count) - return -1; - if ((unsigned short)csum_fold(desc.csum)) - return -1; - return 0; -no_checksum: - if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0) - return -1; - if (desc.count) - return -1; - return 0; -} - /* * Input handler for RPC replies. Called from a bottom half and hence * atomic. From a246b0105bbd9a70a698f69baae2042996f2a0e9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:23 -0400 Subject: [PATCH 009/126] [PATCH] RPC: introduce client-side transport switch Move the bulk of client-side socket-specific code into a separate source file, net/sunrpc/xprtsock.c. Test-plan: Millions of fsx operations. Performance characterization such as "sio" or "iozone". Destructive testing (unplugging the network temporarily, server reboots). Connectathon with v2, v3, and v4. Version: Thu, 11 Aug 2005 16:03:38 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 5 - include/linux/sunrpc/xprt.h | 38 +- net/sunrpc/Makefile | 2 +- net/sunrpc/clnt.c | 3 +- net/sunrpc/sysctl.c | 3 + net/sunrpc/xdr.c | 102 +--- net/sunrpc/xprt.c | 916 ++----------------------------- net/sunrpc/xprtsock.c | 1021 +++++++++++++++++++++++++++++++++++ 8 files changed, 1101 insertions(+), 989 deletions(-) create mode 100644 net/sunrpc/xprtsock.c diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index d8b7656bca41..5da968729cf8 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -165,11 +165,6 @@ extern int csum_partial_copy_to_xdr(struct xdr_buf *, struct sk_buff *); extern ssize_t xdr_partial_copy_from_skb(struct xdr_buf *, unsigned int, skb_reader_t *, skb_read_actor_t); -struct socket; -struct sockaddr; -extern int xdr_sendpages(struct socket *, struct sockaddr *, int, - struct xdr_buf *, unsigned int, int); - extern int xdr_encode_word(struct xdr_buf *, int, u32); extern int xdr_decode_word(struct xdr_buf *, int, u32 *); diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index e618c1649814..d82b47ab73cb 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -59,7 +59,13 @@ extern unsigned int xprt_tcp_slot_table_entries; */ #define RPC_REESTABLISH_TIMEOUT (15*HZ) -/* RPC call and reply header size as number of 32bit words (verifier +/* + * RPC transport idle timeout. + */ +#define RPC_IDLE_DISCONNECT_TIMEOUT (5*60*HZ) + +/* + * RPC call and reply header size as number of 32bit words (verifier * size computed separately) */ #define RPC_CALLHDRSIZE 6 @@ -121,12 +127,19 @@ struct rpc_rqst { #define rq_svec rq_snd_buf.head #define rq_slen rq_snd_buf.len -#define XPRT_LAST_FRAG (1 << 0) -#define XPRT_COPY_RECM (1 << 1) -#define XPRT_COPY_XID (1 << 2) -#define XPRT_COPY_DATA (1 << 3) +struct rpc_task; +struct rpc_xprt; + +struct rpc_xprt_ops { + void (*set_buffer_size)(struct rpc_xprt *xprt); + void (*connect)(struct rpc_task *task); + int (*send_request)(struct rpc_task *task); + void (*close)(struct rpc_xprt *xprt); + void (*destroy)(struct rpc_xprt *xprt); +}; struct rpc_xprt { + struct rpc_xprt_ops * ops; /* transport methods */ struct socket * sock; /* BSD socket layer */ struct sock * inet; /* INET layer */ @@ -199,14 +212,22 @@ struct rpc_xprt { wait_queue_head_t cong_wait; }; +#define XPRT_LAST_FRAG (1 << 0) +#define XPRT_COPY_RECM (1 << 1) +#define XPRT_COPY_XID (1 << 2) +#define XPRT_COPY_DATA (1 << 3) + #ifdef __KERNEL__ struct rpc_xprt * xprt_create_proto(int proto, struct sockaddr_in *addr, struct rpc_timeout *toparms); +void xprt_disconnect(struct rpc_xprt *); int xprt_destroy(struct rpc_xprt *); void xprt_set_timeout(struct rpc_timeout *, unsigned int, unsigned long); - +struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *, u32); +void xprt_complete_rqst(struct rpc_xprt *, + struct rpc_rqst *, int); void xprt_reserve(struct rpc_task *); int xprt_prepare_transmit(struct rpc_task *); void xprt_transmit(struct rpc_task *); @@ -214,7 +235,10 @@ void xprt_receive(struct rpc_task *); int xprt_adjust_timeout(struct rpc_rqst *req); void xprt_release(struct rpc_task *); void xprt_connect(struct rpc_task *); -void xprt_sock_setbufsize(struct rpc_xprt *); +int xs_setup_udp(struct rpc_xprt *, + struct rpc_timeout *); +int xs_setup_tcp(struct rpc_xprt *, + struct rpc_timeout *); #define XPRT_LOCKED 0 #define XPRT_CONNECT 1 diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index f0a955627177..cdcab9ca4c60 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_SUNRPC) += sunrpc.o obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ -sunrpc-y := clnt.o xprt.o socklib.o sched.o \ +sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ pmap_clnt.o timer.o xdr.o \ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2d3cf0a52d82..ab50c3c9e6a8 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -525,8 +525,7 @@ rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize xprt->rcvsize = 0; if (rcvsize) xprt->rcvsize = rcvsize + RPC_SLACK_SPACE; - if (xprt_connected(xprt)) - xprt_sock_setbufsize(xprt); + xprt->ops->set_buffer_size(xprt); } /* diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index 1b9616a12e24..ef483262f17f 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -119,6 +119,9 @@ proc_dodebug(ctl_table *table, int write, struct file *file, return 0; } +unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; +unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; + static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 9cc12aeed22c..32df43372ee9 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -6,15 +6,12 @@ * Copyright (C) 1995, 1996 Olaf Kirch */ +#include #include -#include #include #include #include #include -#include -#include -#include #include #include @@ -177,103 +174,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, } -int -xdr_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, - struct xdr_buf *xdr, unsigned int base, int msgflags) -{ - struct page **ppage = xdr->pages; - unsigned int len, pglen = xdr->page_len; - int err, ret = 0; - ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int); - - len = xdr->head[0].iov_len; - if (base < len || (addr != NULL && base == 0)) { - struct kvec iov = { - .iov_base = xdr->head[0].iov_base + base, - .iov_len = len - base, - }; - struct msghdr msg = { - .msg_name = addr, - .msg_namelen = addrlen, - .msg_flags = msgflags, - }; - if (xdr->len > len) - msg.msg_flags |= MSG_MORE; - - if (iov.iov_len != 0) - err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); - else - err = kernel_sendmsg(sock, &msg, NULL, 0, 0); - if (ret == 0) - ret = err; - else if (err > 0) - ret += err; - if (err != iov.iov_len) - goto out; - base = 0; - } else - base -= len; - - if (pglen == 0) - goto copy_tail; - if (base >= pglen) { - base -= pglen; - goto copy_tail; - } - if (base || xdr->page_base) { - pglen -= base; - base += xdr->page_base; - ppage += base >> PAGE_CACHE_SHIFT; - base &= ~PAGE_CACHE_MASK; - } - - sendpage = sock->ops->sendpage ? : sock_no_sendpage; - do { - int flags = msgflags; - - len = PAGE_CACHE_SIZE; - if (base) - len -= base; - if (pglen < len) - len = pglen; - - if (pglen != len || xdr->tail[0].iov_len != 0) - flags |= MSG_MORE; - - /* Hmm... We might be dealing with highmem pages */ - if (PageHighMem(*ppage)) - sendpage = sock_no_sendpage; - err = sendpage(sock, *ppage, base, len, flags); - if (ret == 0) - ret = err; - else if (err > 0) - ret += err; - if (err != len) - goto out; - base = 0; - ppage++; - } while ((pglen -= len) != 0); -copy_tail: - len = xdr->tail[0].iov_len; - if (base < len) { - struct kvec iov = { - .iov_base = xdr->tail[0].iov_base + base, - .iov_len = len - base, - }; - struct msghdr msg = { - .msg_flags = msgflags, - }; - err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); - if (ret == 0) - ret = err; - else if (err > 0) - ret += err; - } -out: - return ret; -} - - /* * Helper routines for doing 'memmove' like operations on a struct xdr_buf * diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 67444f494fea..4342acf4d1cd 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -32,37 +32,16 @@ * tasks that rely on callbacks. * * Copyright (C) 1995-1997, Olaf Kirch - * - * TCP callback races fixes (C) 1998 Red Hat Software - * TCP send fixes (C) 1998 Red Hat Software - * TCP NFS related read + write fixes - * (C) 1999 Dave Airlie, University of Limerick, Ireland - * - * Rewrite of larges part of the code in order to stabilize TCP stuff. - * Fix behaviour when socket buffer is full. - * (C) 1999 Trond Myklebust */ +#include + #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include #include -#include -#include -#include -#include +#include /* * Local variables @@ -74,64 +53,17 @@ #endif #define XPRT_MAX_BACKOFF (8) -#define XPRT_IDLE_TIMEOUT (5*60*HZ) -#define XPRT_MAX_RESVPORT (800) /* * Local functions */ static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); static inline void do_xprt_reserve(struct rpc_task *); -static void xprt_disconnect(struct rpc_xprt *); static void xprt_connect_status(struct rpc_task *task); -static struct rpc_xprt * xprt_setup(int proto, struct sockaddr_in *ap, - struct rpc_timeout *to); -static struct socket *xprt_create_socket(struct rpc_xprt *, int, int); -static void xprt_bind_socket(struct rpc_xprt *, struct socket *); static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); static int xprt_clear_backlog(struct rpc_xprt *xprt); -#ifdef RPC_DEBUG_DATA -/* - * Print the buffer contents (first 128 bytes only--just enough for - * diropres return). - */ -static void -xprt_pktdump(char *msg, u32 *packet, unsigned int count) -{ - u8 *buf = (u8 *) packet; - int j; - - dprintk("RPC: %s\n", msg); - for (j = 0; j < count && j < 128; j += 4) { - if (!(j & 31)) { - if (j) - dprintk("\n"); - dprintk("0x%04x ", j); - } - dprintk("%02x%02x%02x%02x ", - buf[j], buf[j+1], buf[j+2], buf[j+3]); - } - dprintk("\n"); -} -#else -static inline void -xprt_pktdump(char *msg, u32 *packet, unsigned int count) -{ - /* NOP */ -} -#endif - -/* - * Look up RPC transport given an INET socket - */ -static inline struct rpc_xprt * -xprt_from_sock(struct sock *sk) -{ - return (struct rpc_xprt *) sk->sk_user_data; -} - /* * Serialize write access to sockets, in order to prevent different * requests from interfering with each other. @@ -234,62 +166,6 @@ xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) spin_unlock_bh(&xprt->sock_lock); } -/* - * Write data to socket. - */ -static inline int -xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) -{ - struct socket *sock = xprt->sock; - struct xdr_buf *xdr = &req->rq_snd_buf; - struct sockaddr *addr = NULL; - int addrlen = 0; - unsigned int skip; - int result; - - if (!sock) - return -ENOTCONN; - - xprt_pktdump("packet data:", - req->rq_svec->iov_base, - req->rq_svec->iov_len); - - /* For UDP, we need to provide an address */ - if (!xprt->stream) { - addr = (struct sockaddr *) &xprt->addr; - addrlen = sizeof(xprt->addr); - } - /* Dont repeat bytes */ - skip = req->rq_bytes_sent; - - clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); - result = xdr_sendpages(sock, addr, addrlen, xdr, skip, MSG_DONTWAIT); - - dprintk("RPC: xprt_sendmsg(%d) = %d\n", xdr->len - skip, result); - - if (result >= 0) - return result; - - switch (result) { - case -ECONNREFUSED: - /* When the server has died, an ICMP port unreachable message - * prompts ECONNREFUSED. - */ - case -EAGAIN: - break; - case -ECONNRESET: - case -ENOTCONN: - case -EPIPE: - /* connection broken */ - if (xprt->stream) - result = -ENOTCONN; - break; - default: - printk(KERN_NOTICE "RPC: sendmsg returned error %d\n", -result); - } - return result; -} - /* * Van Jacobson congestion avoidance. Check if the congestion window * overflowed. Put the task to sleep if this is the case. @@ -405,48 +281,20 @@ int xprt_adjust_timeout(struct rpc_rqst *req) return status; } -/* - * Close down a transport socket - */ -static void -xprt_close(struct rpc_xprt *xprt) -{ - struct socket *sock = xprt->sock; - struct sock *sk = xprt->inet; - - if (!sk) - return; - - write_lock_bh(&sk->sk_callback_lock); - xprt->inet = NULL; - xprt->sock = NULL; - - sk->sk_user_data = NULL; - sk->sk_data_ready = xprt->old_data_ready; - sk->sk_state_change = xprt->old_state_change; - sk->sk_write_space = xprt->old_write_space; - write_unlock_bh(&sk->sk_callback_lock); - - sk->sk_no_check = 0; - - sock_release(sock); -} - static void xprt_socket_autoclose(void *args) { struct rpc_xprt *xprt = (struct rpc_xprt *)args; xprt_disconnect(xprt); - xprt_close(xprt); + xprt->ops->close(xprt); xprt_release_write(xprt, NULL); } /* * Mark a transport as disconnected */ -static void -xprt_disconnect(struct rpc_xprt *xprt) +void xprt_disconnect(struct rpc_xprt *xprt) { dprintk("RPC: disconnected transport %p\n", xprt); spin_lock_bh(&xprt->sock_lock); @@ -479,57 +327,6 @@ xprt_init_autodisconnect(unsigned long data) spin_unlock(&xprt->sock_lock); } -static void xprt_socket_connect(void *args) -{ - struct rpc_xprt *xprt = (struct rpc_xprt *)args; - struct socket *sock = xprt->sock; - int status = -EIO; - - if (xprt->shutdown || xprt->addr.sin_port == 0) - goto out; - - /* - * Start by resetting any existing state - */ - xprt_close(xprt); - sock = xprt_create_socket(xprt, xprt->prot, xprt->resvport); - if (sock == NULL) { - /* couldn't create socket or bind to reserved port; - * this is likely a permanent error, so cause an abort */ - goto out; - } - xprt_bind_socket(xprt, sock); - xprt_sock_setbufsize(xprt); - - status = 0; - if (!xprt->stream) - goto out; - - /* - * Tell the socket layer to start connecting... - */ - status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, - sizeof(xprt->addr), O_NONBLOCK); - dprintk("RPC: %p connect status %d connected %d sock state %d\n", - xprt, -status, xprt_connected(xprt), sock->sk->sk_state); - if (status < 0) { - switch (status) { - case -EINPROGRESS: - case -EALREADY: - goto out_clear; - } - } -out: - if (status < 0) - rpc_wake_up_status(&xprt->pending, status); - else - rpc_wake_up(&xprt->pending); -out_clear: - smp_mb__before_clear_bit(); - clear_bit(XPRT_CONNECTING, &xprt->sockstate); - smp_mb__after_clear_bit(); -} - /* * Attempt to connect a TCP socket. * @@ -552,30 +349,16 @@ void xprt_connect(struct rpc_task *task) if (!xprt_lock_write(xprt, task)) return; if (xprt_connected(xprt)) - goto out_write; + xprt_release_write(xprt, task); + else { + if (task->tk_rqstp) + task->tk_rqstp->rq_bytes_sent = 0; - if (task->tk_rqstp) - task->tk_rqstp->rq_bytes_sent = 0; - - task->tk_timeout = RPC_CONNECT_TIMEOUT; - rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL); - if (!test_and_set_bit(XPRT_CONNECTING, &xprt->sockstate)) { - /* Note: if we are here due to a dropped connection - * we delay reconnecting by RPC_REESTABLISH_TIMEOUT/HZ - * seconds - */ - if (xprt->sock != NULL) - schedule_delayed_work(&xprt->sock_connect, - RPC_REESTABLISH_TIMEOUT); - else { - schedule_work(&xprt->sock_connect); - if (!RPC_IS_ASYNC(task)) - flush_scheduled_work(); - } + task->tk_timeout = RPC_CONNECT_TIMEOUT; + rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL); + xprt->ops->connect(task); } return; - out_write: - xprt_release_write(xprt, task); } /* @@ -624,8 +407,7 @@ xprt_connect_status(struct rpc_task *task) /* * Look up the RPC request corresponding to a reply, and then lock it. */ -static inline struct rpc_rqst * -xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) +struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) { struct list_head *pos; struct rpc_rqst *req = NULL; @@ -644,8 +426,7 @@ xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) * Complete reply received. * The TCP code relies on us to remove the request from xprt->pending. */ -static void -xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) +void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) { struct rpc_task *task = req->rq_task; struct rpc_clnt *clnt = task->tk_client; @@ -691,409 +472,6 @@ xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) return; } -/* - * Input handler for RPC replies. Called from a bottom half and hence - * atomic. - */ -static void -udp_data_ready(struct sock *sk, int len) -{ - struct rpc_task *task; - struct rpc_xprt *xprt; - struct rpc_rqst *rovr; - struct sk_buff *skb; - int err, repsize, copied; - u32 _xid, *xp; - - read_lock(&sk->sk_callback_lock); - dprintk("RPC: udp_data_ready...\n"); - if (!(xprt = xprt_from_sock(sk))) { - printk("RPC: udp_data_ready request not found!\n"); - goto out; - } - - dprintk("RPC: udp_data_ready client %p\n", xprt); - - if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) - goto out; - - if (xprt->shutdown) - goto dropit; - - repsize = skb->len - sizeof(struct udphdr); - if (repsize < 4) { - printk("RPC: impossible RPC reply size %d!\n", repsize); - goto dropit; - } - - /* Copy the XID from the skb... */ - xp = skb_header_pointer(skb, sizeof(struct udphdr), - sizeof(_xid), &_xid); - if (xp == NULL) - goto dropit; - - /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->sock_lock); - rovr = xprt_lookup_rqst(xprt, *xp); - if (!rovr) - goto out_unlock; - task = rovr->rq_task; - - dprintk("RPC: %4d received reply\n", task->tk_pid); - - if ((copied = rovr->rq_private_buf.buflen) > repsize) - copied = repsize; - - /* Suck it into the iovec, verify checksum if not done by hw. */ - if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) - goto out_unlock; - - /* Something worked... */ - dst_confirm(skb->dst); - - xprt_complete_rqst(xprt, rovr, copied); - - out_unlock: - spin_unlock(&xprt->sock_lock); - dropit: - skb_free_datagram(sk, skb); - out: - read_unlock(&sk->sk_callback_lock); -} - -/* - * Copy from an skb into memory and shrink the skb. - */ -static inline size_t -tcp_copy_data(skb_reader_t *desc, void *p, size_t len) -{ - if (len > desc->count) - len = desc->count; - if (skb_copy_bits(desc->skb, desc->offset, p, len)) { - dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n", - len, desc->count); - return 0; - } - desc->offset += len; - desc->count -= len; - dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n", - len, desc->count); - return len; -} - -/* - * TCP read fragment marker - */ -static inline void -tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc) -{ - size_t len, used; - char *p; - - p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset; - len = sizeof(xprt->tcp_recm) - xprt->tcp_offset; - used = tcp_copy_data(desc, p, len); - xprt->tcp_offset += used; - if (used != len) - return; - xprt->tcp_reclen = ntohl(xprt->tcp_recm); - if (xprt->tcp_reclen & 0x80000000) - xprt->tcp_flags |= XPRT_LAST_FRAG; - else - xprt->tcp_flags &= ~XPRT_LAST_FRAG; - xprt->tcp_reclen &= 0x7fffffff; - xprt->tcp_flags &= ~XPRT_COPY_RECM; - xprt->tcp_offset = 0; - /* Sanity check of the record length */ - if (xprt->tcp_reclen < 4) { - printk(KERN_ERR "RPC: Invalid TCP record fragment length\n"); - xprt_disconnect(xprt); - } - dprintk("RPC: reading TCP record fragment of length %d\n", - xprt->tcp_reclen); -} - -static void -tcp_check_recm(struct rpc_xprt *xprt) -{ - dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n", - xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags); - if (xprt->tcp_offset == xprt->tcp_reclen) { - xprt->tcp_flags |= XPRT_COPY_RECM; - xprt->tcp_offset = 0; - if (xprt->tcp_flags & XPRT_LAST_FRAG) { - xprt->tcp_flags &= ~XPRT_COPY_DATA; - xprt->tcp_flags |= XPRT_COPY_XID; - xprt->tcp_copied = 0; - } - } -} - -/* - * TCP read xid - */ -static inline void -tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) -{ - size_t len, used; - char *p; - - len = sizeof(xprt->tcp_xid) - xprt->tcp_offset; - dprintk("RPC: reading XID (%Zu bytes)\n", len); - p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset; - used = tcp_copy_data(desc, p, len); - xprt->tcp_offset += used; - if (used != len) - return; - xprt->tcp_flags &= ~XPRT_COPY_XID; - xprt->tcp_flags |= XPRT_COPY_DATA; - xprt->tcp_copied = 4; - dprintk("RPC: reading reply for XID %08x\n", - ntohl(xprt->tcp_xid)); - tcp_check_recm(xprt); -} - -/* - * TCP read and complete request - */ -static inline void -tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) -{ - struct rpc_rqst *req; - struct xdr_buf *rcvbuf; - size_t len; - ssize_t r; - - /* Find and lock the request corresponding to this xid */ - spin_lock(&xprt->sock_lock); - req = xprt_lookup_rqst(xprt, xprt->tcp_xid); - if (!req) { - xprt->tcp_flags &= ~XPRT_COPY_DATA; - dprintk("RPC: XID %08x request not found!\n", - ntohl(xprt->tcp_xid)); - spin_unlock(&xprt->sock_lock); - return; - } - - rcvbuf = &req->rq_private_buf; - len = desc->count; - if (len > xprt->tcp_reclen - xprt->tcp_offset) { - skb_reader_t my_desc; - - len = xprt->tcp_reclen - xprt->tcp_offset; - memcpy(&my_desc, desc, sizeof(my_desc)); - my_desc.count = len; - r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, - &my_desc, tcp_copy_data); - desc->count -= r; - desc->offset += r; - } else - r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, - desc, tcp_copy_data); - - if (r > 0) { - xprt->tcp_copied += r; - xprt->tcp_offset += r; - } - if (r != len) { - /* Error when copying to the receive buffer, - * usually because we weren't able to allocate - * additional buffer pages. All we can do now - * is turn off XPRT_COPY_DATA, so the request - * will not receive any additional updates, - * and time out. - * Any remaining data from this record will - * be discarded. - */ - xprt->tcp_flags &= ~XPRT_COPY_DATA; - dprintk("RPC: XID %08x truncated request\n", - ntohl(xprt->tcp_xid)); - dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", - xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); - goto out; - } - - dprintk("RPC: XID %08x read %Zd bytes\n", - ntohl(xprt->tcp_xid), r); - dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", - xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); - - if (xprt->tcp_copied == req->rq_private_buf.buflen) - xprt->tcp_flags &= ~XPRT_COPY_DATA; - else if (xprt->tcp_offset == xprt->tcp_reclen) { - if (xprt->tcp_flags & XPRT_LAST_FRAG) - xprt->tcp_flags &= ~XPRT_COPY_DATA; - } - -out: - if (!(xprt->tcp_flags & XPRT_COPY_DATA)) { - dprintk("RPC: %4d received reply complete\n", - req->rq_task->tk_pid); - xprt_complete_rqst(xprt, req, xprt->tcp_copied); - } - spin_unlock(&xprt->sock_lock); - tcp_check_recm(xprt); -} - -/* - * TCP discard extra bytes from a short read - */ -static inline void -tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) -{ - size_t len; - - len = xprt->tcp_reclen - xprt->tcp_offset; - if (len > desc->count) - len = desc->count; - desc->count -= len; - desc->offset += len; - xprt->tcp_offset += len; - dprintk("RPC: discarded %Zu bytes\n", len); - tcp_check_recm(xprt); -} - -/* - * TCP record receive routine - * We first have to grab the record marker, then the XID, then the data. - */ -static int -tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, - unsigned int offset, size_t len) -{ - struct rpc_xprt *xprt = rd_desc->arg.data; - skb_reader_t desc = { - .skb = skb, - .offset = offset, - .count = len, - .csum = 0 - }; - - dprintk("RPC: tcp_data_recv\n"); - do { - /* Read in a new fragment marker if necessary */ - /* Can we ever really expect to get completely empty fragments? */ - if (xprt->tcp_flags & XPRT_COPY_RECM) { - tcp_read_fraghdr(xprt, &desc); - continue; - } - /* Read in the xid if necessary */ - if (xprt->tcp_flags & XPRT_COPY_XID) { - tcp_read_xid(xprt, &desc); - continue; - } - /* Read in the request data */ - if (xprt->tcp_flags & XPRT_COPY_DATA) { - tcp_read_request(xprt, &desc); - continue; - } - /* Skip over any trailing bytes on short reads */ - tcp_read_discard(xprt, &desc); - } while (desc.count); - dprintk("RPC: tcp_data_recv done\n"); - return len - desc.count; -} - -static void tcp_data_ready(struct sock *sk, int bytes) -{ - struct rpc_xprt *xprt; - read_descriptor_t rd_desc; - - read_lock(&sk->sk_callback_lock); - dprintk("RPC: tcp_data_ready...\n"); - if (!(xprt = xprt_from_sock(sk))) { - printk("RPC: tcp_data_ready socket info not found!\n"); - goto out; - } - if (xprt->shutdown) - goto out; - - /* We use rd_desc to pass struct xprt to tcp_data_recv */ - rd_desc.arg.data = xprt; - rd_desc.count = 65536; - tcp_read_sock(sk, &rd_desc, tcp_data_recv); -out: - read_unlock(&sk->sk_callback_lock); -} - -static void -tcp_state_change(struct sock *sk) -{ - struct rpc_xprt *xprt; - - read_lock(&sk->sk_callback_lock); - if (!(xprt = xprt_from_sock(sk))) - goto out; - dprintk("RPC: tcp_state_change client %p...\n", xprt); - dprintk("RPC: state %x conn %d dead %d zapped %d\n", - sk->sk_state, xprt_connected(xprt), - sock_flag(sk, SOCK_DEAD), - sock_flag(sk, SOCK_ZAPPED)); - - switch (sk->sk_state) { - case TCP_ESTABLISHED: - spin_lock_bh(&xprt->sock_lock); - if (!xprt_test_and_set_connected(xprt)) { - /* Reset TCP record info */ - xprt->tcp_offset = 0; - xprt->tcp_reclen = 0; - xprt->tcp_copied = 0; - xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; - rpc_wake_up(&xprt->pending); - } - spin_unlock_bh(&xprt->sock_lock); - break; - case TCP_SYN_SENT: - case TCP_SYN_RECV: - break; - default: - xprt_disconnect(xprt); - break; - } - out: - read_unlock(&sk->sk_callback_lock); -} - -/* - * Called when more output buffer space is available for this socket. - * We try not to wake our writers until they can make "significant" - * progress, otherwise we'll waste resources thrashing sock_sendmsg - * with a bunch of small requests. - */ -static void -xprt_write_space(struct sock *sk) -{ - struct rpc_xprt *xprt; - struct socket *sock; - - read_lock(&sk->sk_callback_lock); - if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->sk_socket)) - goto out; - if (xprt->shutdown) - goto out; - - /* Wait until we have enough socket memory */ - if (xprt->stream) { - /* from net/core/stream.c:sk_stream_write_space */ - if (sk_stream_wspace(sk) < sk_stream_min_wspace(sk)) - goto out; - } else { - /* from net/core/sock.c:sock_def_write_space */ - if (!sock_writeable(sk)) - goto out; - } - - if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)) - goto out; - - spin_lock_bh(&xprt->sock_lock); - if (xprt->snd_task) - rpc_wake_up_task(xprt->snd_task); - spin_unlock_bh(&xprt->sock_lock); -out: - read_unlock(&sk->sk_callback_lock); -} - /* * RPC receive timeout handler. */ @@ -1161,19 +539,10 @@ xprt_transmit(struct rpc_task *task) struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - int status, retry = 0; - + int status; dprintk("RPC: %4d xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); - /* set up everything as needed. */ - /* Write the record marker */ - if (xprt->stream) { - u32 *marker = req->rq_svec[0].iov_base; - - *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); - } - smp_rmb(); if (!req->rq_received) { if (list_empty(&req->rq_list)) { @@ -1191,41 +560,9 @@ xprt_transmit(struct rpc_task *task) } else if (!req->rq_bytes_sent) return; - /* Continue transmitting the packet/record. We must be careful - * to cope with writespace callbacks arriving _after_ we have - * called xprt_sendmsg(). - */ - while (1) { - req->rq_xtime = jiffies; - status = xprt_sendmsg(xprt, req); - - if (status < 0) - break; - - if (xprt->stream) { - req->rq_bytes_sent += status; - - /* If we've sent the entire packet, immediately - * reset the count of bytes sent. */ - if (req->rq_bytes_sent >= req->rq_slen) { - req->rq_bytes_sent = 0; - goto out_receive; - } - } else { - if (status >= req->rq_slen) - goto out_receive; - status = -EAGAIN; - break; - } - - dprintk("RPC: %4d xmit incomplete (%d left of %d)\n", - task->tk_pid, req->rq_slen - req->rq_bytes_sent, - req->rq_slen); - - status = -EAGAIN; - if (retry++ > 50) - break; - } + status = xprt->ops->send_request(task); + if (!status) + goto out_receive; /* Note: at this point, task->tk_sleeping has not yet been set, * hence there is no danger of the waking up task being put on @@ -1234,26 +571,10 @@ xprt_transmit(struct rpc_task *task) task->tk_status = status; switch (status) { - case -EAGAIN: - if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { - /* Protect against races with xprt_write_space */ - spin_lock_bh(&xprt->sock_lock); - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) { - task->tk_timeout = req->rq_timeout; - rpc_sleep_on(&xprt->pending, task, NULL, NULL); - } - spin_unlock_bh(&xprt->sock_lock); - return; - } - /* Keep holding the socket if it is blocked */ - rpc_delay(task, HZ>>4); - return; case -ECONNREFUSED: task->tk_timeout = RPC_REESTABLISH_TIMEOUT; rpc_sleep_on(&xprt->sending, task, NULL, NULL); + case -EAGAIN: case -ENOTCONN: return; default: @@ -1367,7 +688,8 @@ xprt_release(struct rpc_task *task) list_del(&req->rq_list); xprt->last_used = jiffies; if (list_empty(&xprt->recv) && !xprt->shutdown) - mod_timer(&xprt->timer, xprt->last_used + XPRT_IDLE_TIMEOUT); + mod_timer(&xprt->timer, + xprt->last_used + RPC_IDLE_DISCONNECT_TIMEOUT); spin_unlock_bh(&xprt->sock_lock); task->tk_rqstp = NULL; memset(req, 0, sizeof(*req)); /* mark unused */ @@ -1380,18 +702,6 @@ xprt_release(struct rpc_task *task) spin_unlock(&xprt->xprt_lock); } -/* - * Set default timeout parameters - */ -static void -xprt_default_timeout(struct rpc_timeout *to, int proto) -{ - if (proto == IPPROTO_UDP) - xprt_set_timeout(to, 5, 5 * HZ); - else - xprt_set_timeout(to, 2, 60 * HZ); -} - /* * Set constant timeout */ @@ -1405,68 +715,51 @@ xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr) to->to_exponential = 0; } -unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; -unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; - /* * Initialize an RPC client */ static struct rpc_xprt * xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) { + int result; struct rpc_xprt *xprt; - unsigned int entries; - size_t slot_table_size; struct rpc_rqst *req; - dprintk("RPC: setting up %s transport...\n", - proto == IPPROTO_UDP? "UDP" : "TCP"); - - entries = (proto == IPPROTO_TCP)? - xprt_tcp_slot_table_entries : xprt_udp_slot_table_entries; - if ((xprt = kmalloc(sizeof(struct rpc_xprt), GFP_KERNEL)) == NULL) return ERR_PTR(-ENOMEM); memset(xprt, 0, sizeof(*xprt)); /* Nnnngh! */ - xprt->max_reqs = entries; - slot_table_size = entries * sizeof(xprt->slot[0]); - xprt->slot = kmalloc(slot_table_size, GFP_KERNEL); - if (xprt->slot == NULL) { - kfree(xprt); - return ERR_PTR(-ENOMEM); - } - memset(xprt->slot, 0, slot_table_size); xprt->addr = *ap; - xprt->prot = proto; - xprt->stream = (proto == IPPROTO_TCP)? 1 : 0; - if (xprt->stream) { - xprt->cwnd = RPC_MAXCWND(xprt); - xprt->nocong = 1; - xprt->max_payload = (1U << 31) - 1; - } else { - xprt->cwnd = RPC_INITCWND; - xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); + + switch (proto) { + case IPPROTO_UDP: + result = xs_setup_udp(xprt, to); + break; + case IPPROTO_TCP: + result = xs_setup_tcp(xprt, to); + break; + default: + printk(KERN_ERR "RPC: unrecognized transport protocol: %d\n", + proto); + result = -EIO; + break; } + if (result) { + kfree(xprt); + return ERR_PTR(result); + } + spin_lock_init(&xprt->sock_lock); spin_lock_init(&xprt->xprt_lock); init_waitqueue_head(&xprt->cong_wait); INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); - INIT_WORK(&xprt->sock_connect, xprt_socket_connect, xprt); INIT_WORK(&xprt->task_cleanup, xprt_socket_autoclose, xprt); init_timer(&xprt->timer); xprt->timer.function = xprt_init_autodisconnect; xprt->timer.data = (unsigned long) xprt; xprt->last_used = jiffies; - xprt->port = XPRT_MAX_RESVPORT; - - /* Set timeout parameters */ - if (to) { - xprt->timeout = *to; - } else - xprt_default_timeout(&xprt->timeout, xprt->prot); rpc_init_wait_queue(&xprt->pending, "xprt_pending"); rpc_init_wait_queue(&xprt->sending, "xprt_sending"); @@ -1474,134 +767,17 @@ xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog"); /* initialize free list */ - for (req = &xprt->slot[entries-1]; req >= &xprt->slot[0]; req--) + for (req = &xprt->slot[xprt->max_reqs-1]; req >= &xprt->slot[0]; req--) list_add(&req->rq_list, &xprt->free); xprt_init_xid(xprt); - /* Check whether we want to use a reserved port */ - xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; - dprintk("RPC: created transport %p with %u slots\n", xprt, xprt->max_reqs); return xprt; } -/* - * Bind to a reserved port - */ -static inline int xprt_bindresvport(struct rpc_xprt *xprt, struct socket *sock) -{ - struct sockaddr_in myaddr = { - .sin_family = AF_INET, - }; - int err, port; - - /* Were we already bound to a given port? Try to reuse it */ - port = xprt->port; - do { - myaddr.sin_port = htons(port); - err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, - sizeof(myaddr)); - if (err == 0) { - xprt->port = port; - return 0; - } - if (--port == 0) - port = XPRT_MAX_RESVPORT; - } while (err == -EADDRINUSE && port != xprt->port); - - printk("RPC: Can't bind to reserved port (%d).\n", -err); - return err; -} - -static void -xprt_bind_socket(struct rpc_xprt *xprt, struct socket *sock) -{ - struct sock *sk = sock->sk; - - if (xprt->inet) - return; - - write_lock_bh(&sk->sk_callback_lock); - sk->sk_user_data = xprt; - xprt->old_data_ready = sk->sk_data_ready; - xprt->old_state_change = sk->sk_state_change; - xprt->old_write_space = sk->sk_write_space; - if (xprt->prot == IPPROTO_UDP) { - sk->sk_data_ready = udp_data_ready; - sk->sk_no_check = UDP_CSUM_NORCV; - xprt_set_connected(xprt); - } else { - tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */ - sk->sk_data_ready = tcp_data_ready; - sk->sk_state_change = tcp_state_change; - xprt_clear_connected(xprt); - } - sk->sk_write_space = xprt_write_space; - - /* Reset to new socket */ - xprt->sock = sock; - xprt->inet = sk; - write_unlock_bh(&sk->sk_callback_lock); - - return; -} - -/* - * Set socket buffer length - */ -void -xprt_sock_setbufsize(struct rpc_xprt *xprt) -{ - struct sock *sk = xprt->inet; - - if (xprt->stream) - return; - if (xprt->rcvsize) { - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2; - } - if (xprt->sndsize) { - sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = xprt->sndsize * xprt->max_reqs * 2; - sk->sk_write_space(sk); - } -} - -/* - * Datastream sockets are created here, but xprt_connect will create - * and connect stream sockets. - */ -static struct socket * xprt_create_socket(struct rpc_xprt *xprt, int proto, int resvport) -{ - struct socket *sock; - int type, err; - - dprintk("RPC: xprt_create_socket(%s %d)\n", - (proto == IPPROTO_UDP)? "udp" : "tcp", proto); - - type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; - - if ((err = sock_create_kern(PF_INET, type, proto, &sock)) < 0) { - printk("RPC: can't create socket (%d).\n", -err); - return NULL; - } - - /* If the caller has the capability, bind to a reserved port */ - if (resvport && xprt_bindresvport(xprt, sock) < 0) { - printk("RPC: can't bind to reserved port.\n"); - goto failed; - } - - return sock; - -failed: - sock_release(sock); - return NULL; -} - /* * Create an RPC client transport given the protocol and peer address. */ @@ -1631,10 +807,6 @@ xprt_shutdown(struct rpc_xprt *xprt) rpc_wake_up(&xprt->backlog); wake_up(&xprt->cong_wait); del_timer_sync(&xprt->timer); - - /* synchronously wait for connect worker to finish */ - cancel_delayed_work(&xprt->sock_connect); - flush_scheduled_work(); } /* @@ -1655,9 +827,7 @@ xprt_destroy(struct rpc_xprt *xprt) { dprintk("RPC: destroying transport %p\n", xprt); xprt_shutdown(xprt); - xprt_disconnect(xprt); - xprt_close(xprt); - kfree(xprt->slot); + xprt->ops->destroy(xprt); kfree(xprt); return 0; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c new file mode 100644 index 000000000000..fa1180ac4823 --- /dev/null +++ b/net/sunrpc/xprtsock.c @@ -0,0 +1,1021 @@ +/* + * linux/net/sunrpc/xprtsock.c + * + * Client-side transport implementation for sockets. + * + * TCP callback races fixes (C) 1998 Red Hat Software + * TCP send fixes (C) 1998 Red Hat Software + * TCP NFS related read + write fixes + * (C) 1999 Dave Airlie, University of Limerick, Ireland + * + * Rewrite of larges part of the code in order to stabilize TCP stuff. + * Fix behaviour when socket buffer is full. + * (C) 1999 Trond Myklebust + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# undef RPC_DEBUG_DATA +# define RPCDBG_FACILITY RPCDBG_XPRT +#endif + +#define XPRT_MAX_RESVPORT (800) + +#ifdef RPC_DEBUG_DATA +/* + * Print the buffer contents (first 128 bytes only--just enough for + * diropres return). + */ +static void +xprt_pktdump(char *msg, u32 *packet, unsigned int count) +{ + u8 *buf = (u8 *) packet; + int j; + + dprintk("RPC: %s\n", msg); + for (j = 0; j < count && j < 128; j += 4) { + if (!(j & 31)) { + if (j) + dprintk("\n"); + dprintk("0x%04x ", j); + } + dprintk("%02x%02x%02x%02x ", + buf[j], buf[j+1], buf[j+2], buf[j+3]); + } + dprintk("\n"); +} +#else +static inline void +xprt_pktdump(char *msg, u32 *packet, unsigned int count) +{ + /* NOP */ +} +#endif + +/* + * Look up RPC transport given an INET socket + */ +static inline struct rpc_xprt * +xprt_from_sock(struct sock *sk) +{ + return (struct rpc_xprt *) sk->sk_user_data; +} + +static int +xdr_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, + struct xdr_buf *xdr, unsigned int base, int msgflags) +{ + struct page **ppage = xdr->pages; + unsigned int len, pglen = xdr->page_len; + int err, ret = 0; + ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int); + + len = xdr->head[0].iov_len; + if (base < len || (addr != NULL && base == 0)) { + struct kvec iov = { + .iov_base = xdr->head[0].iov_base + base, + .iov_len = len - base, + }; + struct msghdr msg = { + .msg_name = addr, + .msg_namelen = addrlen, + .msg_flags = msgflags, + }; + if (xdr->len > len) + msg.msg_flags |= MSG_MORE; + + if (iov.iov_len != 0) + err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); + else + err = kernel_sendmsg(sock, &msg, NULL, 0, 0); + if (ret == 0) + ret = err; + else if (err > 0) + ret += err; + if (err != iov.iov_len) + goto out; + base = 0; + } else + base -= len; + + if (pglen == 0) + goto copy_tail; + if (base >= pglen) { + base -= pglen; + goto copy_tail; + } + if (base || xdr->page_base) { + pglen -= base; + base += xdr->page_base; + ppage += base >> PAGE_CACHE_SHIFT; + base &= ~PAGE_CACHE_MASK; + } + + sendpage = sock->ops->sendpage ? : sock_no_sendpage; + do { + int flags = msgflags; + + len = PAGE_CACHE_SIZE; + if (base) + len -= base; + if (pglen < len) + len = pglen; + + if (pglen != len || xdr->tail[0].iov_len != 0) + flags |= MSG_MORE; + + /* Hmm... We might be dealing with highmem pages */ + if (PageHighMem(*ppage)) + sendpage = sock_no_sendpage; + err = sendpage(sock, *ppage, base, len, flags); + if (ret == 0) + ret = err; + else if (err > 0) + ret += err; + if (err != len) + goto out; + base = 0; + ppage++; + } while ((pglen -= len) != 0); +copy_tail: + len = xdr->tail[0].iov_len; + if (base < len) { + struct kvec iov = { + .iov_base = xdr->tail[0].iov_base + base, + .iov_len = len - base, + }; + struct msghdr msg = { + .msg_flags = msgflags, + }; + err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); + if (ret == 0) + ret = err; + else if (err > 0) + ret += err; + } +out: + return ret; +} + +/* + * Write data to socket. + */ +static inline int +xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + struct socket *sock = xprt->sock; + struct xdr_buf *xdr = &req->rq_snd_buf; + struct sockaddr *addr = NULL; + int addrlen = 0; + unsigned int skip; + int result; + + if (!sock) + return -ENOTCONN; + + xprt_pktdump("packet data:", + req->rq_svec->iov_base, + req->rq_svec->iov_len); + + /* For UDP, we need to provide an address */ + if (!xprt->stream) { + addr = (struct sockaddr *) &xprt->addr; + addrlen = sizeof(xprt->addr); + } + /* Dont repeat bytes */ + skip = req->rq_bytes_sent; + + clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + result = xdr_sendpages(sock, addr, addrlen, xdr, skip, MSG_DONTWAIT); + + dprintk("RPC: xprt_sendmsg(%d) = %d\n", xdr->len - skip, result); + + if (result >= 0) + return result; + + switch (result) { + case -ECONNREFUSED: + /* When the server has died, an ICMP port unreachable message + * prompts ECONNREFUSED. + */ + case -EAGAIN: + break; + case -ECONNRESET: + case -ENOTCONN: + case -EPIPE: + /* connection broken */ + if (xprt->stream) + result = -ENOTCONN; + break; + default: + printk(KERN_NOTICE "RPC: sendmsg returned error %d\n", -result); + } + return result; +} + +static int +xprt_send_request(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + int status, retry = 0; + + /* set up everything as needed. */ + /* Write the record marker */ + if (xprt->stream) { + u32 *marker = req->rq_svec[0].iov_base; + + *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); + } + + /* Continue transmitting the packet/record. We must be careful + * to cope with writespace callbacks arriving _after_ we have + * called xprt_sendmsg(). + */ + while (1) { + req->rq_xtime = jiffies; + status = xprt_sendmsg(xprt, req); + + if (status < 0) + break; + + if (xprt->stream) { + req->rq_bytes_sent += status; + + /* If we've sent the entire packet, immediately + * reset the count of bytes sent. */ + if (req->rq_bytes_sent >= req->rq_slen) { + req->rq_bytes_sent = 0; + return 0; + } + } else { + if (status >= req->rq_slen) + return 0; + status = -EAGAIN; + break; + } + + dprintk("RPC: %4d xmit incomplete (%d left of %d)\n", + task->tk_pid, req->rq_slen - req->rq_bytes_sent, + req->rq_slen); + + status = -EAGAIN; + if (retry++ > 50) + break; + } + + if (status == -EAGAIN) { + if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { + /* Protect against races with xprt_write_space */ + spin_lock_bh(&xprt->sock_lock); + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) { + task->tk_timeout = req->rq_timeout; + rpc_sleep_on(&xprt->pending, task, NULL, NULL); + } + spin_unlock_bh(&xprt->sock_lock); + return status; + } + /* Keep holding the socket if it is blocked */ + rpc_delay(task, HZ>>4); + } + return status; +} + +/* + * Close down a transport socket + */ +static void +xprt_close(struct rpc_xprt *xprt) +{ + struct socket *sock = xprt->sock; + struct sock *sk = xprt->inet; + + if (!sk) + return; + + write_lock_bh(&sk->sk_callback_lock); + xprt->inet = NULL; + xprt->sock = NULL; + + sk->sk_user_data = NULL; + sk->sk_data_ready = xprt->old_data_ready; + sk->sk_state_change = xprt->old_state_change; + sk->sk_write_space = xprt->old_write_space; + write_unlock_bh(&sk->sk_callback_lock); + + sk->sk_no_check = 0; + + sock_release(sock); +} + +static void xprt_socket_destroy(struct rpc_xprt *xprt) +{ + cancel_delayed_work(&xprt->sock_connect); + flush_scheduled_work(); + + xprt_disconnect(xprt); + xprt_close(xprt); + kfree(xprt->slot); +} + +/* + * Input handler for RPC replies. Called from a bottom half and hence + * atomic. + */ +static void +udp_data_ready(struct sock *sk, int len) +{ + struct rpc_task *task; + struct rpc_xprt *xprt; + struct rpc_rqst *rovr; + struct sk_buff *skb; + int err, repsize, copied; + u32 _xid, *xp; + + read_lock(&sk->sk_callback_lock); + dprintk("RPC: udp_data_ready...\n"); + if (!(xprt = xprt_from_sock(sk))) { + printk("RPC: udp_data_ready request not found!\n"); + goto out; + } + + dprintk("RPC: udp_data_ready client %p\n", xprt); + + if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) + goto out; + + if (xprt->shutdown) + goto dropit; + + repsize = skb->len - sizeof(struct udphdr); + if (repsize < 4) { + printk("RPC: impossible RPC reply size %d!\n", repsize); + goto dropit; + } + + /* Copy the XID from the skb... */ + xp = skb_header_pointer(skb, sizeof(struct udphdr), + sizeof(_xid), &_xid); + if (xp == NULL) + goto dropit; + + /* Look up and lock the request corresponding to the given XID */ + spin_lock(&xprt->sock_lock); + rovr = xprt_lookup_rqst(xprt, *xp); + if (!rovr) + goto out_unlock; + task = rovr->rq_task; + + dprintk("RPC: %4d received reply\n", task->tk_pid); + + if ((copied = rovr->rq_private_buf.buflen) > repsize) + copied = repsize; + + /* Suck it into the iovec, verify checksum if not done by hw. */ + if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) + goto out_unlock; + + /* Something worked... */ + dst_confirm(skb->dst); + + xprt_complete_rqst(xprt, rovr, copied); + + out_unlock: + spin_unlock(&xprt->sock_lock); + dropit: + skb_free_datagram(sk, skb); + out: + read_unlock(&sk->sk_callback_lock); +} + +/* + * Copy from an skb into memory and shrink the skb. + */ +static inline size_t +tcp_copy_data(skb_reader_t *desc, void *p, size_t len) +{ + if (len > desc->count) + len = desc->count; + if (skb_copy_bits(desc->skb, desc->offset, p, len)) { + dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n", + len, desc->count); + return 0; + } + desc->offset += len; + desc->count -= len; + dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n", + len, desc->count); + return len; +} + +/* + * TCP read fragment marker + */ +static inline void +tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + size_t len, used; + char *p; + + p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset; + len = sizeof(xprt->tcp_recm) - xprt->tcp_offset; + used = tcp_copy_data(desc, p, len); + xprt->tcp_offset += used; + if (used != len) + return; + xprt->tcp_reclen = ntohl(xprt->tcp_recm); + if (xprt->tcp_reclen & 0x80000000) + xprt->tcp_flags |= XPRT_LAST_FRAG; + else + xprt->tcp_flags &= ~XPRT_LAST_FRAG; + xprt->tcp_reclen &= 0x7fffffff; + xprt->tcp_flags &= ~XPRT_COPY_RECM; + xprt->tcp_offset = 0; + /* Sanity check of the record length */ + if (xprt->tcp_reclen < 4) { + printk(KERN_ERR "RPC: Invalid TCP record fragment length\n"); + xprt_disconnect(xprt); + } + dprintk("RPC: reading TCP record fragment of length %d\n", + xprt->tcp_reclen); +} + +static void +tcp_check_recm(struct rpc_xprt *xprt) +{ + dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n", + xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags); + if (xprt->tcp_offset == xprt->tcp_reclen) { + xprt->tcp_flags |= XPRT_COPY_RECM; + xprt->tcp_offset = 0; + if (xprt->tcp_flags & XPRT_LAST_FRAG) { + xprt->tcp_flags &= ~XPRT_COPY_DATA; + xprt->tcp_flags |= XPRT_COPY_XID; + xprt->tcp_copied = 0; + } + } +} + +/* + * TCP read xid + */ +static inline void +tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + size_t len, used; + char *p; + + len = sizeof(xprt->tcp_xid) - xprt->tcp_offset; + dprintk("RPC: reading XID (%Zu bytes)\n", len); + p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset; + used = tcp_copy_data(desc, p, len); + xprt->tcp_offset += used; + if (used != len) + return; + xprt->tcp_flags &= ~XPRT_COPY_XID; + xprt->tcp_flags |= XPRT_COPY_DATA; + xprt->tcp_copied = 4; + dprintk("RPC: reading reply for XID %08x\n", + ntohl(xprt->tcp_xid)); + tcp_check_recm(xprt); +} + +/* + * TCP read and complete request + */ +static inline void +tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + struct rpc_rqst *req; + struct xdr_buf *rcvbuf; + size_t len; + ssize_t r; + + /* Find and lock the request corresponding to this xid */ + spin_lock(&xprt->sock_lock); + req = xprt_lookup_rqst(xprt, xprt->tcp_xid); + if (!req) { + xprt->tcp_flags &= ~XPRT_COPY_DATA; + dprintk("RPC: XID %08x request not found!\n", + ntohl(xprt->tcp_xid)); + spin_unlock(&xprt->sock_lock); + return; + } + + rcvbuf = &req->rq_private_buf; + len = desc->count; + if (len > xprt->tcp_reclen - xprt->tcp_offset) { + skb_reader_t my_desc; + + len = xprt->tcp_reclen - xprt->tcp_offset; + memcpy(&my_desc, desc, sizeof(my_desc)); + my_desc.count = len; + r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, + &my_desc, tcp_copy_data); + desc->count -= r; + desc->offset += r; + } else + r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, + desc, tcp_copy_data); + + if (r > 0) { + xprt->tcp_copied += r; + xprt->tcp_offset += r; + } + if (r != len) { + /* Error when copying to the receive buffer, + * usually because we weren't able to allocate + * additional buffer pages. All we can do now + * is turn off XPRT_COPY_DATA, so the request + * will not receive any additional updates, + * and time out. + * Any remaining data from this record will + * be discarded. + */ + xprt->tcp_flags &= ~XPRT_COPY_DATA; + dprintk("RPC: XID %08x truncated request\n", + ntohl(xprt->tcp_xid)); + dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", + xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); + goto out; + } + + dprintk("RPC: XID %08x read %Zd bytes\n", + ntohl(xprt->tcp_xid), r); + dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", + xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); + + if (xprt->tcp_copied == req->rq_private_buf.buflen) + xprt->tcp_flags &= ~XPRT_COPY_DATA; + else if (xprt->tcp_offset == xprt->tcp_reclen) { + if (xprt->tcp_flags & XPRT_LAST_FRAG) + xprt->tcp_flags &= ~XPRT_COPY_DATA; + } + +out: + if (!(xprt->tcp_flags & XPRT_COPY_DATA)) { + dprintk("RPC: %4d received reply complete\n", + req->rq_task->tk_pid); + xprt_complete_rqst(xprt, req, xprt->tcp_copied); + } + spin_unlock(&xprt->sock_lock); + tcp_check_recm(xprt); +} + +/* + * TCP discard extra bytes from a short read + */ +static inline void +tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + size_t len; + + len = xprt->tcp_reclen - xprt->tcp_offset; + if (len > desc->count) + len = desc->count; + desc->count -= len; + desc->offset += len; + xprt->tcp_offset += len; + dprintk("RPC: discarded %Zu bytes\n", len); + tcp_check_recm(xprt); +} + +/* + * TCP record receive routine + * We first have to grab the record marker, then the XID, then the data. + */ +static int +tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct rpc_xprt *xprt = rd_desc->arg.data; + skb_reader_t desc = { + .skb = skb, + .offset = offset, + .count = len, + .csum = 0 + }; + + dprintk("RPC: tcp_data_recv\n"); + do { + /* Read in a new fragment marker if necessary */ + /* Can we ever really expect to get completely empty fragments? */ + if (xprt->tcp_flags & XPRT_COPY_RECM) { + tcp_read_fraghdr(xprt, &desc); + continue; + } + /* Read in the xid if necessary */ + if (xprt->tcp_flags & XPRT_COPY_XID) { + tcp_read_xid(xprt, &desc); + continue; + } + /* Read in the request data */ + if (xprt->tcp_flags & XPRT_COPY_DATA) { + tcp_read_request(xprt, &desc); + continue; + } + /* Skip over any trailing bytes on short reads */ + tcp_read_discard(xprt, &desc); + } while (desc.count); + dprintk("RPC: tcp_data_recv done\n"); + return len - desc.count; +} + +static void tcp_data_ready(struct sock *sk, int bytes) +{ + struct rpc_xprt *xprt; + read_descriptor_t rd_desc; + + read_lock(&sk->sk_callback_lock); + dprintk("RPC: tcp_data_ready...\n"); + if (!(xprt = xprt_from_sock(sk))) { + printk("RPC: tcp_data_ready socket info not found!\n"); + goto out; + } + if (xprt->shutdown) + goto out; + + /* We use rd_desc to pass struct xprt to tcp_data_recv */ + rd_desc.arg.data = xprt; + rd_desc.count = 65536; + tcp_read_sock(sk, &rd_desc, tcp_data_recv); +out: + read_unlock(&sk->sk_callback_lock); +} + +static void +tcp_state_change(struct sock *sk) +{ + struct rpc_xprt *xprt; + + read_lock(&sk->sk_callback_lock); + if (!(xprt = xprt_from_sock(sk))) + goto out; + dprintk("RPC: tcp_state_change client %p...\n", xprt); + dprintk("RPC: state %x conn %d dead %d zapped %d\n", + sk->sk_state, xprt_connected(xprt), + sock_flag(sk, SOCK_DEAD), + sock_flag(sk, SOCK_ZAPPED)); + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + spin_lock_bh(&xprt->sock_lock); + if (!xprt_test_and_set_connected(xprt)) { + /* Reset TCP record info */ + xprt->tcp_offset = 0; + xprt->tcp_reclen = 0; + xprt->tcp_copied = 0; + xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; + rpc_wake_up(&xprt->pending); + } + spin_unlock_bh(&xprt->sock_lock); + break; + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + default: + xprt_disconnect(xprt); + break; + } + out: + read_unlock(&sk->sk_callback_lock); +} + +/* + * Called when more output buffer space is available for this socket. + * We try not to wake our writers until they can make "significant" + * progress, otherwise we'll waste resources thrashing sock_sendmsg + * with a bunch of small requests. + */ +static void +xprt_write_space(struct sock *sk) +{ + struct rpc_xprt *xprt; + struct socket *sock; + + read_lock(&sk->sk_callback_lock); + if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->sk_socket)) + goto out; + if (xprt->shutdown) + goto out; + + /* Wait until we have enough socket memory */ + if (xprt->stream) { + /* from net/core/stream.c:sk_stream_write_space */ + if (sk_stream_wspace(sk) < sk_stream_min_wspace(sk)) + goto out; + } else { + /* from net/core/sock.c:sock_def_write_space */ + if (!sock_writeable(sk)) + goto out; + } + + if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)) + goto out; + + spin_lock_bh(&xprt->sock_lock); + if (xprt->snd_task) + rpc_wake_up_task(xprt->snd_task); + spin_unlock_bh(&xprt->sock_lock); +out: + read_unlock(&sk->sk_callback_lock); +} + +/* + * Set socket buffer length + */ +static void +xprt_sock_setbufsize(struct rpc_xprt *xprt) +{ + struct sock *sk = xprt->inet; + + if (xprt->stream) + return; + if (xprt->rcvsize) { + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2; + } + if (xprt->sndsize) { + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + sk->sk_sndbuf = xprt->sndsize * xprt->max_reqs * 2; + sk->sk_write_space(sk); + } +} + +/* + * Bind to a reserved port + */ +static inline int xprt_bindresvport(struct rpc_xprt *xprt, struct socket *sock) +{ + struct sockaddr_in myaddr = { + .sin_family = AF_INET, + }; + int err, port; + + /* Were we already bound to a given port? Try to reuse it */ + port = xprt->port; + do { + myaddr.sin_port = htons(port); + err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, + sizeof(myaddr)); + if (err == 0) { + xprt->port = port; + return 0; + } + if (--port == 0) + port = XPRT_MAX_RESVPORT; + } while (err == -EADDRINUSE && port != xprt->port); + + printk("RPC: Can't bind to reserved port (%d).\n", -err); + return err; +} + +static void +xprt_bind_socket(struct rpc_xprt *xprt, struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (xprt->inet) + return; + + write_lock_bh(&sk->sk_callback_lock); + sk->sk_user_data = xprt; + xprt->old_data_ready = sk->sk_data_ready; + xprt->old_state_change = sk->sk_state_change; + xprt->old_write_space = sk->sk_write_space; + if (xprt->prot == IPPROTO_UDP) { + sk->sk_data_ready = udp_data_ready; + sk->sk_no_check = UDP_CSUM_NORCV; + xprt_set_connected(xprt); + } else { + tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */ + sk->sk_data_ready = tcp_data_ready; + sk->sk_state_change = tcp_state_change; + xprt_clear_connected(xprt); + } + sk->sk_write_space = xprt_write_space; + + /* Reset to new socket */ + xprt->sock = sock; + xprt->inet = sk; + write_unlock_bh(&sk->sk_callback_lock); + + return; +} + +/* + * Datastream sockets are created here, but xprt_connect will create + * and connect stream sockets. + */ +static struct socket * xprt_create_socket(struct rpc_xprt *xprt, int proto, int resvport) +{ + struct socket *sock; + int type, err; + + dprintk("RPC: xprt_create_socket(%s %d)\n", + (proto == IPPROTO_UDP)? "udp" : "tcp", proto); + + type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; + + if ((err = sock_create_kern(PF_INET, type, proto, &sock)) < 0) { + printk("RPC: can't create socket (%d).\n", -err); + return NULL; + } + + /* If the caller has the capability, bind to a reserved port */ + if (resvport && xprt_bindresvport(xprt, sock) < 0) { + printk("RPC: can't bind to reserved port.\n"); + goto failed; + } + + return sock; + +failed: + sock_release(sock); + return NULL; +} + +static void xprt_socket_connect(void *args) +{ + struct rpc_xprt *xprt = (struct rpc_xprt *)args; + struct socket *sock = xprt->sock; + int status = -EIO; + + if (xprt->shutdown || xprt->addr.sin_port == 0) + goto out; + + /* + * Start by resetting any existing state + */ + xprt_close(xprt); + sock = xprt_create_socket(xprt, xprt->prot, xprt->resvport); + if (sock == NULL) { + /* couldn't create socket or bind to reserved port; + * this is likely a permanent error, so cause an abort */ + goto out; + } + xprt_bind_socket(xprt, sock); + xprt_sock_setbufsize(xprt); + + status = 0; + if (!xprt->stream) + goto out; + + /* + * Tell the socket layer to start connecting... + */ + status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, + sizeof(xprt->addr), O_NONBLOCK); + dprintk("RPC: %p connect status %d connected %d sock state %d\n", + xprt, -status, xprt_connected(xprt), sock->sk->sk_state); + if (status < 0) { + switch (status) { + case -EINPROGRESS: + case -EALREADY: + goto out_clear; + } + } +out: + if (status < 0) + rpc_wake_up_status(&xprt->pending, status); + else + rpc_wake_up(&xprt->pending); +out_clear: + smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTING, &xprt->sockstate); + smp_mb__after_clear_bit(); +} + +static void +xprt_connect_sock(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + if (!test_and_set_bit(XPRT_CONNECTING, &xprt->sockstate)) { + /* Note: if we are here due to a dropped connection + * we delay reconnecting by RPC_REESTABLISH_TIMEOUT/HZ + * seconds + */ + if (xprt->sock != NULL) + schedule_delayed_work(&xprt->sock_connect, + RPC_REESTABLISH_TIMEOUT); + else { + schedule_work(&xprt->sock_connect); + /* flush_scheduled_work can sleep... */ + if (!RPC_IS_ASYNC(task)) + flush_scheduled_work(); + } + } +} + +/* + * Set default timeout parameters + */ +static void +xprt_default_timeout(struct rpc_timeout *to, int proto) +{ + if (proto == IPPROTO_UDP) + xprt_set_timeout(to, 5, 5 * HZ); + else + xprt_set_timeout(to, 2, 60 * HZ); +} + +static struct rpc_xprt_ops xprt_socket_ops = { + .set_buffer_size = xprt_sock_setbufsize, + .connect = xprt_connect_sock, + .send_request = xprt_send_request, + .close = xprt_close, + .destroy = xprt_socket_destroy, +}; + +extern unsigned int xprt_udp_slot_table_entries; +extern unsigned int xprt_tcp_slot_table_entries; + +int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) +{ + size_t slot_table_size; + + dprintk("RPC: setting up udp-ipv4 transport...\n"); + + xprt->max_reqs = xprt_udp_slot_table_entries; + slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]); + xprt->slot = kmalloc(slot_table_size, GFP_KERNEL); + if (xprt->slot == NULL) + return -ENOMEM; + memset(xprt->slot, 0, slot_table_size); + + xprt->prot = IPPROTO_UDP; + xprt->port = XPRT_MAX_RESVPORT; + xprt->stream = 0; + xprt->nocong = 0; + xprt->cwnd = RPC_INITCWND; + xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; + /* XXX: header size can vary due to auth type, IPv6, etc. */ + xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); + + INIT_WORK(&xprt->sock_connect, xprt_socket_connect, xprt); + + xprt->ops = &xprt_socket_ops; + + if (to) + xprt->timeout = *to; + else + xprt_default_timeout(to, xprt->prot); + + return 0; +} + +int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) +{ + size_t slot_table_size; + + dprintk("RPC: setting up tcp-ipv4 transport...\n"); + + xprt->max_reqs = xprt_tcp_slot_table_entries; + slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]); + xprt->slot = kmalloc(slot_table_size, GFP_KERNEL); + if (xprt->slot == NULL) + return -ENOMEM; + memset(xprt->slot, 0, slot_table_size); + + xprt->prot = IPPROTO_TCP; + xprt->port = XPRT_MAX_RESVPORT; + xprt->stream = 1; + xprt->nocong = 1; + xprt->cwnd = RPC_MAXCWND(xprt); + xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; + xprt->max_payload = (1U << 31) - 1; + + INIT_WORK(&xprt->sock_connect, xprt_socket_connect, xprt); + + xprt->ops = &xprt_socket_ops; + + if (to) + xprt->timeout = *to; + else + xprt_default_timeout(to, xprt->prot); + + return 0; +} From 9903cd1c27a1f30e8efea75e125be3b2002f7cb9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:26 -0400 Subject: [PATCH 010/126] [PATCH] RPC: transport switch function naming Introduce block header comments and a function naming convention to the socket transport implementation. Provide a debug setting for transports that is separate from RPCDBG_XPRT. Eliminate xprt_default_timeout(). Provide block comments for exposed interfaces in xprt.c, and eliminate the useless obvious comments. Convert printk's to dprintk's. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:04:04 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/debug.h | 1 + net/sunrpc/xprt.c | 147 ++++++----- net/sunrpc/xprtsock.c | 466 ++++++++++++++++++----------------- 3 files changed, 312 insertions(+), 302 deletions(-) diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index eadb31e3c198..42d299747956 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -32,6 +32,7 @@ #define RPCDBG_AUTH 0x0010 #define RPCDBG_PMAP 0x0020 #define RPCDBG_SCHED 0x0040 +#define RPCDBG_TRANS 0x0080 #define RPCDBG_SVCSOCK 0x0100 #define RPCDBG_SVCDSP 0x0200 #define RPCDBG_MISC 0x0400 diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 4342acf4d1cd..589195e630ef 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -227,9 +227,6 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) xprt->cwnd = cwnd; } -/* - * Reset the major timeout value - */ static void xprt_reset_majortimeo(struct rpc_rqst *req) { struct rpc_timeout *to = &req->rq_xprt->timeout; @@ -244,8 +241,10 @@ static void xprt_reset_majortimeo(struct rpc_rqst *req) req->rq_majortimeo += jiffies; } -/* - * Adjust timeout values etc for next retransmit +/** + * xprt_adjust_timeout - adjust timeout values for next retransmit + * @req: RPC request containing parameters to use for the adjustment + * */ int xprt_adjust_timeout(struct rpc_rqst *req) { @@ -291,8 +290,10 @@ xprt_socket_autoclose(void *args) xprt_release_write(xprt, NULL); } -/* - * Mark a transport as disconnected +/** + * xprt_disconnect - mark a transport as disconnected + * @xprt: transport to flag for disconnect + * */ void xprt_disconnect(struct rpc_xprt *xprt) { @@ -303,9 +304,6 @@ void xprt_disconnect(struct rpc_xprt *xprt) spin_unlock_bh(&xprt->sock_lock); } -/* - * Used to allow disconnection when we've been idle - */ static void xprt_init_autodisconnect(unsigned long data) { @@ -327,8 +325,9 @@ xprt_init_autodisconnect(unsigned long data) spin_unlock(&xprt->sock_lock); } -/* - * Attempt to connect a TCP socket. +/** + * xprt_connect - schedule a transport connect operation + * @task: RPC task that is requesting the connect * */ void xprt_connect(struct rpc_task *task) @@ -361,11 +360,7 @@ void xprt_connect(struct rpc_task *task) return; } -/* - * We arrive here when awoken from waiting on connection establishment. - */ -static void -xprt_connect_status(struct rpc_task *task) +static void xprt_connect_status(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; @@ -404,8 +399,11 @@ xprt_connect_status(struct rpc_task *task) } } -/* - * Look up the RPC request corresponding to a reply, and then lock it. +/** + * xprt_lookup_rqst - find an RPC request corresponding to an XID + * @xprt: transport on which the original request was transmitted + * @xid: RPC XID of incoming reply + * */ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) { @@ -422,9 +420,12 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) return req; } -/* - * Complete reply received. - * The TCP code relies on us to remove the request from xprt->pending. +/** + * xprt_complete_rqst - called when reply processing is complete + * @xprt: controlling transport + * @req: RPC request that just completed + * @copied: actual number of bytes received from the transport + * */ void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) { @@ -498,12 +499,12 @@ xprt_timer(struct rpc_task *task) spin_unlock(&xprt->sock_lock); } -/* - * Place the actual RPC call. - * We have to copy the iovec because sendmsg fiddles with its contents. +/** + * xprt_prepare_transmit - reserve the transport before sending a request + * @task: RPC task about to send a request + * */ -int -xprt_prepare_transmit(struct rpc_task *task) +int xprt_prepare_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; @@ -533,8 +534,13 @@ xprt_prepare_transmit(struct rpc_task *task) return err; } -void -xprt_transmit(struct rpc_task *task) +/** + * xprt_transmit - send an RPC request on a transport + * @task: controlling RPC task + * + * We have to copy the iovec because sendmsg fiddles with its contents. + */ +void xprt_transmit(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; @@ -604,11 +610,7 @@ xprt_transmit(struct rpc_task *task) spin_unlock_bh(&xprt->sock_lock); } -/* - * Reserve an RPC call slot. - */ -static inline void -do_xprt_reserve(struct rpc_task *task) +static inline void do_xprt_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; @@ -628,8 +630,14 @@ do_xprt_reserve(struct rpc_task *task) rpc_sleep_on(&xprt->backlog, task, NULL, NULL); } -void -xprt_reserve(struct rpc_task *task) +/** + * xprt_reserve - allocate an RPC request slot + * @task: RPC task requesting a slot allocation + * + * If no more slots are available, place the task on the transport's + * backlog queue. + */ +void xprt_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; @@ -641,9 +649,6 @@ xprt_reserve(struct rpc_task *task) } } -/* - * Allocate a 'unique' XID - */ static inline u32 xprt_alloc_xid(struct rpc_xprt *xprt) { return xprt->xid++; @@ -654,11 +659,7 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt) get_random_bytes(&xprt->xid, sizeof(xprt->xid)); } -/* - * Initialize RPC request - */ -static void -xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) +static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) { struct rpc_rqst *req = task->tk_rqstp; @@ -670,11 +671,12 @@ xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) req, ntohl(req->rq_xid)); } -/* - * Release an RPC call slot +/** + * xprt_release - release an RPC request slot + * @task: task which is finished with the slot + * */ -void -xprt_release(struct rpc_task *task) +void xprt_release(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req; @@ -702,11 +704,14 @@ xprt_release(struct rpc_task *task) spin_unlock(&xprt->xprt_lock); } -/* - * Set constant timeout +/** + * xprt_set_timeout - set constant RPC timeout + * @to: RPC timeout parameters to set up + * @retr: number of retries + * @incr: amount of increase after each retry + * */ -void -xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr) +void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr) { to->to_initval = to->to_increment = incr; @@ -715,11 +720,7 @@ xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr) to->to_exponential = 0; } -/* - * Initialize an RPC client - */ -static struct rpc_xprt * -xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) +static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) { int result; struct rpc_xprt *xprt; @@ -778,11 +779,14 @@ xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) return xprt; } -/* - * Create an RPC client transport given the protocol and peer address. +/** + * xprt_create_proto - create an RPC client transport + * @proto: requested transport protocol + * @sap: remote peer's address + * @to: timeout parameters for new transport + * */ -struct rpc_xprt * -xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to) +struct rpc_xprt *xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to) { struct rpc_xprt *xprt; @@ -794,11 +798,7 @@ xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to) return xprt; } -/* - * Prepare for transport shutdown. - */ -static void -xprt_shutdown(struct rpc_xprt *xprt) +static void xprt_shutdown(struct rpc_xprt *xprt) { xprt->shutdown = 1; rpc_wake_up(&xprt->sending); @@ -809,21 +809,18 @@ xprt_shutdown(struct rpc_xprt *xprt) del_timer_sync(&xprt->timer); } -/* - * Clear the xprt backlog queue - */ -static int -xprt_clear_backlog(struct rpc_xprt *xprt) { +static int xprt_clear_backlog(struct rpc_xprt *xprt) { rpc_wake_up_next(&xprt->backlog); wake_up(&xprt->cong_wait); return 1; } -/* - * Destroy an RPC transport, killing off all requests. +/** + * xprt_destroy - destroy an RPC transport, killing off all requests. + * @xprt: transport to destroy + * */ -int -xprt_destroy(struct rpc_xprt *xprt) +int xprt_destroy(struct rpc_xprt *xprt) { dprintk("RPC: destroying transport %p\n", xprt); xprt_shutdown(xprt); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index fa1180ac4823..80222de3afa4 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -33,23 +33,21 @@ #include #include +/* + * Maximum port number to use when requesting a reserved port. + */ +#define XS_MAX_RESVPORT (800U) + #ifdef RPC_DEBUG # undef RPC_DEBUG_DATA -# define RPCDBG_FACILITY RPCDBG_XPRT +# define RPCDBG_FACILITY RPCDBG_TRANS #endif -#define XPRT_MAX_RESVPORT (800) - #ifdef RPC_DEBUG_DATA -/* - * Print the buffer contents (first 128 bytes only--just enough for - * diropres return). - */ -static void -xprt_pktdump(char *msg, u32 *packet, unsigned int count) +static void xs_pktdump(char *msg, u32 *packet, unsigned int count) { - u8 *buf = (u8 *) packet; - int j; + u8 *buf = (u8 *) packet; + int j; dprintk("RPC: %s\n", msg); for (j = 0; j < count && j < 128; j += 4) { @@ -64,25 +62,22 @@ xprt_pktdump(char *msg, u32 *packet, unsigned int count) dprintk("\n"); } #else -static inline void -xprt_pktdump(char *msg, u32 *packet, unsigned int count) +static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count) { /* NOP */ } #endif -/* - * Look up RPC transport given an INET socket +/** + * xs_sendpages - write pages directly to a socket + * @sock: socket to send on + * @addr: UDP only -- address of destination + * @addrlen: UDP only -- length of destination address + * @xdr: buffer containing this request + * @base: starting position in the buffer + * */ -static inline struct rpc_xprt * -xprt_from_sock(struct sock *sk) -{ - return (struct rpc_xprt *) sk->sk_user_data; -} - -static int -xdr_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, - struct xdr_buf *xdr, unsigned int base, int msgflags) +static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, int msgflags) { struct page **ppage = xdr->pages; unsigned int len, pglen = xdr->page_len; @@ -125,7 +120,7 @@ xdr_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, } if (base || xdr->page_base) { pglen -= base; - base += xdr->page_base; + base += xdr->page_base; ppage += base >> PAGE_CACHE_SHIFT; base &= ~PAGE_CACHE_MASK; } @@ -176,23 +171,25 @@ xdr_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, return ret; } -/* - * Write data to socket. +/** + * xs_sendmsg - write an RPC request to a socket + * @xprt: generic transport + * @req: the RPC request to write + * */ -static inline int -xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) +static int xs_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) { - struct socket *sock = xprt->sock; - struct xdr_buf *xdr = &req->rq_snd_buf; + struct socket *sock = xprt->sock; + struct xdr_buf *xdr = &req->rq_snd_buf; struct sockaddr *addr = NULL; int addrlen = 0; - unsigned int skip; - int result; + unsigned int skip; + int result; if (!sock) return -ENOTCONN; - xprt_pktdump("packet data:", + xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); @@ -201,13 +198,13 @@ xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) addr = (struct sockaddr *) &xprt->addr; addrlen = sizeof(xprt->addr); } - /* Dont repeat bytes */ + /* Don't repeat bytes */ skip = req->rq_bytes_sent; clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); - result = xdr_sendpages(sock, addr, addrlen, xdr, skip, MSG_DONTWAIT); + result = xs_sendpages(sock, addr, addrlen, xdr, skip, MSG_DONTWAIT); - dprintk("RPC: xprt_sendmsg(%d) = %d\n", xdr->len - skip, result); + dprintk("RPC: xs_sendmsg(%d) = %d\n", xdr->len - skip, result); if (result >= 0) return result; @@ -215,8 +212,7 @@ xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) switch (result) { case -ECONNREFUSED: /* When the server has died, an ICMP port unreachable message - * prompts ECONNREFUSED. - */ + * prompts ECONNREFUSED. */ case -EAGAIN: break; case -ECONNRESET: @@ -227,13 +223,25 @@ xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) result = -ENOTCONN; break; default: - printk(KERN_NOTICE "RPC: sendmsg returned error %d\n", -result); + break; } return result; } -static int -xprt_send_request(struct rpc_task *task) +/** + * xs_send_request - write an RPC request to a socket + * @task: address of RPC task that manages the state of an RPC request + * + * Return values: + * 0: The request has been sent + * EAGAIN: The socket was blocked, please call again later to + * complete the request + * other: Some other error occured, the request was not sent + * + * XXX: In the case of soft timeouts, should we eventually give up + * if the socket is not able to make progress? + */ +static int xs_send_request(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; @@ -242,18 +250,18 @@ xprt_send_request(struct rpc_task *task) /* set up everything as needed. */ /* Write the record marker */ if (xprt->stream) { - u32 *marker = req->rq_svec[0].iov_base; + u32 *marker = req->rq_svec[0].iov_base; *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); } /* Continue transmitting the packet/record. We must be careful * to cope with writespace callbacks arriving _after_ we have - * called xprt_sendmsg(). + * called sendmsg(). */ while (1) { req->rq_xtime = jiffies; - status = xprt_sendmsg(xprt, req); + status = xs_sendmsg(xprt, req); if (status < 0) break; @@ -285,7 +293,7 @@ xprt_send_request(struct rpc_task *task) if (status == -EAGAIN) { if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { - /* Protect against races with xprt_write_space */ + /* Protect against races with xs_write_space */ spin_lock_bh(&xprt->sock_lock); /* Don't race with disconnect */ if (!xprt_connected(xprt)) @@ -303,65 +311,77 @@ xprt_send_request(struct rpc_task *task) return status; } -/* - * Close down a transport socket +/** + * xs_close - close a socket + * @xprt: transport + * */ -static void -xprt_close(struct rpc_xprt *xprt) +static void xs_close(struct rpc_xprt *xprt) { - struct socket *sock = xprt->sock; - struct sock *sk = xprt->inet; + struct socket *sock = xprt->sock; + struct sock *sk = xprt->inet; if (!sk) return; + dprintk("RPC: xs_close xprt %p\n", xprt); + write_lock_bh(&sk->sk_callback_lock); xprt->inet = NULL; xprt->sock = NULL; - sk->sk_user_data = NULL; - sk->sk_data_ready = xprt->old_data_ready; + sk->sk_user_data = NULL; + sk->sk_data_ready = xprt->old_data_ready; sk->sk_state_change = xprt->old_state_change; - sk->sk_write_space = xprt->old_write_space; + sk->sk_write_space = xprt->old_write_space; write_unlock_bh(&sk->sk_callback_lock); - sk->sk_no_check = 0; + sk->sk_no_check = 0; sock_release(sock); } -static void xprt_socket_destroy(struct rpc_xprt *xprt) +/** + * xs_destroy - prepare to shutdown a transport + * @xprt: doomed transport + * + */ +static void xs_destroy(struct rpc_xprt *xprt) { + dprintk("RPC: xs_destroy xprt %p\n", xprt); + cancel_delayed_work(&xprt->sock_connect); flush_scheduled_work(); xprt_disconnect(xprt); - xprt_close(xprt); + xs_close(xprt); kfree(xprt->slot); } -/* - * Input handler for RPC replies. Called from a bottom half and hence - * atomic. - */ -static void -udp_data_ready(struct sock *sk, int len) +static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) { - struct rpc_task *task; - struct rpc_xprt *xprt; + return (struct rpc_xprt *) sk->sk_user_data; +} + +/** + * xs_udp_data_ready - "data ready" callback for UDP sockets + * @sk: socket with data to read + * @len: how much data to read + * + */ +static void xs_udp_data_ready(struct sock *sk, int len) +{ + struct rpc_task *task; + struct rpc_xprt *xprt; struct rpc_rqst *rovr; - struct sk_buff *skb; + struct sk_buff *skb; int err, repsize, copied; u32 _xid, *xp; read_lock(&sk->sk_callback_lock); - dprintk("RPC: udp_data_ready...\n"); - if (!(xprt = xprt_from_sock(sk))) { - printk("RPC: udp_data_ready request not found!\n"); + dprintk("RPC: xs_udp_data_ready...\n"); + if (!(xprt = xprt_from_sock(sk))) goto out; - } - - dprintk("RPC: udp_data_ready client %p\n", xprt); if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) goto out; @@ -371,7 +391,7 @@ udp_data_ready(struct sock *sk, int len) repsize = skb->len - sizeof(struct udphdr); if (repsize < 4) { - printk("RPC: impossible RPC reply size %d!\n", repsize); + dprintk("RPC: impossible RPC reply size %d!\n", repsize); goto dropit; } @@ -410,11 +430,7 @@ udp_data_ready(struct sock *sk, int len) read_unlock(&sk->sk_callback_lock); } -/* - * Copy from an skb into memory and shrink the skb. - */ -static inline size_t -tcp_copy_data(skb_reader_t *desc, void *p, size_t len) +static inline size_t xs_tcp_copy_data(skb_reader_t *desc, void *p, size_t len) { if (len > desc->count) len = desc->count; @@ -430,18 +446,14 @@ tcp_copy_data(skb_reader_t *desc, void *p, size_t len) return len; } -/* - * TCP read fragment marker - */ -static inline void -tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc) +static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc) { size_t len, used; char *p; p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset; len = sizeof(xprt->tcp_recm) - xprt->tcp_offset; - used = tcp_copy_data(desc, p, len); + used = xs_tcp_copy_data(desc, p, len); xprt->tcp_offset += used; if (used != len) return; @@ -455,15 +467,15 @@ tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc) xprt->tcp_offset = 0; /* Sanity check of the record length */ if (xprt->tcp_reclen < 4) { - printk(KERN_ERR "RPC: Invalid TCP record fragment length\n"); + dprintk("RPC: invalid TCP record fragment length\n"); xprt_disconnect(xprt); + return; } dprintk("RPC: reading TCP record fragment of length %d\n", xprt->tcp_reclen); } -static void -tcp_check_recm(struct rpc_xprt *xprt) +static void xs_tcp_check_recm(struct rpc_xprt *xprt) { dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n", xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags); @@ -478,11 +490,7 @@ tcp_check_recm(struct rpc_xprt *xprt) } } -/* - * TCP read xid - */ -static inline void -tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) +static inline void xs_tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) { size_t len, used; char *p; @@ -490,7 +498,7 @@ tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) len = sizeof(xprt->tcp_xid) - xprt->tcp_offset; dprintk("RPC: reading XID (%Zu bytes)\n", len); p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset; - used = tcp_copy_data(desc, p, len); + used = xs_tcp_copy_data(desc, p, len); xprt->tcp_offset += used; if (used != len) return; @@ -499,14 +507,10 @@ tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) xprt->tcp_copied = 4; dprintk("RPC: reading reply for XID %08x\n", ntohl(xprt->tcp_xid)); - tcp_check_recm(xprt); + xs_tcp_check_recm(xprt); } -/* - * TCP read and complete request - */ -static inline void -tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) +static inline void xs_tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) { struct rpc_rqst *req; struct xdr_buf *rcvbuf; @@ -533,12 +537,12 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) memcpy(&my_desc, desc, sizeof(my_desc)); my_desc.count = len; r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, - &my_desc, tcp_copy_data); + &my_desc, xs_tcp_copy_data); desc->count -= r; desc->offset += r; } else r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, - desc, tcp_copy_data); + desc, xs_tcp_copy_data); if (r > 0) { xprt->tcp_copied += r; @@ -581,14 +585,10 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) xprt_complete_rqst(xprt, req, xprt->tcp_copied); } spin_unlock(&xprt->sock_lock); - tcp_check_recm(xprt); + xs_tcp_check_recm(xprt); } -/* - * TCP discard extra bytes from a short read - */ -static inline void -tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) +static inline void xs_tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) { size_t len; @@ -599,16 +599,10 @@ tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) desc->offset += len; xprt->tcp_offset += len; dprintk("RPC: discarded %Zu bytes\n", len); - tcp_check_recm(xprt); + xs_tcp_check_recm(xprt); } -/* - * TCP record receive routine - * We first have to grab the record marker, then the XID, then the data. - */ -static int -tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, - unsigned int offset, size_t len) +static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) { struct rpc_xprt *xprt = rd_desc->arg.data; skb_reader_t desc = { @@ -616,64 +610,72 @@ tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, .offset = offset, .count = len, .csum = 0 - }; + }; - dprintk("RPC: tcp_data_recv\n"); + dprintk("RPC: xs_tcp_data_recv started\n"); do { /* Read in a new fragment marker if necessary */ /* Can we ever really expect to get completely empty fragments? */ if (xprt->tcp_flags & XPRT_COPY_RECM) { - tcp_read_fraghdr(xprt, &desc); + xs_tcp_read_fraghdr(xprt, &desc); continue; } /* Read in the xid if necessary */ if (xprt->tcp_flags & XPRT_COPY_XID) { - tcp_read_xid(xprt, &desc); + xs_tcp_read_xid(xprt, &desc); continue; } /* Read in the request data */ if (xprt->tcp_flags & XPRT_COPY_DATA) { - tcp_read_request(xprt, &desc); + xs_tcp_read_request(xprt, &desc); continue; } /* Skip over any trailing bytes on short reads */ - tcp_read_discard(xprt, &desc); + xs_tcp_read_discard(xprt, &desc); } while (desc.count); - dprintk("RPC: tcp_data_recv done\n"); + dprintk("RPC: xs_tcp_data_recv done\n"); return len - desc.count; } -static void tcp_data_ready(struct sock *sk, int bytes) +/** + * xs_tcp_data_ready - "data ready" callback for TCP sockets + * @sk: socket with data to read + * @bytes: how much data to read + * + */ +static void xs_tcp_data_ready(struct sock *sk, int bytes) { struct rpc_xprt *xprt; read_descriptor_t rd_desc; read_lock(&sk->sk_callback_lock); - dprintk("RPC: tcp_data_ready...\n"); - if (!(xprt = xprt_from_sock(sk))) { - printk("RPC: tcp_data_ready socket info not found!\n"); + dprintk("RPC: xs_tcp_data_ready...\n"); + if (!(xprt = xprt_from_sock(sk))) goto out; - } if (xprt->shutdown) goto out; - /* We use rd_desc to pass struct xprt to tcp_data_recv */ + /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ rd_desc.arg.data = xprt; rd_desc.count = 65536; - tcp_read_sock(sk, &rd_desc, tcp_data_recv); + tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); out: read_unlock(&sk->sk_callback_lock); } -static void -tcp_state_change(struct sock *sk) +/** + * xs_tcp_state_change - callback to handle TCP socket state changes + * @sk: socket whose state has changed + * + */ +static void xs_tcp_state_change(struct sock *sk) { - struct rpc_xprt *xprt; + struct rpc_xprt *xprt; read_lock(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; - dprintk("RPC: tcp_state_change client %p...\n", xprt); + dprintk("RPC: xs_tcp_state_change client %p...\n", xprt); dprintk("RPC: state %x conn %d dead %d zapped %d\n", sk->sk_state, xprt_connected(xprt), sock_flag(sk, SOCK_DEAD), @@ -703,17 +705,20 @@ tcp_state_change(struct sock *sk) read_unlock(&sk->sk_callback_lock); } -/* +/** + * xs_write_space - callback invoked when socket buffer space becomes + * available + * @sk: socket whose state has changed + * * Called when more output buffer space is available for this socket. * We try not to wake our writers until they can make "significant" * progress, otherwise we'll waste resources thrashing sock_sendmsg * with a bunch of small requests. */ -static void -xprt_write_space(struct sock *sk) +static void xs_write_space(struct sock *sk) { - struct rpc_xprt *xprt; - struct socket *sock; + struct rpc_xprt *xprt; + struct socket *sock; read_lock(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->sk_socket)) @@ -743,11 +748,15 @@ xprt_write_space(struct sock *sk) read_unlock(&sk->sk_callback_lock); } -/* - * Set socket buffer length +/** + * xs_set_buffer_size - set send and receive limits + * @xprt: generic transport + * + * Set socket send and receive limits based on the + * sndsize and rcvsize fields in the generic transport + * structure. This applies only to UDP sockets. */ -static void -xprt_sock_setbufsize(struct rpc_xprt *xprt) +static void xs_set_buffer_size(struct rpc_xprt *xprt) { struct sock *sk = xprt->inet; @@ -764,15 +773,12 @@ xprt_sock_setbufsize(struct rpc_xprt *xprt) } } -/* - * Bind to a reserved port - */ -static inline int xprt_bindresvport(struct rpc_xprt *xprt, struct socket *sock) +static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) { struct sockaddr_in myaddr = { .sin_family = AF_INET, }; - int err, port; + int err, port; /* Were we already bound to a given port? Try to reuse it */ port = xprt->port; @@ -782,20 +788,47 @@ static inline int xprt_bindresvport(struct rpc_xprt *xprt, struct socket *sock) sizeof(myaddr)); if (err == 0) { xprt->port = port; + dprintk("RPC: xs_bindresvport bound to port %u\n", + port); return 0; } if (--port == 0) - port = XPRT_MAX_RESVPORT; + port = XS_MAX_RESVPORT; } while (err == -EADDRINUSE && port != xprt->port); - printk("RPC: Can't bind to reserved port (%d).\n", -err); + dprintk("RPC: can't bind to reserved port (%d).\n", -err); return err; } -static void -xprt_bind_socket(struct rpc_xprt *xprt, struct socket *sock) +static struct socket *xs_create(struct rpc_xprt *xprt, int proto, int resvport) { - struct sock *sk = sock->sk; + struct socket *sock; + int type, err; + + dprintk("RPC: xs_create(%s %d)\n", + (proto == IPPROTO_UDP)? "udp" : "tcp", proto); + + type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; + + if ((err = sock_create_kern(PF_INET, type, proto, &sock)) < 0) { + dprintk("RPC: can't create socket (%d).\n", -err); + return NULL; + } + + /* If the caller has the capability, bind to a reserved port */ + if (resvport && xs_bindresvport(xprt, sock) < 0) + goto failed; + + return sock; + +failed: + sock_release(sock); + return NULL; +} + +static void xs_bind(struct rpc_xprt *xprt, struct socket *sock) +{ + struct sock *sk = sock->sk; if (xprt->inet) return; @@ -806,16 +839,16 @@ xprt_bind_socket(struct rpc_xprt *xprt, struct socket *sock) xprt->old_state_change = sk->sk_state_change; xprt->old_write_space = sk->sk_write_space; if (xprt->prot == IPPROTO_UDP) { - sk->sk_data_ready = udp_data_ready; + sk->sk_data_ready = xs_udp_data_ready; sk->sk_no_check = UDP_CSUM_NORCV; xprt_set_connected(xprt); } else { tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */ - sk->sk_data_ready = tcp_data_ready; - sk->sk_state_change = tcp_state_change; + sk->sk_data_ready = xs_tcp_data_ready; + sk->sk_state_change = xs_tcp_state_change; xprt_clear_connected(xprt); } - sk->sk_write_space = xprt_write_space; + sk->sk_write_space = xs_write_space; /* Reset to new socket */ xprt->sock = sock; @@ -825,39 +858,13 @@ xprt_bind_socket(struct rpc_xprt *xprt, struct socket *sock) return; } -/* - * Datastream sockets are created here, but xprt_connect will create - * and connect stream sockets. +/** + * xs_connect_worker - try to connect a socket to a remote endpoint + * @args: RPC transport to connect + * + * Invoked by a work queue tasklet. */ -static struct socket * xprt_create_socket(struct rpc_xprt *xprt, int proto, int resvport) -{ - struct socket *sock; - int type, err; - - dprintk("RPC: xprt_create_socket(%s %d)\n", - (proto == IPPROTO_UDP)? "udp" : "tcp", proto); - - type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; - - if ((err = sock_create_kern(PF_INET, type, proto, &sock)) < 0) { - printk("RPC: can't create socket (%d).\n", -err); - return NULL; - } - - /* If the caller has the capability, bind to a reserved port */ - if (resvport && xprt_bindresvport(xprt, sock) < 0) { - printk("RPC: can't bind to reserved port.\n"); - goto failed; - } - - return sock; - -failed: - sock_release(sock); - return NULL; -} - -static void xprt_socket_connect(void *args) +static void xs_connect_worker(void *args) { struct rpc_xprt *xprt = (struct rpc_xprt *)args; struct socket *sock = xprt->sock; @@ -866,18 +873,20 @@ static void xprt_socket_connect(void *args) if (xprt->shutdown || xprt->addr.sin_port == 0) goto out; + dprintk("RPC: xs_connect_worker xprt %p\n", xprt); + /* * Start by resetting any existing state */ - xprt_close(xprt); - sock = xprt_create_socket(xprt, xprt->prot, xprt->resvport); + xs_close(xprt); + sock = xs_create(xprt, xprt->prot, xprt->resvport); if (sock == NULL) { /* couldn't create socket or bind to reserved port; * this is likely a permanent error, so cause an abort */ goto out; } - xprt_bind_socket(xprt, sock); - xprt_sock_setbufsize(xprt); + xs_bind(xprt, sock); + xs_set_buffer_size(xprt); status = 0; if (!xprt->stream) @@ -908,20 +917,23 @@ static void xprt_socket_connect(void *args) smp_mb__after_clear_bit(); } -static void -xprt_connect_sock(struct rpc_task *task) +/** + * xs_connect - connect a socket to a remote endpoint + * @task: address of RPC task that manages state of connect request + * + * TCP: If the remote end dropped the connection, delay reconnecting. + */ +static void xs_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; if (!test_and_set_bit(XPRT_CONNECTING, &xprt->sockstate)) { - /* Note: if we are here due to a dropped connection - * we delay reconnecting by RPC_REESTABLISH_TIMEOUT/HZ - * seconds - */ - if (xprt->sock != NULL) + if (xprt->sock != NULL) { + dprintk("RPC: xs_connect delayed xprt %p\n", xprt); schedule_delayed_work(&xprt->sock_connect, RPC_REESTABLISH_TIMEOUT); - else { + } else { + dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); schedule_work(&xprt->sock_connect); /* flush_scheduled_work can sleep... */ if (!RPC_IS_ASYNC(task)) @@ -930,29 +942,23 @@ xprt_connect_sock(struct rpc_task *task) } } -/* - * Set default timeout parameters - */ -static void -xprt_default_timeout(struct rpc_timeout *to, int proto) -{ - if (proto == IPPROTO_UDP) - xprt_set_timeout(to, 5, 5 * HZ); - else - xprt_set_timeout(to, 2, 60 * HZ); -} - -static struct rpc_xprt_ops xprt_socket_ops = { - .set_buffer_size = xprt_sock_setbufsize, - .connect = xprt_connect_sock, - .send_request = xprt_send_request, - .close = xprt_close, - .destroy = xprt_socket_destroy, +static struct rpc_xprt_ops xs_ops = { + .set_buffer_size = xs_set_buffer_size, + .connect = xs_connect, + .send_request = xs_send_request, + .close = xs_close, + .destroy = xs_destroy, }; extern unsigned int xprt_udp_slot_table_entries; extern unsigned int xprt_tcp_slot_table_entries; +/** + * xs_setup_udp - Set up transport to use a UDP socket + * @xprt: transport to set up + * @to: timeout parameters + * + */ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) { size_t slot_table_size; @@ -967,7 +973,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) memset(xprt->slot, 0, slot_table_size); xprt->prot = IPPROTO_UDP; - xprt->port = XPRT_MAX_RESVPORT; + xprt->port = XS_MAX_RESVPORT; xprt->stream = 0; xprt->nocong = 0; xprt->cwnd = RPC_INITCWND; @@ -975,18 +981,24 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); - INIT_WORK(&xprt->sock_connect, xprt_socket_connect, xprt); + INIT_WORK(&xprt->sock_connect, xs_connect_worker, xprt); - xprt->ops = &xprt_socket_ops; + xprt->ops = &xs_ops; if (to) xprt->timeout = *to; else - xprt_default_timeout(to, xprt->prot); + xprt_set_timeout(&xprt->timeout, 5, 5 * HZ); return 0; } +/** + * xs_setup_tcp - Set up transport to use a TCP socket + * @xprt: transport to set up + * @to: timeout parameters + * + */ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) { size_t slot_table_size; @@ -1001,21 +1013,21 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) memset(xprt->slot, 0, slot_table_size); xprt->prot = IPPROTO_TCP; - xprt->port = XPRT_MAX_RESVPORT; + xprt->port = XS_MAX_RESVPORT; xprt->stream = 1; xprt->nocong = 1; xprt->cwnd = RPC_MAXCWND(xprt); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; xprt->max_payload = (1U << 31) - 1; - INIT_WORK(&xprt->sock_connect, xprt_socket_connect, xprt); + INIT_WORK(&xprt->sock_connect, xs_connect_worker, xprt); - xprt->ops = &xprt_socket_ops; + xprt->ops = &xs_ops; if (to) xprt->timeout = *to; else - xprt_default_timeout(to, xprt->prot); + xprt_set_timeout(&xprt->timeout, 2, 60 * HZ); return 0; } From b4b5cc85ed4ecbe4adbfbc4df028850de67a9f09 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:29 -0400 Subject: [PATCH 011/126] [PATCH] RPC: Reduce stack utilization in xs_sendpages Reduce stack utilization of the RPC socket transport's send path. A couple of unlikely()s are added to ensure the compiler places the tail processing at the end of the csect. Test-plan: Millions of fsx operations. Performance characterization such as "sio" or "iozone". Version: Thu, 11 Aug 2005 16:04:30 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 73 +++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 80222de3afa4..a5a04203a6b0 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -68,6 +68,41 @@ static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count) } #endif +#define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) + +static inline int xs_send_head(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, unsigned int len) +{ + struct kvec iov = { + .iov_base = xdr->head[0].iov_base + base, + .iov_len = len - base, + }; + struct msghdr msg = { + .msg_name = addr, + .msg_namelen = addrlen, + .msg_flags = XS_SENDMSG_FLAGS, + }; + + if (xdr->len > len) + msg.msg_flags |= MSG_MORE; + + if (likely(iov.iov_len)) + return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); + return kernel_sendmsg(sock, &msg, NULL, 0, 0); +} + +static int xs_send_tail(struct socket *sock, struct xdr_buf *xdr, unsigned int base, unsigned int len) +{ + struct kvec iov = { + .iov_base = xdr->tail[0].iov_base + base, + .iov_len = len - base, + }; + struct msghdr msg = { + .msg_flags = XS_SENDMSG_FLAGS, + }; + + return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); +} + /** * xs_sendpages - write pages directly to a socket * @sock: socket to send on @@ -77,7 +112,7 @@ static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count) * @base: starting position in the buffer * */ -static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, int msgflags) +static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base) { struct page **ppage = xdr->pages; unsigned int len, pglen = xdr->page_len; @@ -86,35 +121,20 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, len = xdr->head[0].iov_len; if (base < len || (addr != NULL && base == 0)) { - struct kvec iov = { - .iov_base = xdr->head[0].iov_base + base, - .iov_len = len - base, - }; - struct msghdr msg = { - .msg_name = addr, - .msg_namelen = addrlen, - .msg_flags = msgflags, - }; - if (xdr->len > len) - msg.msg_flags |= MSG_MORE; - - if (iov.iov_len != 0) - err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); - else - err = kernel_sendmsg(sock, &msg, NULL, 0, 0); + err = xs_send_head(sock, addr, addrlen, xdr, base, len); if (ret == 0) ret = err; else if (err > 0) ret += err; - if (err != iov.iov_len) + if (err != (len - base)) goto out; base = 0; } else base -= len; - if (pglen == 0) + if (unlikely(pglen == 0)) goto copy_tail; - if (base >= pglen) { + if (unlikely(base >= pglen)) { base -= pglen; goto copy_tail; } @@ -127,7 +147,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, sendpage = sock->ops->sendpage ? : sock_no_sendpage; do { - int flags = msgflags; + int flags = XS_SENDMSG_FLAGS; len = PAGE_CACHE_SIZE; if (base) @@ -154,14 +174,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, copy_tail: len = xdr->tail[0].iov_len; if (base < len) { - struct kvec iov = { - .iov_base = xdr->tail[0].iov_base + base, - .iov_len = len - base, - }; - struct msghdr msg = { - .msg_flags = msgflags, - }; - err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); + err = xs_send_tail(sock, xdr, base, len); if (ret == 0) ret = err; else if (err > 0) @@ -202,7 +215,7 @@ static int xs_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) skip = req->rq_bytes_sent; clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); - result = xs_sendpages(sock, addr, addrlen, xdr, skip, MSG_DONTWAIT); + result = xs_sendpages(sock, addr, addrlen, xdr, skip); dprintk("RPC: xs_sendmsg(%d) = %d\n", xdr->len - skip, result); From 4a0f8c04f2ece949d54a0c4fd7490259cf23a58a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:32 -0400 Subject: [PATCH 012/126] [PATCH] RPC: Rename sock_lock Clean-up: replace a name reference to sockets in the generic parts of the RPC client by renaming sock_lock in the rpc_xprt structure. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:05:00 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 +- net/sunrpc/xprt.c | 44 ++++++++++++++++++------------------- net/sunrpc/xprtsock.c | 22 +++++++++---------- 3 files changed, 34 insertions(+), 34 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index d82b47ab73cb..c4f903f0e17c 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -198,7 +198,7 @@ struct rpc_xprt { /* * Send stuff */ - spinlock_t sock_lock; /* lock socket info */ + spinlock_t transport_lock; /* lock transport info */ spinlock_t xprt_lock; /* lock xprt info */ struct rpc_task * snd_task; /* Task blocked in send */ diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 589195e630ef..1f0da8c1a3b0 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -106,9 +106,9 @@ xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { int retval; - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); retval = __xprt_lock_write(xprt, task); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); return retval; } @@ -161,9 +161,9 @@ __xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); __xprt_release_write(xprt, task); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); } /* @@ -266,9 +266,9 @@ int xprt_adjust_timeout(struct rpc_rqst *req) req->rq_retries = 0; xprt_reset_majortimeo(req); /* Reset the RTT counters == "slow start" */ - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); pprintk("RPC: %lu timeout\n", jiffies); status = -ETIMEDOUT; } @@ -298,10 +298,10 @@ xprt_socket_autoclose(void *args) void xprt_disconnect(struct rpc_xprt *xprt) { dprintk("RPC: disconnected transport %p\n", xprt); - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); rpc_wake_up_status(&xprt->pending, -ENOTCONN); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); } static void @@ -309,12 +309,12 @@ xprt_init_autodisconnect(unsigned long data) { struct rpc_xprt *xprt = (struct rpc_xprt *)data; - spin_lock(&xprt->sock_lock); + spin_lock(&xprt->transport_lock); if (!list_empty(&xprt->recv) || xprt->shutdown) goto out_abort; if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) goto out_abort; - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); /* Let keventd close the socket */ if (test_bit(XPRT_CONNECTING, &xprt->sockstate) != 0) xprt_release_write(xprt, NULL); @@ -322,7 +322,7 @@ xprt_init_autodisconnect(unsigned long data) schedule_work(&xprt->task_cleanup); return; out_abort: - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); } /** @@ -482,7 +482,7 @@ xprt_timer(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - spin_lock(&xprt->sock_lock); + spin_lock(&xprt->transport_lock); if (req->rq_received) goto out; @@ -496,7 +496,7 @@ xprt_timer(struct rpc_task *task) out: task->tk_timeout = 0; rpc_wake_up_task(task); - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); } /** @@ -515,7 +515,7 @@ int xprt_prepare_transmit(struct rpc_task *task) if (xprt->shutdown) return -EIO; - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); if (req->rq_received && !req->rq_bytes_sent) { err = req->rq_received; goto out_unlock; @@ -530,7 +530,7 @@ int xprt_prepare_transmit(struct rpc_task *task) goto out_unlock; } out_unlock: - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); return err; } @@ -552,13 +552,13 @@ void xprt_transmit(struct rpc_task *task) smp_rmb(); if (!req->rq_received) { if (list_empty(&req->rq_list)) { - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); /* Update the softirq receive buffer */ memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(req->rq_private_buf)); /* Add request to the receive list */ list_add_tail(&req->rq_list, &xprt->recv); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); xprt_reset_majortimeo(req); /* Turn off autodisconnect */ del_singleshot_timer_sync(&xprt->timer); @@ -592,7 +592,7 @@ void xprt_transmit(struct rpc_task *task) out_receive: dprintk("RPC: %4d xmit complete\n", task->tk_pid); /* Set the task's receive timeout value */ - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); if (!xprt->nocong) { int timer = task->tk_msg.rpc_proc->p_timer; task->tk_timeout = rpc_calc_rto(clnt->cl_rtt, timer); @@ -607,7 +607,7 @@ void xprt_transmit(struct rpc_task *task) else if (!req->rq_received) rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); __xprt_release_write(xprt, task); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); } static inline void do_xprt_reserve(struct rpc_task *task) @@ -683,7 +683,7 @@ void xprt_release(struct rpc_task *task) if (!(req = task->tk_rqstp)) return; - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); __xprt_release_write(xprt, task); __xprt_put_cong(xprt, req); if (!list_empty(&req->rq_list)) @@ -692,7 +692,7 @@ void xprt_release(struct rpc_task *task) if (list_empty(&xprt->recv) && !xprt->shutdown) mod_timer(&xprt->timer, xprt->last_used + RPC_IDLE_DISCONNECT_TIMEOUT); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); task->tk_rqstp = NULL; memset(req, 0, sizeof(*req)); /* mark unused */ @@ -750,7 +750,7 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc return ERR_PTR(result); } - spin_lock_init(&xprt->sock_lock); + spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->xprt_lock); init_waitqueue_head(&xprt->cong_wait); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index a5a04203a6b0..bc90caab6088 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -307,7 +307,7 @@ static int xs_send_request(struct rpc_task *task) if (status == -EAGAIN) { if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { /* Protect against races with xs_write_space */ - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); /* Don't race with disconnect */ if (!xprt_connected(xprt)) task->tk_status = -ENOTCONN; @@ -315,7 +315,7 @@ static int xs_send_request(struct rpc_task *task) task->tk_timeout = req->rq_timeout; rpc_sleep_on(&xprt->pending, task, NULL, NULL); } - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); return status; } /* Keep holding the socket if it is blocked */ @@ -415,7 +415,7 @@ static void xs_udp_data_ready(struct sock *sk, int len) goto dropit; /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->sock_lock); + spin_lock(&xprt->transport_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; @@ -436,7 +436,7 @@ static void xs_udp_data_ready(struct sock *sk, int len) xprt_complete_rqst(xprt, rovr, copied); out_unlock: - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); dropit: skb_free_datagram(sk, skb); out: @@ -531,13 +531,13 @@ static inline void xs_tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc ssize_t r; /* Find and lock the request corresponding to this xid */ - spin_lock(&xprt->sock_lock); + spin_lock(&xprt->transport_lock); req = xprt_lookup_rqst(xprt, xprt->tcp_xid); if (!req) { xprt->tcp_flags &= ~XPRT_COPY_DATA; dprintk("RPC: XID %08x request not found!\n", ntohl(xprt->tcp_xid)); - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); return; } @@ -597,7 +597,7 @@ static inline void xs_tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc req->rq_task->tk_pid); xprt_complete_rqst(xprt, req, xprt->tcp_copied); } - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); xs_tcp_check_recm(xprt); } @@ -696,7 +696,7 @@ static void xs_tcp_state_change(struct sock *sk) switch (sk->sk_state) { case TCP_ESTABLISHED: - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); if (!xprt_test_and_set_connected(xprt)) { /* Reset TCP record info */ xprt->tcp_offset = 0; @@ -705,7 +705,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; rpc_wake_up(&xprt->pending); } - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); break; case TCP_SYN_SENT: case TCP_SYN_RECV: @@ -753,10 +753,10 @@ static void xs_write_space(struct sock *sk) if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)) goto out; - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); if (xprt->snd_task) rpc_wake_up_task(xprt->snd_task); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); out: read_unlock(&sk->sk_callback_lock); } From 5dc07727f86b25851e95193a0c484ea21b531c47 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:35 -0400 Subject: [PATCH 013/126] [PATCH] RPC: Rename xprt_lock Clean-up: Replace the xprt_lock with something more aptly named. This lock single-threads the XID and request slot reservation process. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:05:26 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 +- net/sunrpc/xprt.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index c4f903f0e17c..41ce296dded1 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -199,7 +199,7 @@ struct rpc_xprt { * Send stuff */ spinlock_t transport_lock; /* lock transport info */ - spinlock_t xprt_lock; /* lock xprt info */ + spinlock_t reserve_lock; /* lock slot table */ struct rpc_task * snd_task; /* Task blocked in send */ struct list_head recv; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1f0da8c1a3b0..9c45c522e3ef 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -643,9 +643,9 @@ void xprt_reserve(struct rpc_task *task) task->tk_status = -EIO; if (!xprt->shutdown) { - spin_lock(&xprt->xprt_lock); + spin_lock(&xprt->reserve_lock); do_xprt_reserve(task); - spin_unlock(&xprt->xprt_lock); + spin_unlock(&xprt->reserve_lock); } } @@ -698,10 +698,10 @@ void xprt_release(struct rpc_task *task) dprintk("RPC: %4d release request %p\n", task->tk_pid, req); - spin_lock(&xprt->xprt_lock); + spin_lock(&xprt->reserve_lock); list_add(&req->rq_list, &xprt->free); xprt_clear_backlog(xprt); - spin_unlock(&xprt->xprt_lock); + spin_unlock(&xprt->reserve_lock); } /** @@ -751,7 +751,7 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc } spin_lock_init(&xprt->transport_lock); - spin_lock_init(&xprt->xprt_lock); + spin_lock_init(&xprt->reserve_lock); init_waitqueue_head(&xprt->cong_wait); INIT_LIST_HEAD(&xprt->free); From 2226feb6bcd0e5e117a9be3ea3dd3ffc14f3e41e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:38 -0400 Subject: [PATCH 014/126] [PATCH] RPC: rename the sockstate field Clean-up: get rid of a name reference to sockets in the generic parts of the RPC client by renaming the sockstate field in the rpc_xprt structure. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:05:53 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 58 ++++++++++++++++++++++++++++++------- net/sunrpc/xprt.c | 14 ++++----- net/sunrpc/xprtsock.c | 6 ++-- 3 files changed, 57 insertions(+), 21 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 41ce296dded1..009a3bb4f997 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -163,7 +163,7 @@ struct rpc_xprt { struct list_head free; /* free slots */ struct rpc_rqst * slot; /* slot table storage */ unsigned int max_reqs; /* total slots */ - unsigned long sockstate; /* Socket state */ + unsigned long state; /* transport state */ unsigned char shutdown : 1, /* being shut down */ nocong : 1, /* no congestion control */ resvport : 1, /* use a reserved port */ @@ -240,16 +240,54 @@ int xs_setup_udp(struct rpc_xprt *, int xs_setup_tcp(struct rpc_xprt *, struct rpc_timeout *); -#define XPRT_LOCKED 0 -#define XPRT_CONNECT 1 -#define XPRT_CONNECTING 2 +/* + * Reserved bit positions in xprt->state + */ +#define XPRT_LOCKED (0) +#define XPRT_CONNECTED (1) +#define XPRT_CONNECTING (2) -#define xprt_connected(xp) (test_bit(XPRT_CONNECT, &(xp)->sockstate)) -#define xprt_set_connected(xp) (set_bit(XPRT_CONNECT, &(xp)->sockstate)) -#define xprt_test_and_set_connected(xp) (test_and_set_bit(XPRT_CONNECT, &(xp)->sockstate)) -#define xprt_test_and_clear_connected(xp) \ - (test_and_clear_bit(XPRT_CONNECT, &(xp)->sockstate)) -#define xprt_clear_connected(xp) (clear_bit(XPRT_CONNECT, &(xp)->sockstate)) +static inline void xprt_set_connected(struct rpc_xprt *xprt) +{ + set_bit(XPRT_CONNECTED, &xprt->state); +} + +static inline void xprt_clear_connected(struct rpc_xprt *xprt) +{ + clear_bit(XPRT_CONNECTED, &xprt->state); +} + +static inline int xprt_connected(struct rpc_xprt *xprt) +{ + return test_bit(XPRT_CONNECTED, &xprt->state); +} + +static inline int xprt_test_and_set_connected(struct rpc_xprt *xprt) +{ + return test_and_set_bit(XPRT_CONNECTED, &xprt->state); +} + +static inline int xprt_test_and_clear_connected(struct rpc_xprt *xprt) +{ + return test_and_clear_bit(XPRT_CONNECTED, &xprt->state); +} + +static inline void xprt_clear_connecting(struct rpc_xprt *xprt) +{ + smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTING, &xprt->state); + smp_mb__after_clear_bit(); +} + +static inline int xprt_connecting(struct rpc_xprt *xprt) +{ + return test_bit(XPRT_CONNECTING, &xprt->state); +} + +static inline int xprt_test_and_set_connecting(struct rpc_xprt *xprt) +{ + return test_and_set_bit(XPRT_CONNECTING, &xprt->state); +} #endif /* __KERNEL__*/ diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 9c45c522e3ef..57c5e77b155e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -74,7 +74,7 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) { + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) return 1; goto out_sleep; @@ -88,7 +88,7 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) return 1; } smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->sockstate); + clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); out_sleep: dprintk("RPC: %4d failed to lock socket %p\n", task->tk_pid, xprt); @@ -118,7 +118,7 @@ __xprt_lock_write_next(struct rpc_xprt *xprt) { struct rpc_task *task; - if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; if (!xprt->nocong && RPCXPRT_CONGESTED(xprt)) goto out_unlock; @@ -139,7 +139,7 @@ __xprt_lock_write_next(struct rpc_xprt *xprt) } out_unlock: smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->sockstate); + clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); } @@ -152,7 +152,7 @@ __xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) if (xprt->snd_task == task) { xprt->snd_task = NULL; smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->sockstate); + clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); __xprt_lock_write_next(xprt); } @@ -312,11 +312,11 @@ xprt_init_autodisconnect(unsigned long data) spin_lock(&xprt->transport_lock); if (!list_empty(&xprt->recv) || xprt->shutdown) goto out_abort; - if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; spin_unlock(&xprt->transport_lock); /* Let keventd close the socket */ - if (test_bit(XPRT_CONNECTING, &xprt->sockstate) != 0) + if (xprt_connecting(xprt)) xprt_release_write(xprt, NULL); else schedule_work(&xprt->task_cleanup); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index bc90caab6088..76a33b54f436 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -925,9 +925,7 @@ static void xs_connect_worker(void *args) else rpc_wake_up(&xprt->pending); out_clear: - smp_mb__before_clear_bit(); - clear_bit(XPRT_CONNECTING, &xprt->sockstate); - smp_mb__after_clear_bit(); + xprt_clear_connecting(xprt); } /** @@ -940,7 +938,7 @@ static void xs_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - if (!test_and_set_bit(XPRT_CONNECTING, &xprt->sockstate)) { + if (!xprt_test_and_set_connecting(xprt)) { if (xprt->sock != NULL) { dprintk("RPC: xs_connect delayed xprt %p\n", xprt); schedule_delayed_work(&xprt->sock_connect, From 86b9f57dfdf455763d2be73a742a9a88bb664173 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:41 -0400 Subject: [PATCH 015/126] [PATCH] RPC: Eliminate socket.h includes in RPC client Clean-up: get rid of unnecessary socket.h and in.h includes in the generic parts of the RPC client. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:06:23 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/auth.c | 1 - net/sunrpc/auth_gss/auth_gss.c | 2 -- net/sunrpc/auth_gss/gss_krb5_mech.c | 1 - net/sunrpc/auth_gss/gss_mech_switch.c | 1 - net/sunrpc/auth_null.c | 2 -- net/sunrpc/auth_unix.c | 2 -- net/sunrpc/clnt.c | 1 - net/sunrpc/sunrpc_syms.c | 1 - 8 files changed, 11 deletions(-) diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 505e2d4b3d62..a415d99c394d 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 2f7b867161d2..53a030acdf75 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -42,8 +42,6 @@ #include #include #include -#include -#include #include #include #include diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 606a8a82cafb..462c5b86b073 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 9dfb68377d69..58aeaddd8c79 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -35,7 +35,6 @@ #include #include -#include #include #include #include diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 9b72d3abf823..f56767aaa927 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -7,9 +7,7 @@ */ #include -#include #include -#include #include #include #include diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 4ff297a9b15b..890fb5ea0dcb 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -9,8 +9,6 @@ #include #include #include -#include -#include #include #include diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index ab50c3c9e6a8..0d1b010a4a01 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index ed48ff022d35..2387e7b823ff 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -10,7 +10,6 @@ #include #include -#include #include #include #include From 44fbac2288dfed6f1963ac00bf922c3bcd779cd1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:44 -0400 Subject: [PATCH 016/126] [PATCH] RPC: Add helper for waking tasks pending on a transport Clean-up: remove only reference to xprt->pending from the socket transport implementation. This makes a cleaner interface for other transport implementations as well. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:06:52 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 18 ++++++++++++++++-- net/sunrpc/xprtsock.c | 7 ++----- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 009a3bb4f997..d5223993fca9 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -232,6 +232,7 @@ void xprt_reserve(struct rpc_task *); int xprt_prepare_transmit(struct rpc_task *); void xprt_transmit(struct rpc_task *); void xprt_receive(struct rpc_task *); +void xprt_wake_pending_tasks(struct rpc_xprt *, int); int xprt_adjust_timeout(struct rpc_rqst *req); void xprt_release(struct rpc_task *); void xprt_connect(struct rpc_task *); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 57c5e77b155e..2f9cd468b953 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -227,6 +227,20 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) xprt->cwnd = cwnd; } +/** + * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue + * @xprt: transport with waiting tasks + * @status: result code to plant in each task before waking it + * + */ +void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status) +{ + if (status < 0) + rpc_wake_up_status(&xprt->pending, status); + else + rpc_wake_up(&xprt->pending); +} + static void xprt_reset_majortimeo(struct rpc_rqst *req) { struct rpc_timeout *to = &req->rq_xprt->timeout; @@ -300,7 +314,7 @@ void xprt_disconnect(struct rpc_xprt *xprt) dprintk("RPC: disconnected transport %p\n", xprt); spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); - rpc_wake_up_status(&xprt->pending, -ENOTCONN); + xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock_bh(&xprt->transport_lock); } @@ -803,7 +817,7 @@ static void xprt_shutdown(struct rpc_xprt *xprt) xprt->shutdown = 1; rpc_wake_up(&xprt->sending); rpc_wake_up(&xprt->resend); - rpc_wake_up(&xprt->pending); + xprt_wake_pending_tasks(xprt, -EIO); rpc_wake_up(&xprt->backlog); wake_up(&xprt->cong_wait); del_timer_sync(&xprt->timer); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 76a33b54f436..182da2edf61c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -703,7 +703,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->tcp_reclen = 0; xprt->tcp_copied = 0; xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; - rpc_wake_up(&xprt->pending); + xprt_wake_pending_tasks(xprt, 0); } spin_unlock_bh(&xprt->transport_lock); break; @@ -920,10 +920,7 @@ static void xs_connect_worker(void *args) } } out: - if (status < 0) - rpc_wake_up_status(&xprt->pending, status); - else - rpc_wake_up(&xprt->pending); + xprt_wake_pending_tasks(xprt, status); out_clear: xprt_clear_connecting(xprt); } From 55aa4f58aa43dc9a51fb80010630d94b96053a2e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:47 -0400 Subject: [PATCH 017/126] [PATCH] RPC: client-side transport switch cleanup Clean-up: change some comments to reflect the realities of the new RPC transport switch mechanism. Get rid of unused xprt_receive() prototype. Also, organize function prototypes in xprt.h by usage and scope. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:07:21 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 61 +++++++++++++++++++++---------------- net/sunrpc/clnt.c | 2 +- net/sunrpc/xprt.c | 26 ++++++++-------- net/sunrpc/xprtsock.c | 12 +++++--- 4 files changed, 55 insertions(+), 46 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index d5223993fca9..bfbc492ae36d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -1,5 +1,5 @@ /* - * linux/include/linux/sunrpc/clnt_xprt.h + * linux/include/linux/sunrpc/xprt.h * * Declarations for the RPC transport interface. * @@ -150,8 +150,8 @@ struct rpc_xprt { unsigned long cong; /* current congestion */ unsigned long cwnd; /* congestion window */ - unsigned int rcvsize, /* socket receive buffer size */ - sndsize; /* socket send buffer size */ + unsigned int rcvsize, /* transport rcv buffer size */ + sndsize; /* transport send buffer size */ size_t max_payload; /* largest RPC payload size, in bytes */ @@ -184,12 +184,12 @@ struct rpc_xprt { unsigned long tcp_copied, /* copied to request */ tcp_flags; /* - * Connection of sockets + * Connection of transports */ - struct work_struct sock_connect; + struct work_struct connect_worker; unsigned short port; /* - * Disconnection of idle sockets + * Disconnection of idle transports */ struct work_struct task_cleanup; struct timer_list timer; @@ -219,27 +219,36 @@ struct rpc_xprt { #ifdef __KERNEL__ -struct rpc_xprt * xprt_create_proto(int proto, struct sockaddr_in *addr, - struct rpc_timeout *toparms); -void xprt_disconnect(struct rpc_xprt *); -int xprt_destroy(struct rpc_xprt *); -void xprt_set_timeout(struct rpc_timeout *, unsigned int, - unsigned long); -struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *, u32); -void xprt_complete_rqst(struct rpc_xprt *, - struct rpc_rqst *, int); -void xprt_reserve(struct rpc_task *); -int xprt_prepare_transmit(struct rpc_task *); -void xprt_transmit(struct rpc_task *); -void xprt_receive(struct rpc_task *); -void xprt_wake_pending_tasks(struct rpc_xprt *, int); +/* + * Transport operations used by ULPs + */ +struct rpc_xprt * xprt_create_proto(int proto, struct sockaddr_in *addr, struct rpc_timeout *to); +void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr); + +/* + * Generic internal transport functions + */ +void xprt_connect(struct rpc_task *task); +void xprt_reserve(struct rpc_task *task); +int xprt_prepare_transmit(struct rpc_task *task); +void xprt_transmit(struct rpc_task *task); int xprt_adjust_timeout(struct rpc_rqst *req); -void xprt_release(struct rpc_task *); -void xprt_connect(struct rpc_task *); -int xs_setup_udp(struct rpc_xprt *, - struct rpc_timeout *); -int xs_setup_tcp(struct rpc_xprt *, - struct rpc_timeout *); +void xprt_release(struct rpc_task *task); +int xprt_destroy(struct rpc_xprt *xprt); + +/* + * Transport switch helper functions + */ +void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); +struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid); +void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied); +void xprt_disconnect(struct rpc_xprt *xprt); + +/* + * Socket transport setup operations + */ +int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to); +int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to); /* * Reserved bit positions in xprt->state diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 0d1b010a4a01..4677959d2834 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1,5 +1,5 @@ /* - * linux/net/sunrpc/rpcclnt.c + * linux/net/sunrpc/clnt.c * * This file contains the high-level RPC interface. * It is modeled as a finite state machine to support both synchronous diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 2f9cd468b953..247fa1ec870c 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -10,12 +10,12 @@ * one is available. Otherwise, it sleeps on the backlog queue * (xprt_reserve). * - Next, the caller puts together the RPC message, stuffs it into - * the request struct, and calls xprt_call(). - * - xprt_call transmits the message and installs the caller on the - * socket's wait list. At the same time, it installs a timer that + * the request struct, and calls xprt_transmit(). + * - xprt_transmit sends the message and installs the caller on the + * transport's wait list. At the same time, it installs a timer that * is run after the packet's timeout has expired. * - When a packet arrives, the data_ready handler walks the list of - * pending requests for that socket. If a matching XID is found, the + * pending requests for that transport. If a matching XID is found, the * caller is woken up, and the timer removed. * - When no reply arrives within the timeout interval, the timer is * fired by the kernel and runs xprt_timer(). It either adjusts the @@ -32,6 +32,8 @@ * tasks that rely on callbacks. * * Copyright (C) 1995-1997, Olaf Kirch + * + * Transport switch API copyright (C) 2005, Chuck Lever */ #include @@ -52,8 +54,6 @@ # define RPCDBG_FACILITY RPCDBG_XPRT #endif -#define XPRT_MAX_BACKOFF (8) - /* * Local functions */ @@ -65,9 +65,9 @@ static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); static int xprt_clear_backlog(struct rpc_xprt *xprt); /* - * Serialize write access to sockets, in order to prevent different + * Serialize write access to transports, in order to prevent different * requests from interfering with each other. - * Also prevents TCP socket connects from colliding with writes. + * Also prevents transport connects from colliding with writes. */ static int __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) @@ -91,7 +91,7 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); out_sleep: - dprintk("RPC: %4d failed to lock socket %p\n", task->tk_pid, xprt); + dprintk("RPC: %4d failed to lock transport %p\n", task->tk_pid, xprt); task->tk_timeout = 0; task->tk_status = -EAGAIN; if (req && req->rq_ntrans) @@ -144,7 +144,7 @@ __xprt_lock_write_next(struct rpc_xprt *xprt) } /* - * Releases the socket for use by other requests. + * Releases the transport for use by other requests. */ static void __xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) @@ -294,8 +294,7 @@ int xprt_adjust_timeout(struct rpc_rqst *req) return status; } -static void -xprt_socket_autoclose(void *args) +static void xprt_autoclose(void *args) { struct rpc_xprt *xprt = (struct rpc_xprt *)args; @@ -329,7 +328,6 @@ xprt_init_autodisconnect(unsigned long data) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; spin_unlock(&xprt->transport_lock); - /* Let keventd close the socket */ if (xprt_connecting(xprt)) xprt_release_write(xprt, NULL); else @@ -770,7 +768,7 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); - INIT_WORK(&xprt->task_cleanup, xprt_socket_autoclose, xprt); + INIT_WORK(&xprt->task_cleanup, xprt_autoclose, xprt); init_timer(&xprt->timer); xprt->timer.function = xprt_init_autodisconnect; xprt->timer.data = (unsigned long) xprt; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 182da2edf61c..7f0b9f7f167b 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -11,6 +11,8 @@ * Rewrite of larges part of the code in order to stabilize TCP stuff. * Fix behaviour when socket buffer is full. * (C) 1999 Trond Myklebust + * + * IP socket transport implementation, (C) 2005 Chuck Lever */ #include @@ -363,7 +365,7 @@ static void xs_destroy(struct rpc_xprt *xprt) { dprintk("RPC: xs_destroy xprt %p\n", xprt); - cancel_delayed_work(&xprt->sock_connect); + cancel_delayed_work(&xprt->connect_worker); flush_scheduled_work(); xprt_disconnect(xprt); @@ -938,11 +940,11 @@ static void xs_connect(struct rpc_task *task) if (!xprt_test_and_set_connecting(xprt)) { if (xprt->sock != NULL) { dprintk("RPC: xs_connect delayed xprt %p\n", xprt); - schedule_delayed_work(&xprt->sock_connect, + schedule_delayed_work(&xprt->connect_worker, RPC_REESTABLISH_TIMEOUT); } else { dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); - schedule_work(&xprt->sock_connect); + schedule_work(&xprt->connect_worker); /* flush_scheduled_work can sleep... */ if (!RPC_IS_ASYNC(task)) flush_scheduled_work(); @@ -989,7 +991,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); - INIT_WORK(&xprt->sock_connect, xs_connect_worker, xprt); + INIT_WORK(&xprt->connect_worker, xs_connect_worker, xprt); xprt->ops = &xs_ops; @@ -1028,7 +1030,7 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; xprt->max_payload = (1U << 31) - 1; - INIT_WORK(&xprt->sock_connect, xs_connect_worker, xprt); + INIT_WORK(&xprt->connect_worker, xs_connect_worker, xprt); xprt->ops = &xs_ops; From c7b2cae8a634015b72941ba2fc6c4bc9b8d3a129 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:50 -0400 Subject: [PATCH 018/126] [PATCH] RPC: separate TCP and UDP write space callbacks Split the socket write space callback function into a TCP version and UDP version, eliminating one dependence on the "xprt->stream" variable. Keep the common pieces of this path in xprt.c so other transports can use it too. Test-plan: Write-intensive workload on a single mount point. Version: Thu, 11 Aug 2005 16:07:51 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 + net/sunrpc/xprt.c | 34 +++++++++++++++ net/sunrpc/xprtsock.c | 84 +++++++++++++++++++++++-------------- 3 files changed, 89 insertions(+), 31 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index bfbc492ae36d..e73174c7e450 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -240,6 +240,8 @@ int xprt_destroy(struct rpc_xprt *xprt); * Transport switch helper functions */ void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); +void xprt_wait_for_buffer_space(struct rpc_task *task); +void xprt_write_space(struct rpc_xprt *xprt); struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid); void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied); void xprt_disconnect(struct rpc_xprt *xprt); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 247fa1ec870c..31ef7dc7eed6 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -241,6 +241,40 @@ void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status) rpc_wake_up(&xprt->pending); } +/** + * xprt_wait_for_buffer_space - wait for transport output buffer to clear + * @task: task to be put to sleep + * + */ +void xprt_wait_for_buffer_space(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + task->tk_timeout = req->rq_timeout; + rpc_sleep_on(&xprt->pending, task, NULL, NULL); +} + +/** + * xprt_write_space - wake the task waiting for transport output buffer space + * @xprt: transport with waiting tasks + * + * Can be called in a soft IRQ context, so xprt_write_space never sleeps. + */ +void xprt_write_space(struct rpc_xprt *xprt) +{ + if (unlikely(xprt->shutdown)) + return; + + spin_lock_bh(&xprt->transport_lock); + if (xprt->snd_task) { + dprintk("RPC: write space: waking waiting task on xprt %p\n", + xprt); + rpc_wake_up_task(xprt->snd_task); + } + spin_unlock_bh(&xprt->transport_lock); +} + static void xprt_reset_majortimeo(struct rpc_rqst *req) { struct rpc_timeout *to = &req->rq_xprt->timeout; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7f0b9f7f167b..70a772d7a796 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -308,15 +308,13 @@ static int xs_send_request(struct rpc_task *task) if (status == -EAGAIN) { if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { - /* Protect against races with xs_write_space */ + /* Protect against races with write_space */ spin_lock_bh(&xprt->transport_lock); /* Don't race with disconnect */ if (!xprt_connected(xprt)) task->tk_status = -ENOTCONN; - else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) { - task->tk_timeout = req->rq_timeout; - rpc_sleep_on(&xprt->pending, task, NULL, NULL); - } + else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) + xprt_wait_for_buffer_space(task); spin_unlock_bh(&xprt->transport_lock); return status; } @@ -721,45 +719,68 @@ static void xs_tcp_state_change(struct sock *sk) } /** - * xs_write_space - callback invoked when socket buffer space becomes - * available + * xs_udp_write_space - callback invoked when socket buffer space + * becomes available * @sk: socket whose state has changed * * Called when more output buffer space is available for this socket. * We try not to wake our writers until they can make "significant" - * progress, otherwise we'll waste resources thrashing sock_sendmsg + * progress, otherwise we'll waste resources thrashing kernel_sendmsg * with a bunch of small requests. */ -static void xs_write_space(struct sock *sk) +static void xs_udp_write_space(struct sock *sk) { - struct rpc_xprt *xprt; - struct socket *sock; - read_lock(&sk->sk_callback_lock); - if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->sk_socket)) - goto out; - if (xprt->shutdown) - goto out; - /* Wait until we have enough socket memory */ - if (xprt->stream) { - /* from net/core/stream.c:sk_stream_write_space */ - if (sk_stream_wspace(sk) < sk_stream_min_wspace(sk)) + /* from net/core/sock.c:sock_def_write_space */ + if (sock_writeable(sk)) { + struct socket *sock; + struct rpc_xprt *xprt; + + if (unlikely(!(sock = sk->sk_socket))) goto out; - } else { - /* from net/core/sock.c:sock_def_write_space */ - if (!sock_writeable(sk)) + if (unlikely(!(xprt = xprt_from_sock(sk)))) goto out; + if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags))) + goto out; + + xprt_write_space(xprt); } - if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)) - goto out; + out: + read_unlock(&sk->sk_callback_lock); +} - spin_lock_bh(&xprt->transport_lock); - if (xprt->snd_task) - rpc_wake_up_task(xprt->snd_task); - spin_unlock_bh(&xprt->transport_lock); -out: +/** + * xs_tcp_write_space - callback invoked when socket buffer space + * becomes available + * @sk: socket whose state has changed + * + * Called when more output buffer space is available for this socket. + * We try not to wake our writers until they can make "significant" + * progress, otherwise we'll waste resources thrashing kernel_sendmsg + * with a bunch of small requests. + */ +static void xs_tcp_write_space(struct sock *sk) +{ + read_lock(&sk->sk_callback_lock); + + /* from net/core/stream.c:sk_stream_write_space */ + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + struct socket *sock; + struct rpc_xprt *xprt; + + if (unlikely(!(sock = sk->sk_socket))) + goto out; + if (unlikely(!(xprt = xprt_from_sock(sk)))) + goto out; + if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags))) + goto out; + + xprt_write_space(xprt); + } + + out: read_unlock(&sk->sk_callback_lock); } @@ -855,15 +876,16 @@ static void xs_bind(struct rpc_xprt *xprt, struct socket *sock) xprt->old_write_space = sk->sk_write_space; if (xprt->prot == IPPROTO_UDP) { sk->sk_data_ready = xs_udp_data_ready; + sk->sk_write_space = xs_udp_write_space; sk->sk_no_check = UDP_CSUM_NORCV; xprt_set_connected(xprt); } else { tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */ sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; + sk->sk_write_space = xs_tcp_write_space; xprt_clear_connected(xprt); } - sk->sk_write_space = xs_write_space; /* Reset to new socket */ xprt->sock = sock; From b0d93ad511ce2f37823a07c7a3258117a431f5fb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:53 -0400 Subject: [PATCH 019/126] [PATCH] RPC: separate TCP and UDP transport connection logic Create separate connection worker functions for managing UDP and TCP transport sockets. This eliminates several dependencies on "xprt->stream". Test-plan: Destructive testing (unplugging the network temporarily). Connectathon with v2, v3, and v4. Version: Thu, 11 Aug 2005 16:08:18 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 198 +++++++++++++++++++++++------------------- 1 file changed, 108 insertions(+), 90 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 70a772d7a796..f91529787b9b 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -836,102 +836,118 @@ static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) return err; } -static struct socket *xs_create(struct rpc_xprt *xprt, int proto, int resvport) -{ - struct socket *sock; - int type, err; - - dprintk("RPC: xs_create(%s %d)\n", - (proto == IPPROTO_UDP)? "udp" : "tcp", proto); - - type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; - - if ((err = sock_create_kern(PF_INET, type, proto, &sock)) < 0) { - dprintk("RPC: can't create socket (%d).\n", -err); - return NULL; - } - - /* If the caller has the capability, bind to a reserved port */ - if (resvport && xs_bindresvport(xprt, sock) < 0) - goto failed; - - return sock; - -failed: - sock_release(sock); - return NULL; -} - -static void xs_bind(struct rpc_xprt *xprt, struct socket *sock) -{ - struct sock *sk = sock->sk; - - if (xprt->inet) - return; - - write_lock_bh(&sk->sk_callback_lock); - sk->sk_user_data = xprt; - xprt->old_data_ready = sk->sk_data_ready; - xprt->old_state_change = sk->sk_state_change; - xprt->old_write_space = sk->sk_write_space; - if (xprt->prot == IPPROTO_UDP) { - sk->sk_data_ready = xs_udp_data_ready; - sk->sk_write_space = xs_udp_write_space; - sk->sk_no_check = UDP_CSUM_NORCV; - xprt_set_connected(xprt); - } else { - tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */ - sk->sk_data_ready = xs_tcp_data_ready; - sk->sk_state_change = xs_tcp_state_change; - sk->sk_write_space = xs_tcp_write_space; - xprt_clear_connected(xprt); - } - - /* Reset to new socket */ - xprt->sock = sock; - xprt->inet = sk; - write_unlock_bh(&sk->sk_callback_lock); - - return; -} - /** - * xs_connect_worker - try to connect a socket to a remote endpoint + * xs_udp_connect_worker - set up a UDP socket * @args: RPC transport to connect * * Invoked by a work queue tasklet. */ -static void xs_connect_worker(void *args) +static void xs_udp_connect_worker(void *args) { - struct rpc_xprt *xprt = (struct rpc_xprt *)args; + struct rpc_xprt *xprt = (struct rpc_xprt *) args; struct socket *sock = xprt->sock; - int status = -EIO; + int err, status = -EIO; if (xprt->shutdown || xprt->addr.sin_port == 0) goto out; - dprintk("RPC: xs_connect_worker xprt %p\n", xprt); + dprintk("RPC: xs_udp_connect_worker for xprt %p\n", xprt); - /* - * Start by resetting any existing state - */ + /* Start by resetting any existing state */ xs_close(xprt); - sock = xs_create(xprt, xprt->prot, xprt->resvport); - if (sock == NULL) { - /* couldn't create socket or bind to reserved port; - * this is likely a permanent error, so cause an abort */ + + if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { + dprintk("RPC: can't create UDP transport socket (%d).\n", -err); goto out; } - xs_bind(xprt, sock); - xs_set_buffer_size(xprt); + if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) { + sock_release(sock); + goto out; + } + + if (!xprt->inet) { + struct sock *sk = sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + + sk->sk_user_data = xprt; + xprt->old_data_ready = sk->sk_data_ready; + xprt->old_state_change = sk->sk_state_change; + xprt->old_write_space = sk->sk_write_space; + sk->sk_data_ready = xs_udp_data_ready; + sk->sk_write_space = xs_udp_write_space; + sk->sk_no_check = UDP_CSUM_NORCV; + + xprt_set_connected(xprt); + + /* Reset to new socket */ + xprt->sock = sock; + xprt->inet = sk; + + write_unlock_bh(&sk->sk_callback_lock); + } + xs_set_buffer_size(xprt); status = 0; - if (!xprt->stream) +out: + xprt_wake_pending_tasks(xprt, status); + xprt_clear_connecting(xprt); +} + +/** + * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint + * @args: RPC transport to connect + * + * Invoked by a work queue tasklet. + */ +static void xs_tcp_connect_worker(void *args) +{ + struct rpc_xprt *xprt = (struct rpc_xprt *)args; + struct socket *sock = xprt->sock; + int err, status = -EIO; + + if (xprt->shutdown || xprt->addr.sin_port == 0) goto out; - /* - * Tell the socket layer to start connecting... - */ + dprintk("RPC: xs_tcp_connect_worker for xprt %p\n", xprt); + + /* Start by resetting any existing socket state */ + xs_close(xprt); + + if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { + dprintk("RPC: can't create TCP transport socket (%d).\n", -err); + goto out; + } + + if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) { + sock_release(sock); + goto out; + } + + if (!xprt->inet) { + struct sock *sk = sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + + sk->sk_user_data = xprt; + xprt->old_data_ready = sk->sk_data_ready; + xprt->old_state_change = sk->sk_state_change; + xprt->old_write_space = sk->sk_write_space; + sk->sk_data_ready = xs_tcp_data_ready; + sk->sk_state_change = xs_tcp_state_change; + sk->sk_write_space = xs_tcp_write_space; + tcp_sk(sk)->nonagle = 1; + + xprt_clear_connected(xprt); + + /* Reset to new socket */ + xprt->sock = sock; + xprt->inet = sk; + + write_unlock_bh(&sk->sk_callback_lock); + } + + /* Tell the socket layer to start connecting... */ status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, sizeof(xprt->addr), O_NONBLOCK); dprintk("RPC: %p connect status %d connected %d sock state %d\n", @@ -959,18 +975,20 @@ static void xs_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - if (!xprt_test_and_set_connecting(xprt)) { - if (xprt->sock != NULL) { - dprintk("RPC: xs_connect delayed xprt %p\n", xprt); - schedule_delayed_work(&xprt->connect_worker, + if (xprt_test_and_set_connecting(xprt)) + return; + + if (xprt->sock != NULL) { + dprintk("RPC: xs_connect delayed xprt %p\n", xprt); + schedule_delayed_work(&xprt->connect_worker, RPC_REESTABLISH_TIMEOUT); - } else { - dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); - schedule_work(&xprt->connect_worker); - /* flush_scheduled_work can sleep... */ - if (!RPC_IS_ASYNC(task)) - flush_scheduled_work(); - } + } else { + dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); + schedule_work(&xprt->connect_worker); + + /* flush_scheduled_work can sleep... */ + if (!RPC_IS_ASYNC(task)) + flush_scheduled_work(); } } @@ -1013,7 +1031,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); - INIT_WORK(&xprt->connect_worker, xs_connect_worker, xprt); + INIT_WORK(&xprt->connect_worker, xs_udp_connect_worker, xprt); xprt->ops = &xs_ops; @@ -1052,7 +1070,7 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; xprt->max_payload = (1U << 31) - 1; - INIT_WORK(&xprt->connect_worker, xs_connect_worker, xprt); + INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt); xprt->ops = &xs_ops; From 262965f53defd312a294b45366ea17907b6a616b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:56 -0400 Subject: [PATCH 020/126] [PATCH] RPC: separate TCP and UDP socket write paths Split the RPC client's main socket write path into a TCP version and a UDP version to eliminate another dependency on the "xprt->stream" variable. Compiler optimization removes unneeded code from xs_sendpages, as this function is now called with some constant arguments. We can now cleanly perform transport protocol-specific return code testing and error recovery in each path. Test-plan: Millions of fsx operations. Performance characterization such as "sio" or "iozone". Examine oprofile results for any changes before and after this patch is applied. Version: Thu, 11 Aug 2005 16:08:46 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 215 +++++++++++++++++++++++++----------------- 1 file changed, 128 insertions(+), 87 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index f91529787b9b..57988300640a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -40,6 +40,12 @@ */ #define XS_MAX_RESVPORT (800U) +/* + * How many times to try sending a request on a socket before waiting + * for the socket buffer to clear. + */ +#define XS_SENDMSG_RETRY (10U) + #ifdef RPC_DEBUG # undef RPC_DEBUG_DATA # define RPCDBG_FACILITY RPCDBG_TRANS @@ -114,13 +120,18 @@ static int xs_send_tail(struct socket *sock, struct xdr_buf *xdr, unsigned int b * @base: starting position in the buffer * */ -static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base) +static inline int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base) { struct page **ppage = xdr->pages; unsigned int len, pglen = xdr->page_len; int err, ret = 0; ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int); + if (unlikely(!sock)) + return -ENOTCONN; + + clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + len = xdr->head[0].iov_len; if (base < len || (addr != NULL && base == 0)) { err = xs_send_head(sock, addr, addrlen, xdr, base, len); @@ -187,140 +198,162 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, } /** - * xs_sendmsg - write an RPC request to a socket - * @xprt: generic transport - * @req: the RPC request to write + * xs_nospace - place task on wait queue if transmit was incomplete + * @task: task to put to sleep * */ -static int xs_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) +static void xs_nospace(struct rpc_task *task) { - struct socket *sock = xprt->sock; - struct xdr_buf *xdr = &req->rq_snd_buf; - struct sockaddr *addr = NULL; - int addrlen = 0; - unsigned int skip; - int result; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; - if (!sock) - return -ENOTCONN; + dprintk("RPC: %4d xmit incomplete (%u left of %u)\n", + task->tk_pid, req->rq_slen - req->rq_bytes_sent, + req->rq_slen); + + if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { + /* Protect against races with write_space */ + spin_lock_bh(&xprt->transport_lock); + + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) + xprt_wait_for_buffer_space(task); + + spin_unlock_bh(&xprt->transport_lock); + } else + /* Keep holding the socket if it is blocked */ + rpc_delay(task, HZ>>4); +} + +/** + * xs_udp_send_request - write an RPC request to a UDP socket + * @task: address of RPC task that manages the state of an RPC request + * + * Return values: + * 0: The request has been sent + * EAGAIN: The socket was blocked, please call again later to + * complete the request + * ENOTCONN: Caller needs to invoke connect logic then call again + * other: Some other error occured, the request was not sent + */ +static int xs_udp_send_request(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + struct xdr_buf *xdr = &req->rq_snd_buf; + int status; xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); - /* For UDP, we need to provide an address */ - if (!xprt->stream) { - addr = (struct sockaddr *) &xprt->addr; - addrlen = sizeof(xprt->addr); - } - /* Don't repeat bytes */ - skip = req->rq_bytes_sent; + req->rq_xtime = jiffies; + status = xs_sendpages(xprt->sock, (struct sockaddr *) &xprt->addr, + sizeof(xprt->addr), xdr, req->rq_bytes_sent); - clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); - result = xs_sendpages(sock, addr, addrlen, xdr, skip); + dprintk("RPC: xs_udp_send_request(%u) = %d\n", + xdr->len - req->rq_bytes_sent, status); - dprintk("RPC: xs_sendmsg(%d) = %d\n", xdr->len - skip, result); + if (likely(status >= (int) req->rq_slen)) + return 0; - if (result >= 0) - return result; + /* Still some bytes left; set up for a retry later. */ + if (status > 0) + status = -EAGAIN; - switch (result) { + switch (status) { + case -ENETUNREACH: + case -EPIPE: case -ECONNREFUSED: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ - case -EAGAIN: break; - case -ECONNRESET: - case -ENOTCONN: - case -EPIPE: - /* connection broken */ - if (xprt->stream) - result = -ENOTCONN; + case -EAGAIN: + xs_nospace(task); break; default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); break; } - return result; + + return status; } /** - * xs_send_request - write an RPC request to a socket + * xs_tcp_send_request - write an RPC request to a TCP socket * @task: address of RPC task that manages the state of an RPC request * * Return values: - * 0: The request has been sent - * EAGAIN: The socket was blocked, please call again later to - * complete the request - * other: Some other error occured, the request was not sent + * 0: The request has been sent + * EAGAIN: The socket was blocked, please call again later to + * complete the request + * ENOTCONN: Caller needs to invoke connect logic then call again + * other: Some other error occured, the request was not sent * * XXX: In the case of soft timeouts, should we eventually give up - * if the socket is not able to make progress? + * if sendmsg is not able to make progress? */ -static int xs_send_request(struct rpc_task *task) +static int xs_tcp_send_request(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; + struct xdr_buf *xdr = &req->rq_snd_buf; + u32 *marker = req->rq_svec[0].iov_base; int status, retry = 0; - /* set up everything as needed. */ /* Write the record marker */ - if (xprt->stream) { - u32 *marker = req->rq_svec[0].iov_base; + *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); - *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); - } + xs_pktdump("packet data:", + req->rq_svec->iov_base, + req->rq_svec->iov_len); /* Continue transmitting the packet/record. We must be careful * to cope with writespace callbacks arriving _after_ we have - * called sendmsg(). - */ + * called sendmsg(). */ while (1) { req->rq_xtime = jiffies; - status = xs_sendmsg(xprt, req); + status = xs_sendpages(xprt->sock, NULL, 0, xdr, + req->rq_bytes_sent); - if (status < 0) + dprintk("RPC: xs_tcp_send_request(%u) = %d\n", + xdr->len - req->rq_bytes_sent, status); + + if (unlikely(status < 0)) break; - if (xprt->stream) { - req->rq_bytes_sent += status; - - /* If we've sent the entire packet, immediately - * reset the count of bytes sent. */ - if (req->rq_bytes_sent >= req->rq_slen) { - req->rq_bytes_sent = 0; - return 0; - } - } else { - if (status >= req->rq_slen) - return 0; - status = -EAGAIN; - break; + /* If we've sent the entire packet, immediately + * reset the count of bytes sent. */ + req->rq_bytes_sent += status; + if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_bytes_sent = 0; + return 0; } - dprintk("RPC: %4d xmit incomplete (%d left of %d)\n", - task->tk_pid, req->rq_slen - req->rq_bytes_sent, - req->rq_slen); - status = -EAGAIN; - if (retry++ > 50) + if (retry++ > XS_SENDMSG_RETRY) break; } - if (status == -EAGAIN) { - if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { - /* Protect against races with write_space */ - spin_lock_bh(&xprt->transport_lock); - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) - xprt_wait_for_buffer_space(task); - spin_unlock_bh(&xprt->transport_lock); - return status; - } - /* Keep holding the socket if it is blocked */ - rpc_delay(task, HZ>>4); + switch (status) { + case -EAGAIN: + xs_nospace(task); + break; + case -ECONNREFUSED: + case -ECONNRESET: + case -ENOTCONN: + case -EPIPE: + status = -ENOTCONN; + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); + break; } + return status; } @@ -992,10 +1025,18 @@ static void xs_connect(struct rpc_task *task) } } -static struct rpc_xprt_ops xs_ops = { +static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_set_buffer_size, .connect = xs_connect, - .send_request = xs_send_request, + .send_request = xs_udp_send_request, + .close = xs_close, + .destroy = xs_destroy, +}; + +static struct rpc_xprt_ops xs_tcp_ops = { + .set_buffer_size = xs_set_buffer_size, + .connect = xs_connect, + .send_request = xs_tcp_send_request, .close = xs_close, .destroy = xs_destroy, }; @@ -1033,7 +1074,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) INIT_WORK(&xprt->connect_worker, xs_udp_connect_worker, xprt); - xprt->ops = &xs_ops; + xprt->ops = &xs_udp_ops; if (to) xprt->timeout = *to; @@ -1072,7 +1113,7 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt); - xprt->ops = &xs_ops; + xprt->ops = &xs_tcp_ops; if (to) xprt->timeout = *to; From 808012fbb23a52ec59352445d2076d175ad4ab26 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:49 -0700 Subject: [PATCH 021/126] [PATCH] RPC: skip over transport-specific heads automatically Add a generic mechanism for skipping over transport-specific headers when constructing an RPC request. This removes another "xprt->stream" dependency. Test-plan: Write-intensive workload on a single mount point (try both UDP and TCP). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/msg_prot.h | 25 +++++++++++++++++++++++++ include/linux/sunrpc/xprt.h | 7 +++++++ net/sunrpc/auth_gss/auth_gss.c | 6 ++---- net/sunrpc/clnt.c | 5 ++--- net/sunrpc/xprtsock.c | 24 +++++++++++++++++------- 5 files changed, 53 insertions(+), 14 deletions(-) diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h index 15f115332389..f43f237360ae 100644 --- a/include/linux/sunrpc/msg_prot.h +++ b/include/linux/sunrpc/msg_prot.h @@ -76,5 +76,30 @@ enum rpc_auth_stat { #define RPC_MAXNETNAMELEN 256 +/* + * From RFC 1831: + * + * "A record is composed of one or more record fragments. A record + * fragment is a four-byte header followed by 0 to (2**31) - 1 bytes of + * fragment data. The bytes encode an unsigned binary number; as with + * XDR integers, the byte order is from highest to lowest. The number + * encodes two values -- a boolean which indicates whether the fragment + * is the last fragment of the record (bit value 1 implies the fragment + * is the last fragment) and a 31-bit unsigned binary value which is the + * length in bytes of the fragment's data. The boolean value is the + * highest-order bit of the header; the length is the 31 low-order bits. + * (Note that this record specification is NOT in XDR standard form!)" + * + * The Linux RPC client always sends its requests in a single record + * fragment, limiting the maximum payload size for stream transports to + * 2GB. + */ + +typedef u32 rpc_fraghdr; + +#define RPC_LAST_STREAM_FRAGMENT (1U << 31) +#define RPC_FRAGMENT_SIZE_MASK (~RPC_LAST_STREAM_FRAGMENT) +#define RPC_MAX_FRAGMENT_SIZE ((1U << 31) - 1) + #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_MSGPROT_H_ */ diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index e73174c7e450..966c456a0f6d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -155,6 +155,8 @@ struct rpc_xprt { size_t max_payload; /* largest RPC payload size, in bytes */ + unsigned int tsh_size; /* size of transport specific + header */ struct rpc_wait_queue sending; /* requests waiting to send */ struct rpc_wait_queue resend; /* requests waiting to resend */ @@ -236,6 +238,11 @@ int xprt_adjust_timeout(struct rpc_rqst *req); void xprt_release(struct rpc_task *task); int xprt_destroy(struct rpc_xprt *xprt); +static inline u32 *xprt_skip_transport_header(struct rpc_xprt *xprt, u32 *p) +{ + return p + xprt->tsh_size; +} + /* * Transport switch helper functions */ diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 53a030acdf75..d2b08f16c257 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -844,10 +844,8 @@ gss_marshal(struct rpc_task *task, u32 *p) /* We compute the checksum for the verifier over the xdr-encoded bytes * starting with the xid and ending at the end of the credential: */ - iov.iov_base = req->rq_snd_buf.head[0].iov_base; - if (task->tk_client->cl_xprt->stream) - /* See clnt.c:call_header() */ - iov.iov_base += 4; + iov.iov_base = xprt_skip_transport_header(task->tk_xprt, + req->rq_snd_buf.head[0].iov_base); iov.iov_len = (u8 *)p - (u8 *)iov.iov_base; xdr_buf_from_iov(&iov, &verf_buf); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 4677959d2834..cc1b773a79d3 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1075,13 +1075,12 @@ static u32 * call_header(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_xprt *xprt = clnt->cl_xprt; struct rpc_rqst *req = task->tk_rqstp; u32 *p = req->rq_svec[0].iov_base; /* FIXME: check buffer size? */ - if (xprt->stream) - *p++ = 0; /* fill in later */ + + p = xprt_skip_transport_header(task->tk_xprt, p); *p++ = req->rq_xid; /* XID */ *p++ = htonl(RPC_CALL); /* CALL */ *p++ = htonl(RPC_VERSION); /* RPC version */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 57988300640a..aaf053b1a0c4 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -282,6 +282,13 @@ static int xs_udp_send_request(struct rpc_task *task) return status; } +static inline void xs_encode_tcp_record_marker(struct xdr_buf *buf) +{ + u32 reclen = buf->len - sizeof(rpc_fraghdr); + rpc_fraghdr *base = buf->head[0].iov_base; + *base = htonl(RPC_LAST_STREAM_FRAGMENT | reclen); +} + /** * xs_tcp_send_request - write an RPC request to a TCP socket * @task: address of RPC task that manages the state of an RPC request @@ -301,11 +308,9 @@ static int xs_tcp_send_request(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct xdr_buf *xdr = &req->rq_snd_buf; - u32 *marker = req->rq_svec[0].iov_base; int status, retry = 0; - /* Write the record marker */ - *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); + xs_encode_tcp_record_marker(&req->rq_snd_buf); xs_pktdump("packet data:", req->rq_svec->iov_base, @@ -503,16 +508,19 @@ static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc xprt->tcp_offset += used; if (used != len) return; + xprt->tcp_reclen = ntohl(xprt->tcp_recm); - if (xprt->tcp_reclen & 0x80000000) + if (xprt->tcp_reclen & RPC_LAST_STREAM_FRAGMENT) xprt->tcp_flags |= XPRT_LAST_FRAG; else xprt->tcp_flags &= ~XPRT_LAST_FRAG; - xprt->tcp_reclen &= 0x7fffffff; + xprt->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK; + xprt->tcp_flags &= ~XPRT_COPY_RECM; xprt->tcp_offset = 0; + /* Sanity check of the record length */ - if (xprt->tcp_reclen < 4) { + if (unlikely(xprt->tcp_reclen < 4)) { dprintk("RPC: invalid TCP record fragment length\n"); xprt_disconnect(xprt); return; @@ -1065,6 +1073,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_UDP; xprt->port = XS_MAX_RESVPORT; + xprt->tsh_size = 0; xprt->stream = 0; xprt->nocong = 0; xprt->cwnd = RPC_INITCWND; @@ -1105,11 +1114,12 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_TCP; xprt->port = XS_MAX_RESVPORT; + xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); xprt->stream = 1; xprt->nocong = 1; xprt->cwnd = RPC_MAXCWND(xprt); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; - xprt->max_payload = (1U << 31) - 1; + xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt); From 43118c29dea2b23798bd42a147015cceee7fa885 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:49 -0700 Subject: [PATCH 022/126] [PATCH] RPC: get rid of xprt->stream Now we can fix up the last few places that use the "xprt->stream" variable, and get rid of it from the rpc_xprt structure. Test-plan: Destructive testing (unplugging the network temporarily). Connectathon with UDP and TCP. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 3 +-- include/linux/sunrpc/xprt.h | 3 +-- net/sunrpc/xprt.c | 3 +-- net/sunrpc/xprtsock.c | 28 ++++++++++++++++++---------- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 82c77df81c5f..7901f5b8092c 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -173,11 +173,10 @@ nlm_bind_host(struct nlm_host *host) /* If we've already created an RPC client, check whether * RPC rebind is required - * Note: why keep rebinding if we're on a tcp connection? */ if ((clnt = host->h_rpcclnt) != NULL) { xprt = clnt->cl_xprt; - if (!xprt->stream && time_after_eq(jiffies, host->h_nextrebind)) { + if (time_after_eq(jiffies, host->h_nextrebind)) { clnt->cl_port = 0; host->h_nextrebind = jiffies + NLM_HOST_REBIND; dprintk("lockd: next rebind in %ld jiffies\n", diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 966c456a0f6d..c9477f022efb 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -168,8 +168,7 @@ struct rpc_xprt { unsigned long state; /* transport state */ unsigned char shutdown : 1, /* being shut down */ nocong : 1, /* no congestion control */ - resvport : 1, /* use a reserved port */ - stream : 1; /* TCP */ + resvport : 1; /* use a reserved port */ /* * XID diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 31ef7dc7eed6..43fef7626442 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -630,8 +630,7 @@ void xprt_transmit(struct rpc_task *task) case -ENOTCONN: return; default: - if (xprt->stream) - xprt_disconnect(xprt); + break; } xprt_release_write(xprt, task); return; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index aaf053b1a0c4..5bb6fed3df34 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -356,6 +356,7 @@ static int xs_tcp_send_request(struct rpc_task *task) default: dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); + xprt_disconnect(xprt); break; } @@ -826,19 +827,17 @@ static void xs_tcp_write_space(struct sock *sk) } /** - * xs_set_buffer_size - set send and receive limits + * xs_udp_set_buffer_size - set send and receive limits * @xprt: generic transport * * Set socket send and receive limits based on the * sndsize and rcvsize fields in the generic transport - * structure. This applies only to UDP sockets. + * structure. */ -static void xs_set_buffer_size(struct rpc_xprt *xprt) +static void xs_udp_set_buffer_size(struct rpc_xprt *xprt) { struct sock *sk = xprt->inet; - if (xprt->stream) - return; if (xprt->rcvsize) { sk->sk_userlocks |= SOCK_RCVBUF_LOCK; sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2; @@ -850,6 +849,17 @@ static void xs_set_buffer_size(struct rpc_xprt *xprt) } } +/** + * xs_tcp_set_buffer_size - set send and receive limits + * @xprt: generic transport + * + * Nothing to do for TCP. + */ +static void xs_tcp_set_buffer_size(struct rpc_xprt *xprt) +{ + return; +} + static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) { struct sockaddr_in myaddr = { @@ -928,7 +938,7 @@ static void xs_udp_connect_worker(void *args) write_unlock_bh(&sk->sk_callback_lock); } - xs_set_buffer_size(xprt); + xs_udp_set_buffer_size(xprt); status = 0; out: xprt_wake_pending_tasks(xprt, status); @@ -1034,7 +1044,7 @@ static void xs_connect(struct rpc_task *task) } static struct rpc_xprt_ops xs_udp_ops = { - .set_buffer_size = xs_set_buffer_size, + .set_buffer_size = xs_udp_set_buffer_size, .connect = xs_connect, .send_request = xs_udp_send_request, .close = xs_close, @@ -1042,7 +1052,7 @@ static struct rpc_xprt_ops xs_udp_ops = { }; static struct rpc_xprt_ops xs_tcp_ops = { - .set_buffer_size = xs_set_buffer_size, + .set_buffer_size = xs_tcp_set_buffer_size, .connect = xs_connect, .send_request = xs_tcp_send_request, .close = xs_close, @@ -1074,7 +1084,6 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_UDP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = 0; - xprt->stream = 0; xprt->nocong = 0; xprt->cwnd = RPC_INITCWND; xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; @@ -1115,7 +1124,6 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_TCP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); - xprt->stream = 1; xprt->nocong = 1; xprt->cwnd = RPC_MAXCWND(xprt); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; From fe3aca290f17ae4978bd73d02aa4029f1c9c024c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:50 -0700 Subject: [PATCH 023/126] [PATCH] RPC: add API to set transport-specific timeouts Prepare the way to remove the "xprt->nocong" variable by adding a callout to the RPC client transport switch API to handle setting RPC retransmit timeouts. Add a pair of generic helper functions that provide the ability to set a simple fixed timeout, or to set a timeout based on the state of a round- trip estimator. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 ++ net/sunrpc/xprt.c | 67 +++++++++++++++++++++++++------------ net/sunrpc/xprtsock.c | 2 ++ 3 files changed, 50 insertions(+), 22 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index c9477f022efb..ac08e99a81cb 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -134,6 +134,7 @@ struct rpc_xprt_ops { void (*set_buffer_size)(struct rpc_xprt *xprt); void (*connect)(struct rpc_task *task); int (*send_request)(struct rpc_task *task); + void (*set_retrans_timeout)(struct rpc_task *task); void (*close)(struct rpc_xprt *xprt); void (*destroy)(struct rpc_xprt *xprt); }; @@ -245,6 +246,8 @@ static inline u32 *xprt_skip_transport_header(struct rpc_xprt *xprt, u32 *p) /* * Transport switch helper functions */ +void xprt_set_retrans_timeout_def(struct rpc_task *task); +void xprt_set_retrans_timeout_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); void xprt_wait_for_buffer_space(struct rpc_task *task); void xprt_write_space(struct rpc_xprt *xprt); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 43fef7626442..1ac2fbe05102 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -275,6 +275,38 @@ void xprt_write_space(struct rpc_xprt *xprt) spin_unlock_bh(&xprt->transport_lock); } +/** + * xprt_set_retrans_timeout_def - set a request's retransmit timeout + * @task: task whose timeout is to be set + * + * Set a request's retransmit timeout based on the transport's + * default timeout parameters. Used by transports that don't adjust + * the retransmit timeout based on round-trip time estimation. + */ +void xprt_set_retrans_timeout_def(struct rpc_task *task) +{ + task->tk_timeout = task->tk_rqstp->rq_timeout; +} + +/* + * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout + * @task: task whose timeout is to be set + * + * Set a request's retransmit timeout using the RTT estimator. + */ +void xprt_set_retrans_timeout_rtt(struct rpc_task *task) +{ + int timer = task->tk_msg.rpc_proc->p_timer; + struct rpc_rtt *rtt = task->tk_client->cl_rtt; + struct rpc_rqst *req = task->tk_rqstp; + unsigned long max_timeout = req->rq_xprt->timeout.to_maxval; + + task->tk_timeout = rpc_calc_rto(rtt, timer); + task->tk_timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries; + if (task->tk_timeout > max_timeout || task->tk_timeout == 0) + task->tk_timeout = max_timeout; +} + static void xprt_reset_majortimeo(struct rpc_rqst *req) { struct rpc_timeout *to = &req->rq_xprt->timeout; @@ -588,7 +620,6 @@ int xprt_prepare_transmit(struct rpc_task *task) */ void xprt_transmit(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; int status; @@ -613,8 +644,19 @@ void xprt_transmit(struct rpc_task *task) return; status = xprt->ops->send_request(task); - if (!status) - goto out_receive; + if (status == 0) { + dprintk("RPC: %4d xmit complete\n", task->tk_pid); + spin_lock_bh(&xprt->transport_lock); + xprt->ops->set_retrans_timeout(task); + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (!req->rq_received) + rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); + __xprt_release_write(xprt, task); + spin_unlock_bh(&xprt->transport_lock); + return; + } /* Note: at this point, task->tk_sleeping has not yet been set, * hence there is no danger of the waking up task being put on @@ -634,25 +676,6 @@ void xprt_transmit(struct rpc_task *task) } xprt_release_write(xprt, task); return; - out_receive: - dprintk("RPC: %4d xmit complete\n", task->tk_pid); - /* Set the task's receive timeout value */ - spin_lock_bh(&xprt->transport_lock); - if (!xprt->nocong) { - int timer = task->tk_msg.rpc_proc->p_timer; - task->tk_timeout = rpc_calc_rto(clnt->cl_rtt, timer); - task->tk_timeout <<= rpc_ntimeo(clnt->cl_rtt, timer) + req->rq_retries; - if (task->tk_timeout > xprt->timeout.to_maxval || task->tk_timeout == 0) - task->tk_timeout = xprt->timeout.to_maxval; - } else - task->tk_timeout = req->rq_timeout; - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else if (!req->rq_received) - rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); - __xprt_release_write(xprt, task); - spin_unlock_bh(&xprt->transport_lock); } static inline void do_xprt_reserve(struct rpc_task *task) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 5bb6fed3df34..79433ffd1df0 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1047,6 +1047,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .connect = xs_connect, .send_request = xs_udp_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_rtt, .close = xs_close, .destroy = xs_destroy, }; @@ -1055,6 +1056,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { .set_buffer_size = xs_tcp_set_buffer_size, .connect = xs_connect, .send_request = xs_tcp_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_close, .destroy = xs_destroy, }; From 12a804698b29d040b7cdd92e8a44b0e75164dae9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:51 -0700 Subject: [PATCH 024/126] [PATCH] RPC: expose API for serializing access to RPC transports The next several patches introduce an API that allows transports to choose whether the RPC client provides congestion control or whether the transport itself provides it. The first method we abstract is the one that serializes access to the RPC transport to prevent the bytes from different requests from mingling together. This method provides proper request serialization and the opportunity to prevent new requests from being started because the transport is congested. The normal situation is for the transport to handle congestion control itself. Although NFS over UDP was first, it has been recognized after years of experience that having the transport provide congestion control is much better than doing it in the RPC client. Thus TCP, and probably every future transport implementation, will use the default method, xprt_lock_write, provided in xprt.c, which does not provide any kind of congestion control. UDP can continue using the xprt.c-provided Van Jacobson congestion avoidance implementation. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 ++ net/sunrpc/xprt.c | 66 +++++++++++++++++++++++++++++-------- net/sunrpc/xprtsock.c | 2 ++ 3 files changed, 58 insertions(+), 13 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index ac08e99a81cb..eee1c6877851 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -132,6 +132,7 @@ struct rpc_xprt; struct rpc_xprt_ops { void (*set_buffer_size)(struct rpc_xprt *xprt); + int (*reserve_xprt)(struct rpc_task *task); void (*connect)(struct rpc_task *task); int (*send_request)(struct rpc_task *task); void (*set_retrans_timeout)(struct rpc_task *task); @@ -232,6 +233,8 @@ void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long */ void xprt_connect(struct rpc_task *task); void xprt_reserve(struct rpc_task *task); +int xprt_reserve_xprt(struct rpc_task *task); +int xprt_reserve_xprt_cong(struct rpc_task *task); int xprt_prepare_transmit(struct rpc_task *task); void xprt_transmit(struct rpc_task *task); int xprt_adjust_timeout(struct rpc_rqst *req); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1ac2fbe05102..2d1e8b83dd68 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -64,14 +64,56 @@ static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); static int xprt_clear_backlog(struct rpc_xprt *xprt); -/* - * Serialize write access to transports, in order to prevent different - * requests from interfering with each other. - * Also prevents transport connects from colliding with writes. +/** + * xprt_reserve_xprt - serialize write access to transports + * @task: task that is requesting access to the transport + * + * This prevents mixing the payload of separate requests, and prevents + * transport connects from colliding with writes. No congestion control + * is provided. */ -static int -__xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) +int xprt_reserve_xprt(struct rpc_task *task) { + struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_rqst *req = task->tk_rqstp; + + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { + if (task == xprt->snd_task) + return 1; + if (task == NULL) + return 0; + goto out_sleep; + } + xprt->snd_task = task; + if (req) { + req->rq_bytes_sent = 0; + req->rq_ntrans++; + } + return 1; + +out_sleep: + dprintk("RPC: %4d failed to lock transport %p\n", + task->tk_pid, xprt); + task->tk_timeout = 0; + task->tk_status = -EAGAIN; + if (req && req->rq_ntrans) + rpc_sleep_on(&xprt->resend, task, NULL, NULL); + else + rpc_sleep_on(&xprt->sending, task, NULL, NULL); + return 0; +} + +/* + * xprt_reserve_xprt_cong - serialize write access to transports + * @task: task that is requesting access to the transport + * + * Same as xprt_reserve_xprt, but Van Jacobson congestion control is + * integrated into the decision of whether a request is allowed to be + * woken up and given access to the transport. + */ +int xprt_reserve_xprt_cong(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req = task->tk_rqstp; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { @@ -79,7 +121,7 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) return 1; goto out_sleep; } - if (xprt->nocong || __xprt_get_cong(xprt, task)) { + if (__xprt_get_cong(xprt, task)) { xprt->snd_task = task; if (req) { req->rq_bytes_sent = 0; @@ -101,20 +143,18 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) return 0; } -static inline int -xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) +static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { int retval; spin_lock_bh(&xprt->transport_lock); - retval = __xprt_lock_write(xprt, task); + retval = xprt->ops->reserve_xprt(task); spin_unlock_bh(&xprt->transport_lock); return retval; } -static void -__xprt_lock_write_next(struct rpc_xprt *xprt) +static void __xprt_lock_write_next(struct rpc_xprt *xprt) { struct rpc_task *task; @@ -598,7 +638,7 @@ int xprt_prepare_transmit(struct rpc_task *task) err = req->rq_received; goto out_unlock; } - if (!__xprt_lock_write(xprt, task)) { + if (!xprt->ops->reserve_xprt(task)) { err = -EAGAIN; goto out_unlock; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 79433ffd1df0..fc4fbe8ea346 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1045,6 +1045,7 @@ static void xs_connect(struct rpc_task *task) static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, + .reserve_xprt = xprt_reserve_xprt_cong, .connect = xs_connect, .send_request = xs_udp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_rtt, @@ -1054,6 +1055,7 @@ static struct rpc_xprt_ops xs_udp_ops = { static struct rpc_xprt_ops xs_tcp_ops = { .set_buffer_size = xs_tcp_set_buffer_size, + .reserve_xprt = xprt_reserve_xprt, .connect = xs_connect, .send_request = xs_tcp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, From 49e9a89086b3cae784a4868ca852863e4f4ea3fe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:51 -0700 Subject: [PATCH 025/126] [PATCH] RPC: expose API for serializing access to RPC transports The next method we abstract is the one that releases a transport, allowing another task to have access to the transport. Again, one generic version of this is provided for transports that don't need the RPC client to perform congestion control, and one version is for transports that can use the original Van Jacobson implementation in xprt.c. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 ++ net/sunrpc/xprt.c | 77 ++++++++++++++++++++++++++++++------- net/sunrpc/xprtsock.c | 2 + 3 files changed, 68 insertions(+), 14 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index eee1c6877851..86833b725bb5 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -133,6 +133,7 @@ struct rpc_xprt; struct rpc_xprt_ops { void (*set_buffer_size)(struct rpc_xprt *xprt); int (*reserve_xprt)(struct rpc_task *task); + void (*release_xprt)(struct rpc_xprt *xprt, struct rpc_task *task); void (*connect)(struct rpc_task *task); int (*send_request)(struct rpc_task *task); void (*set_retrans_timeout)(struct rpc_task *task); @@ -238,6 +239,8 @@ int xprt_reserve_xprt_cong(struct rpc_task *task); int xprt_prepare_transmit(struct rpc_task *task); void xprt_transmit(struct rpc_task *task); int xprt_adjust_timeout(struct rpc_rqst *req); +void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task); +void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task); void xprt_release(struct rpc_task *task); int xprt_destroy(struct rpc_xprt *xprt); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 2d1e8b83dd68..e92ea99dd318 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -153,14 +153,42 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) return retval; } - static void __xprt_lock_write_next(struct rpc_xprt *xprt) +{ + struct rpc_task *task; + struct rpc_rqst *req; + + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) + return; + + task = rpc_wake_up_next(&xprt->resend); + if (!task) { + task = rpc_wake_up_next(&xprt->sending); + if (!task) + goto out_unlock; + } + + req = task->tk_rqstp; + xprt->snd_task = task; + if (req) { + req->rq_bytes_sent = 0; + req->rq_ntrans++; + } + return; + +out_unlock: + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->state); + smp_mb__after_clear_bit(); +} + +static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) { struct rpc_task *task; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - if (!xprt->nocong && RPCXPRT_CONGESTED(xprt)) + if (RPCXPRT_CONGESTED(xprt)) goto out_unlock; task = rpc_wake_up_next(&xprt->resend); if (!task) { @@ -168,7 +196,7 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt) if (!task) goto out_unlock; } - if (xprt->nocong || __xprt_get_cong(xprt, task)) { + if (__xprt_get_cong(xprt, task)) { struct rpc_rqst *req = task->tk_rqstp; xprt->snd_task = task; if (req) { @@ -183,11 +211,14 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt) smp_mb__after_clear_bit(); } -/* - * Releases the transport for use by other requests. +/** + * xprt_release_xprt - allow other requests to use a transport + * @xprt: transport with other tasks potentially waiting + * @task: task that is releasing access to the transport + * + * Note that "task" can be NULL. No congestion control is provided. */ -static void -__xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { xprt->snd_task = NULL; @@ -198,11 +229,29 @@ __xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) } } -static inline void -xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +/** + * xprt_release_xprt_cong - allow other requests to use a transport + * @xprt: transport with other tasks potentially waiting + * @task: task that is releasing access to the transport + * + * Note that "task" can be NULL. Another task is awoken to use the + * transport if the transport's congestion window allows it. + */ +void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) +{ + if (xprt->snd_task == task) { + xprt->snd_task = NULL; + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->state); + smp_mb__after_clear_bit(); + __xprt_lock_write_next_cong(xprt); + } +} + +static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { spin_lock_bh(&xprt->transport_lock); - __xprt_release_write(xprt, task); + xprt->ops->release_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); } @@ -237,7 +286,7 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) return; req->rq_cong = 0; xprt->cong -= RPC_CWNDSCALE; - __xprt_lock_write_next(xprt); + __xprt_lock_write_next_cong(xprt); } /* @@ -256,7 +305,7 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd; if (cwnd > RPC_MAXCWND(xprt)) cwnd = RPC_MAXCWND(xprt); - __xprt_lock_write_next(xprt); + __xprt_lock_write_next_cong(xprt); } else if (result == -ETIMEDOUT) { cwnd >>= 1; if (cwnd < RPC_CWNDSCALE) @@ -693,7 +742,7 @@ void xprt_transmit(struct rpc_task *task) task->tk_status = -ENOTCONN; else if (!req->rq_received) rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); - __xprt_release_write(xprt, task); + xprt->ops->release_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); return; } @@ -792,7 +841,7 @@ void xprt_release(struct rpc_task *task) if (!(req = task->tk_rqstp)) return; spin_lock_bh(&xprt->transport_lock); - __xprt_release_write(xprt, task); + xprt->ops->release_xprt(xprt, task); __xprt_put_cong(xprt, req); if (!list_empty(&req->rq_list)) list_del(&req->rq_list); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index fc4fbe8ea346..8589c1ad55e3 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1046,6 +1046,7 @@ static void xs_connect(struct rpc_task *task) static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, .connect = xs_connect, .send_request = xs_udp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_rtt, @@ -1056,6 +1057,7 @@ static struct rpc_xprt_ops xs_udp_ops = { static struct rpc_xprt_ops xs_tcp_ops = { .set_buffer_size = xs_tcp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt, + .release_xprt = xprt_release_xprt, .connect = xs_connect, .send_request = xs_tcp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, From 46c0ee8bc4ad3743de05e8b8b20201df44dcb6d3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:52 -0700 Subject: [PATCH 026/126] [PATCH] RPC: separate xprt_timer implementations Allow transports to hook the retransmit timer interrupt. Some transports calculate their congestion window here so that a retransmit timeout has immediate effect on the congestion window. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 ++ net/sunrpc/xprt.c | 45 +++++++++++++++++-------------------- net/sunrpc/xprtsock.c | 12 ++++++++++ 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 86833b725bb5..443c3f984cf9 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -137,6 +137,7 @@ struct rpc_xprt_ops { void (*connect)(struct rpc_task *task); int (*send_request)(struct rpc_task *task); void (*set_retrans_timeout)(struct rpc_task *task); + void (*timer)(struct rpc_task *task); void (*close)(struct rpc_xprt *xprt); void (*destroy)(struct rpc_xprt *xprt); }; @@ -257,6 +258,7 @@ void xprt_set_retrans_timeout_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); void xprt_wait_for_buffer_space(struct rpc_task *task); void xprt_write_space(struct rpc_xprt *xprt); +void xprt_adjust_cwnd(struct rpc_task *task, int result); struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid); void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied); void xprt_disconnect(struct rpc_xprt *xprt); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e92ea99dd318..ffc595592af3 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -289,16 +289,19 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) __xprt_lock_write_next_cong(xprt); } -/* - * Adjust RPC congestion window +/** + * xprt_adjust_cwnd - adjust transport congestion window + * @task: recently completed RPC request used to adjust window + * @result: result code of completed RPC request + * * We use a time-smoothed congestion estimator to avoid heavy oscillation. */ -static void -xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) +void xprt_adjust_cwnd(struct rpc_task *task, int result) { - unsigned long cwnd; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = task->tk_xprt; + unsigned long cwnd = xprt->cwnd; - cwnd = xprt->cwnd; if (result >= 0 && cwnd <= xprt->cong) { /* The (cwnd >> 1) term makes sure * the result gets rounded properly. */ @@ -314,6 +317,7 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) dprintk("RPC: cong %ld, cwnd was %ld, now %ld\n", xprt->cong, xprt->cwnd, cwnd); xprt->cwnd = cwnd; + __xprt_put_cong(xprt, req); } /** @@ -602,8 +606,7 @@ void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) /* Adjust congestion window */ if (!xprt->nocong) { unsigned timer = task->tk_msg.rpc_proc->p_timer; - xprt_adjust_cwnd(xprt, copied); - __xprt_put_cong(xprt, req); + xprt_adjust_cwnd(task, copied); if (timer) { if (req->rq_ntrans == 1) rpc_update_rtt(clnt->cl_rtt, timer, @@ -640,27 +643,19 @@ void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) return; } -/* - * RPC receive timeout handler. - */ -static void -xprt_timer(struct rpc_task *task) +static void xprt_timer(struct rpc_task *task) { - struct rpc_rqst *req = task->tk_rqstp; + struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; + dprintk("RPC: %4d xprt_timer\n", task->tk_pid); + spin_lock(&xprt->transport_lock); - if (req->rq_received) - goto out; - - xprt_adjust_cwnd(req->rq_xprt, -ETIMEDOUT); - __xprt_put_cong(xprt, req); - - dprintk("RPC: %4d xprt_timer (%s request)\n", - task->tk_pid, req ? "pending" : "backlogged"); - - task->tk_status = -ETIMEDOUT; -out: + if (!req->rq_received) { + if (xprt->ops->timer) + xprt->ops->timer(task); + task->tk_status = -ETIMEDOUT; + } task->tk_timeout = 0; rpc_wake_up_task(task); spin_unlock(&xprt->transport_lock); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 8589c1ad55e3..c3658ff027a6 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -860,6 +860,17 @@ static void xs_tcp_set_buffer_size(struct rpc_xprt *xprt) return; } +/** + * xs_udp_timer - called when a retransmit timeout occurs on a UDP transport + * @task: task that timed out + * + * Adjust the congestion window after a retransmit timeout has occurred. + */ +static void xs_udp_timer(struct rpc_task *task) +{ + xprt_adjust_cwnd(task, -ETIMEDOUT); +} + static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) { struct sockaddr_in myaddr = { @@ -1050,6 +1061,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .connect = xs_connect, .send_request = xs_udp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_rtt, + .timer = xs_udp_timer, .close = xs_close, .destroy = xs_destroy, }; From 1570c1e41eabf6b7031f3e4322a2cf1cbe319fee Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:52 -0700 Subject: [PATCH 027/126] [PATCH] RPC: add generic interface for adjusting the congestion window A new interface that allows transports to adjust their congestion window using the Van Jacobson implementation in xprt.c is provided. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 +- net/sunrpc/xprt.c | 62 ++++++++++++++----------------------- net/sunrpc/xprtsock.c | 13 +++----- 3 files changed, 31 insertions(+), 47 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 443c3f984cf9..2e48752d55d9 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -258,9 +258,10 @@ void xprt_set_retrans_timeout_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); void xprt_wait_for_buffer_space(struct rpc_task *task); void xprt_write_space(struct rpc_xprt *xprt); +void xprt_update_rtt(struct rpc_task *task); void xprt_adjust_cwnd(struct rpc_task *task, int result); struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid); -void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied); +void xprt_complete_rqst(struct rpc_task *task, int copied); void xprt_disconnect(struct rpc_xprt *xprt); /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ffc595592af3..707806fe1a23 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -592,55 +592,41 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) } /** - * xprt_complete_rqst - called when reply processing is complete - * @xprt: controlling transport - * @req: RPC request that just completed - * @copied: actual number of bytes received from the transport + * xprt_update_rtt - update an RPC client's RTT state after receiving a reply + * @task: RPC request that recently completed * */ -void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) +void xprt_update_rtt(struct rpc_task *task) { - struct rpc_task *task = req->rq_task; - struct rpc_clnt *clnt = task->tk_client; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_rtt *rtt = task->tk_client->cl_rtt; + unsigned timer = task->tk_msg.rpc_proc->p_timer; - /* Adjust congestion window */ - if (!xprt->nocong) { - unsigned timer = task->tk_msg.rpc_proc->p_timer; - xprt_adjust_cwnd(task, copied); - if (timer) { - if (req->rq_ntrans == 1) - rpc_update_rtt(clnt->cl_rtt, timer, - (long)jiffies - req->rq_xtime); - rpc_set_timeo(clnt->cl_rtt, timer, req->rq_ntrans - 1); - } + if (timer) { + if (req->rq_ntrans == 1) + rpc_update_rtt(rtt, timer, + (long)jiffies - req->rq_xtime); + rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); } +} -#ifdef RPC_PROFILE - /* Profile only reads for now */ - if (copied > 1024) { - static unsigned long nextstat; - static unsigned long pkt_rtt, pkt_len, pkt_cnt; +/** + * xprt_complete_rqst - called when reply processing is complete + * @task: RPC request that recently completed + * @copied: actual number of bytes received from the transport + * + * Caller holds transport lock. + */ +void xprt_complete_rqst(struct rpc_task *task, int copied) +{ + struct rpc_rqst *req = task->tk_rqstp; - pkt_cnt++; - pkt_len += req->rq_slen + copied; - pkt_rtt += jiffies - req->rq_xtime; - if (time_before(nextstat, jiffies)) { - printk("RPC: %lu %ld cwnd\n", jiffies, xprt->cwnd); - printk("RPC: %ld %ld %ld %ld stat\n", - jiffies, pkt_cnt, pkt_len, pkt_rtt); - pkt_rtt = pkt_len = pkt_cnt = 0; - nextstat = jiffies + 5 * HZ; - } - } -#endif + dprintk("RPC: %5u xid %08x complete (%d bytes received)\n", + task->tk_pid, ntohl(req->rq_xid), copied); - dprintk("RPC: %4d has input (%d bytes)\n", task->tk_pid, copied); list_del_init(&req->rq_list); req->rq_received = req->rq_private_buf.len = copied; - - /* ... and wake up the process. */ rpc_wake_up_task(task); - return; } static void xprt_timer(struct rpc_task *task) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c3658ff027a6..980f26504f48 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -460,8 +460,6 @@ static void xs_udp_data_ready(struct sock *sk, int len) goto out_unlock; task = rovr->rq_task; - dprintk("RPC: %4d received reply\n", task->tk_pid); - if ((copied = rovr->rq_private_buf.buflen) > repsize) copied = repsize; @@ -472,7 +470,9 @@ static void xs_udp_data_ready(struct sock *sk, int len) /* Something worked... */ dst_confirm(skb->dst); - xprt_complete_rqst(xprt, rovr, copied); + xprt_adjust_cwnd(task, copied); + xprt_update_rtt(task); + xprt_complete_rqst(task, copied); out_unlock: spin_unlock(&xprt->transport_lock); @@ -634,11 +634,8 @@ static inline void xs_tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc } out: - if (!(xprt->tcp_flags & XPRT_COPY_DATA)) { - dprintk("RPC: %4d received reply complete\n", - req->rq_task->tk_pid); - xprt_complete_rqst(xprt, req, xprt->tcp_copied); - } + if (!(xprt->tcp_flags & XPRT_COPY_DATA)) + xprt_complete_rqst(req->rq_task, xprt->tcp_copied); spin_unlock(&xprt->transport_lock); xs_tcp_check_recm(xprt); } From a58dd398f5db4f73d5c581069fd70a4304cc4f0a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:53 -0700 Subject: [PATCH 028/126] [PATCH] RPC: add a release_rqst callout to the RPC transport switch The final place where congestion control state is adjusted is in xprt_release, where each request is finally released. Add a callout there to allow transports to perform additional processing when a request is about to be released. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 ++ net/sunrpc/xprt.c | 14 +++++++++++++- net/sunrpc/xprtsock.c | 1 + 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 2e48752d55d9..64e77658fa30 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -138,6 +138,7 @@ struct rpc_xprt_ops { int (*send_request)(struct rpc_task *task); void (*set_retrans_timeout)(struct rpc_task *task); void (*timer)(struct rpc_task *task); + void (*release_request)(struct rpc_task *task); void (*close)(struct rpc_xprt *xprt); void (*destroy)(struct rpc_xprt *xprt); }; @@ -262,6 +263,7 @@ void xprt_update_rtt(struct rpc_task *task); void xprt_adjust_cwnd(struct rpc_task *task, int result); struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid); void xprt_complete_rqst(struct rpc_task *task, int copied); +void xprt_release_rqst_cong(struct rpc_task *task); void xprt_disconnect(struct rpc_xprt *xprt); /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 707806fe1a23..e8d11bd6158e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -289,6 +289,17 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) __xprt_lock_write_next_cong(xprt); } +/** + * xprt_release_rqst_cong - housekeeping when request is complete + * @task: RPC request that recently completed + * + * Useful for transports that require congestion control. + */ +void xprt_release_rqst_cong(struct rpc_task *task) +{ + __xprt_put_cong(task->tk_xprt, task->tk_rqstp); +} + /** * xprt_adjust_cwnd - adjust transport congestion window * @task: recently completed RPC request used to adjust window @@ -823,7 +834,8 @@ void xprt_release(struct rpc_task *task) return; spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); - __xprt_put_cong(xprt, req); + if (xprt->ops->release_request) + xprt->ops->release_request(task); if (!list_empty(&req->rq_list)) list_del(&req->rq_list); xprt->last_used = jiffies; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 980f26504f48..6c2f5dcea416 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1059,6 +1059,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .send_request = xs_udp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_rtt, .timer = xs_udp_timer, + .release_request = xprt_release_rqst_cong, .close = xs_close, .destroy = xs_destroy, }; From ed63c003701a314c4893c11eceb9d68f8f46c662 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:53 -0700 Subject: [PATCH 029/126] [PATCH] RPC: remove xprt->nocong Get rid of the "xprt->nocong" variable. Test-plan: Use WAN simulation to cause sporadic bursty packet loss with UDP mounts. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 1 - include/linux/sunrpc/xprt.h | 1 - net/sunrpc/xprtsock.c | 2 -- 3 files changed, 4 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 7901f5b8092c..c4c8601096e0 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -188,7 +188,6 @@ nlm_bind_host(struct nlm_host *host) goto forgetit; xprt_set_timeout(&xprt->timeout, 5, nlmsvc_timeout); - xprt->nocong = 1; /* No congestion control for NLM */ xprt->resvport = 1; /* NLM requires a reserved port */ /* Existing NLM servers accept AUTH_UNIX only */ diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 64e77658fa30..559fb471f6f2 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -172,7 +172,6 @@ struct rpc_xprt { unsigned int max_reqs; /* total slots */ unsigned long state; /* transport state */ unsigned char shutdown : 1, /* being shut down */ - nocong : 1, /* no congestion control */ resvport : 1; /* use a reserved port */ /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 6c2f5dcea416..7e5e020fe78d 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1100,7 +1100,6 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_UDP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = 0; - xprt->nocong = 0; xprt->cwnd = RPC_INITCWND; xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; /* XXX: header size can vary due to auth type, IPv6, etc. */ @@ -1140,7 +1139,6 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_TCP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); - xprt->nocong = 1; xprt->cwnd = RPC_MAXCWND(xprt); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; From 555ee3af161b037865793bd4bebc06b58daafde6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:54 -0700 Subject: [PATCH 030/126] [PATCH] RPC: clean up after nocong was removed Clean-up: Move some macros that are specific to the Van Jacobson implementation into xprt.c. Get rid of the cong_wait field in rpc_xprt, which is no longer used. Get rid of xprt_clear_backlog. Test-plan: Compile with CONFIG_NFS enabled. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 22 ---------------------- net/sunrpc/xprt.c | 29 +++++++++++++++++++---------- net/sunrpc/xprtsock.c | 2 -- 3 files changed, 19 insertions(+), 34 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 559fb471f6f2..dcf0326bda01 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -15,20 +15,6 @@ #include #include -/* - * The transport code maintains an estimate on the maximum number of out- - * standing RPC requests, using a smoothed version of the congestion - * avoidance implemented in 44BSD. This is basically the Van Jacobson - * congestion algorithm: If a retransmit occurs, the congestion window is - * halved; otherwise, it is incremented by 1/cwnd when - * - * - a reply is received and - * - a full number of requests are outstanding and - * - the congestion window hasn't been updated recently. - * - * Upper procedures may check whether a request would block waiting for - * a free RPC slot by using the RPC_CONGESTED() macro. - */ extern unsigned int xprt_udp_slot_table_entries; extern unsigned int xprt_tcp_slot_table_entries; @@ -36,12 +22,6 @@ extern unsigned int xprt_tcp_slot_table_entries; #define RPC_DEF_SLOT_TABLE (16U) #define RPC_MAX_SLOT_TABLE (128U) -#define RPC_CWNDSHIFT (8U) -#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) -#define RPC_INITCWND RPC_CWNDSCALE -#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) -#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) - /* Default timeout values */ #define RPC_MAX_UDP_TIMEOUT (60*HZ) #define RPC_MAX_TCP_TIMEOUT (600*HZ) @@ -213,8 +193,6 @@ struct rpc_xprt { void (*old_data_ready)(struct sock *, int); void (*old_state_change)(struct sock *); void (*old_write_space)(struct sock *); - - wait_queue_head_t cong_wait; }; #define XPRT_LAST_FRAG (1 << 0) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e8d11bd6158e..0458319a1bdd 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -62,7 +62,23 @@ static inline void do_xprt_reserve(struct rpc_task *); static void xprt_connect_status(struct rpc_task *task); static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); -static int xprt_clear_backlog(struct rpc_xprt *xprt); +/* + * The transport code maintains an estimate on the maximum number of out- + * standing RPC requests, using a smoothed version of the congestion + * avoidance implemented in 44BSD. This is basically the Van Jacobson + * congestion algorithm: If a retransmit occurs, the congestion window is + * halved; otherwise, it is incremented by 1/cwnd when + * + * - a reply is received and + * - a full number of requests are outstanding and + * - the congestion window hasn't been updated recently. + */ +#define RPC_CWNDSHIFT (8U) +#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) +#define RPC_INITCWND RPC_CWNDSCALE +#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) + +#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) /** * xprt_reserve_xprt - serialize write access to transports @@ -850,7 +866,7 @@ void xprt_release(struct rpc_task *task) spin_lock(&xprt->reserve_lock); list_add(&req->rq_list, &xprt->free); - xprt_clear_backlog(xprt); + rpc_wake_up_next(&xprt->backlog); spin_unlock(&xprt->reserve_lock); } @@ -902,7 +918,6 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->reserve_lock); - init_waitqueue_head(&xprt->cong_wait); INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); @@ -911,6 +926,7 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc xprt->timer.function = xprt_init_autodisconnect; xprt->timer.data = (unsigned long) xprt; xprt->last_used = jiffies; + xprt->cwnd = RPC_INITCWND; rpc_init_wait_queue(&xprt->pending, "xprt_pending"); rpc_init_wait_queue(&xprt->sending, "xprt_sending"); @@ -955,16 +971,9 @@ static void xprt_shutdown(struct rpc_xprt *xprt) rpc_wake_up(&xprt->resend); xprt_wake_pending_tasks(xprt, -EIO); rpc_wake_up(&xprt->backlog); - wake_up(&xprt->cong_wait); del_timer_sync(&xprt->timer); } -static int xprt_clear_backlog(struct rpc_xprt *xprt) { - rpc_wake_up_next(&xprt->backlog); - wake_up(&xprt->cong_wait); - return 1; -} - /** * xprt_destroy - destroy an RPC transport, killing off all requests. * @xprt: transport to destroy diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7e5e020fe78d..26402c063f00 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1100,7 +1100,6 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_UDP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = 0; - xprt->cwnd = RPC_INITCWND; xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); @@ -1139,7 +1138,6 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_TCP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); - xprt->cwnd = RPC_MAXCWND(xprt); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; From 529b33c6db0120126b1381faa51406dc463acdc9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:54 -0700 Subject: [PATCH 031/126] [PATCH] RPC: allow RPC client's port range to be adjustable Select an RPC client source port between 650 and 1023 instead of between 1 and 800. The old range conflicts with a number of network services. Provide sysctls to allow admins to select a different port range. Note that this doesn't affect user-level RPC library behavior, which still uses 1 to 800. Based on a suggestion by Olaf Kirch . Test-plan: Repeated mount and unmount. Destructive testing. Idle timeouts. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/debug.h | 2 ++ include/linux/sunrpc/xprt.h | 17 ++++++++++++++--- net/sunrpc/sysctl.c | 29 +++++++++++++++++++++++++++++ net/sunrpc/xprtsock.c | 23 ++++++++--------------- 4 files changed, 53 insertions(+), 18 deletions(-) diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 42d299747956..1a42d902bc11 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -95,6 +95,8 @@ enum { CTL_NLMDEBUG, CTL_SLOTTABLE_UDP, CTL_SLOTTABLE_TCP, + CTL_MIN_RESVPORT, + CTL_MAX_RESVPORT, }; #endif /* _LINUX_SUNRPC_DEBUG_H_ */ diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index dcf0326bda01..9d9266cf8a36 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -51,6 +51,17 @@ extern unsigned int xprt_tcp_slot_table_entries; #define RPC_CALLHDRSIZE 6 #define RPC_REPHDRSIZE 4 +/* + * Parameters for choosing a free port + */ +extern unsigned int xprt_min_resvport; +extern unsigned int xprt_max_resvport; + +#define RPC_MIN_RESVPORT (1U) +#define RPC_MAX_RESVPORT (65535U) +#define RPC_DEF_MIN_RESVPORT (650U) +#define RPC_DEF_MAX_RESVPORT (1023U) + /* * This describes a timeout strategy */ @@ -62,6 +73,9 @@ struct rpc_timeout { unsigned char to_exponential; }; +struct rpc_task; +struct rpc_xprt; + /* * This describes a complete RPC request */ @@ -107,9 +121,6 @@ struct rpc_rqst { #define rq_svec rq_snd_buf.head #define rq_slen rq_snd_buf.len -struct rpc_task; -struct rpc_xprt; - struct rpc_xprt_ops { void (*set_buffer_size)(struct rpc_xprt *xprt); int (*reserve_xprt)(struct rpc_task *task); diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index ef483262f17f..d0c9f460e411 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -121,9 +121,16 @@ proc_dodebug(ctl_table *table, int write, struct file *file, unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; +unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; +EXPORT_SYMBOL(xprt_min_resvport); +unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; +EXPORT_SYMBOL(xprt_max_resvport); + static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; +static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT; +static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT; static ctl_table debug_table[] = { { @@ -180,6 +187,28 @@ static ctl_table debug_table[] = { .extra1 = &min_slot_table_size, .extra2 = &max_slot_table_size }, + { + .ctl_name = CTL_MIN_RESVPORT, + .procname = "min_resvport", + .data = &xprt_min_resvport, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xprt_min_resvport_limit, + .extra2 = &xprt_max_resvport_limit + }, + { + .ctl_name = CTL_MAX_RESVPORT, + .procname = "max_resvport", + .data = &xprt_max_resvport, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xprt_min_resvport_limit, + .extra2 = &xprt_max_resvport_limit + }, { .ctl_name = 0 } }; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 26402c063f00..62c2e7caa345 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -35,11 +35,6 @@ #include #include -/* - * Maximum port number to use when requesting a reserved port. - */ -#define XS_MAX_RESVPORT (800U) - /* * How many times to try sending a request on a socket before waiting * for the socket buffer to clear. @@ -873,10 +868,9 @@ static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) struct sockaddr_in myaddr = { .sin_family = AF_INET, }; - int err, port; + int err; + unsigned short port = xprt->port; - /* Were we already bound to a given port? Try to reuse it */ - port = xprt->port; do { myaddr.sin_port = htons(port); err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, @@ -887,8 +881,10 @@ static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) port); return 0; } - if (--port == 0) - port = XS_MAX_RESVPORT; + if (port <= xprt_min_resvport) + port = xprt_max_resvport; + else + port--; } while (err == -EADDRINUSE && port != xprt->port); dprintk("RPC: can't bind to reserved port (%d).\n", -err); @@ -1075,9 +1071,6 @@ static struct rpc_xprt_ops xs_tcp_ops = { .destroy = xs_destroy, }; -extern unsigned int xprt_udp_slot_table_entries; -extern unsigned int xprt_tcp_slot_table_entries; - /** * xs_setup_udp - Set up transport to use a UDP socket * @xprt: transport to set up @@ -1098,7 +1091,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) memset(xprt->slot, 0, slot_table_size); xprt->prot = IPPROTO_UDP; - xprt->port = XS_MAX_RESVPORT; + xprt->port = xprt_max_resvport; xprt->tsh_size = 0; xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; /* XXX: header size can vary due to auth type, IPv6, etc. */ @@ -1136,7 +1129,7 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) memset(xprt->slot, 0, slot_table_size); xprt->prot = IPPROTO_TCP; - xprt->port = XS_MAX_RESVPORT; + xprt->port = xprt_max_resvport; xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; From 3167e12c0c424f3c323944701615343022d86418 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:55 -0700 Subject: [PATCH 032/126] [PATCH] RPC: make sure to get the same local port number when reconnecting Implement a best practice: if the remote end drops our connection, try to reconnect using the same port number. This is important because the NFS server's Duplicate Reply Cache often hashes on the source port number. If the client reuses the port number when it reconnects, the server's DRC will be more effective. Based on suggestions by Mike Eisler, Olaf Kirch, and Alexey Kuznetsky. Test-plan: Destructive testing. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 65 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 12 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 62c2e7caa345..88ac71fcd335 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -362,6 +362,8 @@ static int xs_tcp_send_request(struct rpc_task *task) * xs_close - close a socket * @xprt: transport * + * This is used when all requests are complete; ie, no DRC state remains + * on the server we want to save. */ static void xs_close(struct rpc_xprt *xprt) { @@ -949,6 +951,30 @@ static void xs_udp_connect_worker(void *args) xprt_clear_connecting(xprt); } +/* + * We need to preserve the port number so the reply cache on the server can + * find our cached RPC replies when we get around to reconnecting. + */ +static void xs_tcp_reuse_connection(struct rpc_xprt *xprt) +{ + int result; + struct socket *sock = xprt->sock; + struct sockaddr any; + + dprintk("RPC: disconnecting xprt %p to reuse port\n", xprt); + + /* + * Disconnect the transport socket by doing a connect operation + * with AF_UNSPEC. This should return immediately... + */ + memset(&any, 0, sizeof(any)); + any.sa_family = AF_UNSPEC; + result = sock->ops->connect(sock, &any, sizeof(any), 0); + if (result) + dprintk("RPC: AF_UNSPEC connect return code %d\n", + result); +} + /** * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint * @args: RPC transport to connect @@ -966,18 +992,20 @@ static void xs_tcp_connect_worker(void *args) dprintk("RPC: xs_tcp_connect_worker for xprt %p\n", xprt); - /* Start by resetting any existing socket state */ - xs_close(xprt); + if (!xprt->sock) { + /* start from scratch */ + if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { + dprintk("RPC: can't create TCP transport socket (%d).\n", -err); + goto out; + } - if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { - dprintk("RPC: can't create TCP transport socket (%d).\n", -err); - goto out; - } - - if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) { - sock_release(sock); - goto out; - } + if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) { + sock_release(sock); + goto out; + } + } else + /* "close" the socket, preserving the local port */ + xs_tcp_reuse_connection(xprt); if (!xprt->inet) { struct sock *sk = sock->sk; @@ -991,7 +1019,12 @@ static void xs_tcp_connect_worker(void *args) sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; - tcp_sk(sk)->nonagle = 1; + + /* socket options */ + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + sock_reset_flag(sk, SOCK_LINGER); + tcp_sk(sk)->linger2 = 0; + tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; xprt_clear_connected(xprt); @@ -1012,6 +1045,14 @@ static void xs_tcp_connect_worker(void *args) case -EINPROGRESS: case -EALREADY: goto out_clear; + case -ECONNREFUSED: + case -ECONNRESET: + /* retry with existing socket, after a delay */ + break; + default: + /* get rid of existing socket, and retry */ + xs_close(xprt); + break; } } out: From 03bf4b707eee06706c9db343dd5c905b7ee47ed2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:55 -0700 Subject: [PATCH 033/126] [PATCH] RPC: parametrize various transport connect timeouts Each transport implementation can now set unique bind, connect, reestablishment, and idle timeout values. These are variables, allowing the values to be modified dynamically. This permits exponential backoff of any of these values, for instance. As an example, we implement exponential backoff for the connection reestablishment timeout. Test-plan: Destructive testing (unplugging the network temporarily). Connectathon with UDP and TCP. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 10 +++--- include/linux/nfs_fs.h | 4 +++ include/linux/sunrpc/xprt.h | 29 ++++------------ net/sunrpc/clnt.c | 2 +- net/sunrpc/xprt.c | 5 ++- net/sunrpc/xprtsock.c | 68 +++++++++++++++++++++++++++++++++++-- 6 files changed, 84 insertions(+), 34 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b6a1ca508e60..062911e7ceb5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -369,8 +369,8 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned case IPPROTO_TCP: if (!to->to_initval) to->to_initval = 60 * HZ; - if (to->to_initval > RPC_MAX_TCP_TIMEOUT) - to->to_initval = RPC_MAX_TCP_TIMEOUT; + if (to->to_initval > NFS_MAX_TCP_TIMEOUT) + to->to_initval = NFS_MAX_TCP_TIMEOUT; to->to_increment = to->to_initval; to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); to->to_exponential = 0; @@ -379,9 +379,9 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned default: if (!to->to_initval) to->to_initval = 11 * HZ / 10; - if (to->to_initval > RPC_MAX_UDP_TIMEOUT) - to->to_initval = RPC_MAX_UDP_TIMEOUT; - to->to_maxval = RPC_MAX_UDP_TIMEOUT; + if (to->to_initval > NFS_MAX_UDP_TIMEOUT) + to->to_initval = NFS_MAX_UDP_TIMEOUT; + to->to_maxval = NFS_MAX_UDP_TIMEOUT; to->to_exponential = 1; break; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 9a6047ff1b25..7bac2785c6e4 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -41,6 +41,10 @@ #define NFS_MAX_FILE_IO_BUFFER_SIZE 32768 #define NFS_DEF_FILE_IO_BUFFER_SIZE 4096 +/* Default timeout values */ +#define NFS_MAX_UDP_TIMEOUT (60*HZ) +#define NFS_MAX_TCP_TIMEOUT (600*HZ) + /* * superblock magic number for NFS */ diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 9d9266cf8a36..2543adf18551 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -22,28 +22,6 @@ extern unsigned int xprt_tcp_slot_table_entries; #define RPC_DEF_SLOT_TABLE (16U) #define RPC_MAX_SLOT_TABLE (128U) -/* Default timeout values */ -#define RPC_MAX_UDP_TIMEOUT (60*HZ) -#define RPC_MAX_TCP_TIMEOUT (600*HZ) - -/* - * Wait duration for an RPC TCP connection to be established. Solaris - * NFS over TCP uses 60 seconds, for example, which is in line with how - * long a server takes to reboot. - */ -#define RPC_CONNECT_TIMEOUT (60*HZ) - -/* - * Delay an arbitrary number of seconds before attempting to reconnect - * after an error. - */ -#define RPC_REESTABLISH_TIMEOUT (15*HZ) - -/* - * RPC transport idle timeout. - */ -#define RPC_IDLE_DISCONNECT_TIMEOUT (5*60*HZ) - /* * RPC call and reply header size as number of 32bit words (verifier * size computed separately) @@ -182,14 +160,19 @@ struct rpc_xprt { /* * Connection of transports */ + unsigned long connect_timeout, + bind_timeout, + reestablish_timeout; struct work_struct connect_worker; unsigned short port; + /* * Disconnection of idle transports */ struct work_struct task_cleanup; struct timer_list timer; - unsigned long last_used; + unsigned long last_used, + idle_timeout; /* * Send stuff diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index cc1b773a79d3..24b44e73f391 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -740,7 +740,7 @@ call_bind(struct rpc_task *task) task->tk_action = call_connect; if (!clnt->cl_port) { task->tk_action = call_bind_status; - task->tk_timeout = RPC_CONNECT_TIMEOUT; + task->tk_timeout = task->tk_xprt->bind_timeout; rpc_getport(task, clnt); } } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 0458319a1bdd..215be0d0ef6b 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -551,7 +551,7 @@ void xprt_connect(struct rpc_task *task) if (task->tk_rqstp) task->tk_rqstp->rq_bytes_sent = 0; - task->tk_timeout = RPC_CONNECT_TIMEOUT; + task->tk_timeout = xprt->connect_timeout; rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL); xprt->ops->connect(task); } @@ -763,7 +763,6 @@ void xprt_transmit(struct rpc_task *task) switch (status) { case -ECONNREFUSED: - task->tk_timeout = RPC_REESTABLISH_TIMEOUT; rpc_sleep_on(&xprt->sending, task, NULL, NULL); case -EAGAIN: case -ENOTCONN: @@ -857,7 +856,7 @@ void xprt_release(struct rpc_task *task) xprt->last_used = jiffies; if (list_empty(&xprt->recv) && !xprt->shutdown) mod_timer(&xprt->timer, - xprt->last_used + RPC_IDLE_DISCONNECT_TIMEOUT); + xprt->last_used + xprt->idle_timeout); spin_unlock_bh(&xprt->transport_lock); task->tk_rqstp = NULL; memset(req, 0, sizeof(*req)); /* mark unused */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 88ac71fcd335..06c2d95484e0 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -41,6 +41,50 @@ */ #define XS_SENDMSG_RETRY (10U) +/* + * Time out for an RPC UDP socket connect. UDP socket connects are + * synchronous, but we set a timeout anyway in case of resource + * exhaustion on the local host. + */ +#define XS_UDP_CONN_TO (5U * HZ) + +/* + * Wait duration for an RPC TCP connection to be established. Solaris + * NFS over TCP uses 60 seconds, for example, which is in line with how + * long a server takes to reboot. + */ +#define XS_TCP_CONN_TO (60U * HZ) + +/* + * Wait duration for a reply from the RPC portmapper. + */ +#define XS_BIND_TO (60U * HZ) + +/* + * Delay if a UDP socket connect error occurs. This is most likely some + * kind of resource problem on the local host. + */ +#define XS_UDP_REEST_TO (2U * HZ) + +/* + * The reestablish timeout allows clients to delay for a bit before attempting + * to reconnect to a server that just dropped our connection. + * + * We implement an exponential backoff when trying to reestablish a TCP + * transport connection with the server. Some servers like to drop a TCP + * connection when they are overworked, so we start with a short timeout and + * increase over time if the server is down or not responding. + */ +#define XS_TCP_INIT_REEST_TO (3U * HZ) +#define XS_TCP_MAX_REEST_TO (5U * 60 * HZ) + +/* + * TCP idle timeout; client drops the transport socket if it is idle + * for this long. Note that we also timeout UDP sockets to prevent + * holding port numbers when there is no RPC traffic. + */ +#define XS_IDLE_DISC_TO (5U * 60 * HZ) + #ifdef RPC_DEBUG # undef RPC_DEBUG_DATA # define RPCDBG_FACILITY RPCDBG_TRANS @@ -739,6 +783,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->tcp_reclen = 0; xprt->tcp_copied = 0; xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; xprt_wake_pending_tasks(xprt, 0); } spin_unlock_bh(&xprt->transport_lock); @@ -1066,6 +1111,13 @@ static void xs_tcp_connect_worker(void *args) * @task: address of RPC task that manages state of connect request * * TCP: If the remote end dropped the connection, delay reconnecting. + * + * UDP socket connects are synchronous, but we use a work queue anyway + * to guarantee that even unprivileged user processes can set up a + * socket on a privileged port. + * + * If a UDP socket connect fails, the delay behavior here prevents + * retry floods (hard mounts). */ static void xs_connect(struct rpc_task *task) { @@ -1075,9 +1127,13 @@ static void xs_connect(struct rpc_task *task) return; if (xprt->sock != NULL) { - dprintk("RPC: xs_connect delayed xprt %p\n", xprt); + dprintk("RPC: xs_connect delayed xprt %p for %lu seconds\n", + xprt, xprt->reestablish_timeout / HZ); schedule_delayed_work(&xprt->connect_worker, - RPC_REESTABLISH_TIMEOUT); + xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO) + xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; } else { dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); schedule_work(&xprt->connect_worker); @@ -1139,6 +1195,10 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); INIT_WORK(&xprt->connect_worker, xs_udp_connect_worker, xprt); + xprt->bind_timeout = XS_BIND_TO; + xprt->connect_timeout = XS_UDP_CONN_TO; + xprt->reestablish_timeout = XS_UDP_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; xprt->ops = &xs_udp_ops; @@ -1176,6 +1236,10 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt); + xprt->bind_timeout = XS_BIND_TO; + xprt->connect_timeout = XS_TCP_CONN_TO; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; xprt->ops = &xs_tcp_ops; From 470056c288334eb0b37be26c9ff8aee37ed1cc7a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:56 -0700 Subject: [PATCH 034/126] [PATCH] RPC: rationalize set_buffer_size In fact, ->set_buffer_size should be completely functionless for non-UDP. Test-plan: Check socket buffer size on UDP sockets over time. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 4 ++-- net/sunrpc/clnt.c | 10 ++-------- net/sunrpc/xprtsock.c | 30 +++++++++++++++--------------- 3 files changed, 19 insertions(+), 25 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 2543adf18551..99cad3ead81d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -100,7 +100,7 @@ struct rpc_rqst { #define rq_slen rq_snd_buf.len struct rpc_xprt_ops { - void (*set_buffer_size)(struct rpc_xprt *xprt); + void (*set_buffer_size)(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize); int (*reserve_xprt)(struct rpc_task *task); void (*release_xprt)(struct rpc_xprt *xprt, struct rpc_task *task); void (*connect)(struct rpc_task *task); @@ -124,7 +124,7 @@ struct rpc_xprt { unsigned long cong; /* current congestion */ unsigned long cwnd; /* congestion window */ - unsigned int rcvsize, /* transport rcv buffer size */ + size_t rcvsize, /* transport rcv buffer size */ sndsize; /* transport send buffer size */ size_t max_payload; /* largest RPC payload size, diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 24b44e73f391..5a8f01d726e9 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -517,14 +517,8 @@ void rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize) { struct rpc_xprt *xprt = clnt->cl_xprt; - - xprt->sndsize = 0; - if (sndsize) - xprt->sndsize = sndsize + RPC_SLACK_SPACE; - xprt->rcvsize = 0; - if (rcvsize) - xprt->rcvsize = rcvsize + RPC_SLACK_SPACE; - xprt->ops->set_buffer_size(xprt); + if (xprt->ops->set_buffer_size) + xprt->ops->set_buffer_size(xprt, sndsize, rcvsize); } /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 06c2d95484e0..2e1529217e65 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -865,15 +865,7 @@ static void xs_tcp_write_space(struct sock *sk) read_unlock(&sk->sk_callback_lock); } -/** - * xs_udp_set_buffer_size - set send and receive limits - * @xprt: generic transport - * - * Set socket send and receive limits based on the - * sndsize and rcvsize fields in the generic transport - * structure. - */ -static void xs_udp_set_buffer_size(struct rpc_xprt *xprt) +static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt) { struct sock *sk = xprt->inet; @@ -889,14 +881,23 @@ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt) } /** - * xs_tcp_set_buffer_size - set send and receive limits + * xs_udp_set_buffer_size - set send and receive limits * @xprt: generic transport + * @sndsize: requested size of send buffer, in bytes + * @rcvsize: requested size of receive buffer, in bytes * - * Nothing to do for TCP. + * Set socket send and receive buffer size limits. */ -static void xs_tcp_set_buffer_size(struct rpc_xprt *xprt) +static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize) { - return; + xprt->sndsize = 0; + if (sndsize) + xprt->sndsize = sndsize + 1024; + xprt->rcvsize = 0; + if (rcvsize) + xprt->rcvsize = rcvsize + 1024; + + xs_udp_do_set_buffer_size(xprt); } /** @@ -989,7 +990,7 @@ static void xs_udp_connect_worker(void *args) write_unlock_bh(&sk->sk_callback_lock); } - xs_udp_set_buffer_size(xprt); + xs_udp_do_set_buffer_size(xprt); status = 0; out: xprt_wake_pending_tasks(xprt, status); @@ -1158,7 +1159,6 @@ static struct rpc_xprt_ops xs_udp_ops = { }; static struct rpc_xprt_ops xs_tcp_ops = { - .set_buffer_size = xs_tcp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, .connect = xs_connect, From 278c995c8a153bb2a9bc427e931cfb9c8034c9d7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jul 2005 23:53:01 +0100 Subject: [PATCH 035/126] [PATCH] RPC,NFS: new rpc_pipefs patch Currently rpc_mkdir/rpc_rmdir and rpc_mkpipe/mk_unlink have an API that's a little unfortunate. They take a path relative to the rpc_pipefs root and thus need to perform a full lookup. If you look at debugfs or usbfs they always store the dentry for directories they created and thus can pass in a dentry + single pathname component pair into their equivalents of the above functions. And in fact rpc_pipefs actually stores a dentry for all but one component so this change not only simplifies the core rpc_pipe code but also the callers. Unfortuntately this code path is only used by the NFS4 idmapper and AUTH_GSSAPI for which I don't have a test enviroment. Could someone give it a spin? It's the last bit needed before we can rework the lookup_hash API Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/idmap.c | 10 +- include/linux/sunrpc/clnt.h | 2 +- include/linux/sunrpc/rpc_pipe_fs.h | 9 +- net/sunrpc/auth_gss/auth_gss.c | 9 +- net/sunrpc/clnt.c | 55 ++++-- net/sunrpc/rpc_pipe.c | 280 +++++++++++------------------ 6 files changed, 149 insertions(+), 216 deletions(-) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index ffb8df91dc34..1d0a5bf0d264 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -66,7 +66,6 @@ struct idmap_hashtable { }; struct idmap { - char idmap_path[48]; struct dentry *idmap_dentry; wait_queue_head_t idmap_wq; struct idmap_msg idmap_im; @@ -102,11 +101,8 @@ nfs_idmap_new(struct nfs4_client *clp) memset(idmap, 0, sizeof(*idmap)); - snprintf(idmap->idmap_path, sizeof(idmap->idmap_path), - "%s/idmap", clp->cl_rpcclient->cl_pathname); - - idmap->idmap_dentry = rpc_mkpipe(idmap->idmap_path, - idmap, &idmap_upcall_ops, 0); + idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, + "idmap", idmap, &idmap_upcall_ops, 0); if (IS_ERR(idmap->idmap_dentry)) { kfree(idmap); return; @@ -128,7 +124,7 @@ nfs_idmap_delete(struct nfs4_client *clp) if (!idmap) return; - rpc_unlink(idmap->idmap_path); + rpc_unlink(idmap->idmap_dentry); clp->cl_idmap = NULL; kfree(idmap); } diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index ab151bbb66df..b5b51c196690 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -59,7 +59,7 @@ struct rpc_clnt { int cl_nodelen; /* nodename length */ char cl_nodename[UNX_MAXNODENAME]; - char cl_pathname[30];/* Path in rpc_pipe_fs */ + struct dentry * __cl_parent_dentry; struct dentry * cl_dentry; /* inode */ struct rpc_clnt * cl_parent; /* Points to parent of clones */ struct rpc_rtt cl_rtt_default; diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h index 63929349571f..63878d05c9a9 100644 --- a/include/linux/sunrpc/rpc_pipe_fs.h +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -41,10 +41,11 @@ RPC_I(struct inode *inode) extern int rpc_queue_upcall(struct inode *, struct rpc_pipe_msg *); -extern struct dentry *rpc_mkdir(char *, struct rpc_clnt *); -extern int rpc_rmdir(char *); -extern struct dentry *rpc_mkpipe(char *, void *, struct rpc_pipe_ops *, int flags); -extern int rpc_unlink(char *); +extern struct dentry *rpc_mkdir(struct dentry *, char *, struct rpc_clnt *); +extern void rpc_rmdir(struct dentry *); +extern struct dentry *rpc_mkpipe(struct dentry *, char *, void *, + struct rpc_pipe_ops *, int flags); +extern void rpc_unlink(struct dentry *); #endif #endif diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index d2b08f16c257..bd2555139fa9 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -87,7 +87,6 @@ struct gss_auth { struct list_head upcalls; struct rpc_clnt *client; struct dentry *dentry; - char path[48]; spinlock_t lock; }; @@ -690,10 +689,8 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) if (err) goto err_put_mech; - snprintf(gss_auth->path, sizeof(gss_auth->path), "%s/%s", - clnt->cl_pathname, - gss_auth->mech->gm_name); - gss_auth->dentry = rpc_mkpipe(gss_auth->path, clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); + gss_auth->dentry = rpc_mkpipe(clnt->cl_dentry, gss_auth->mech->gm_name, + clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); if (IS_ERR(gss_auth->dentry)) { err = PTR_ERR(gss_auth->dentry); goto err_put_mech; @@ -718,7 +715,7 @@ gss_destroy(struct rpc_auth *auth) auth, auth->au_flavor); gss_auth = container_of(auth, struct gss_auth, rpc_auth); - rpc_unlink(gss_auth->path); + rpc_unlink(gss_auth->dentry); gss_mech_put(gss_auth->mech); rpcauth_free_credcache(auth); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 5a8f01d726e9..63bf591310e0 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -67,26 +67,42 @@ static u32 * call_verify(struct rpc_task *task); static int rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name) { - static uint32_t clntid; + static unsigned int clntid; + char name[128]; int error; if (dir_name == NULL) return 0; - for (;;) { - snprintf(clnt->cl_pathname, sizeof(clnt->cl_pathname), - "%s/clnt%x", dir_name, - (unsigned int)clntid++); - clnt->cl_pathname[sizeof(clnt->cl_pathname) - 1] = '\0'; - clnt->cl_dentry = rpc_mkdir(clnt->cl_pathname, clnt); - if (!IS_ERR(clnt->cl_dentry)) - return 0; - error = PTR_ERR(clnt->cl_dentry); - if (error != -EEXIST) { - printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n", - clnt->cl_pathname, error); - return error; - } + + retry_parent: + clnt->__cl_parent_dentry = rpc_mkdir(NULL, dir_name, NULL); + if (IS_ERR(clnt->__cl_parent_dentry)) { + error = PTR_ERR(clnt->__cl_parent_dentry); + if (error == -EEXIST) + goto retry_parent; /* XXX(hch): WTF? */ + + printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n", + dir_name, error); + return error; } + + + retry_child: + snprintf(name, sizeof(name), "clnt%x", clntid++); + name[sizeof(name) - 1] = '\0'; + + clnt->cl_dentry = rpc_mkdir(clnt->__cl_parent_dentry, name, clnt); + if (IS_ERR(clnt->cl_dentry)) { + error = PTR_ERR(clnt->cl_dentry); + if (error == -EEXIST) + goto retry_child; + printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n", + name, error); + rpc_rmdir(clnt->__cl_parent_dentry); + return error; + } + + return 0; } /* @@ -174,7 +190,8 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname, return clnt; out_no_auth: - rpc_rmdir(clnt->cl_pathname); + rpc_rmdir(clnt->cl_dentry); + rpc_rmdir(clnt->__cl_parent_dentry); out_no_path: if (clnt->cl_server != clnt->cl_inline_name) kfree(clnt->cl_server); @@ -302,8 +319,10 @@ rpc_destroy_client(struct rpc_clnt *clnt) rpc_destroy_client(clnt->cl_parent); goto out_free; } - if (clnt->cl_pathname[0]) - rpc_rmdir(clnt->cl_pathname); + if (clnt->cl_dentry) + rpc_rmdir(clnt->cl_dentry); + if (clnt->__cl_parent_dentry) + rpc_rmdir(clnt->__cl_parent_dentry); if (clnt->cl_xprt) { xprt_destroy(clnt->cl_xprt); clnt->cl_xprt = NULL; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index ded6c63f11ec..b382809726d8 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -414,38 +414,6 @@ rpc_put_mount(void) simple_release_fs(&rpc_mount, &rpc_mount_count); } -static int -rpc_lookup_parent(char *path, struct nameidata *nd) -{ - if (path[0] == '\0') - return -ENOENT; - if (rpc_get_mount()) { - printk(KERN_WARNING "%s: %s failed to mount " - "pseudofilesystem \n", __FILE__, __FUNCTION__); - return -ENODEV; - } - nd->mnt = mntget(rpc_mount); - nd->dentry = dget(rpc_mount->mnt_root); - nd->last_type = LAST_ROOT; - nd->flags = LOOKUP_PARENT; - nd->depth = 0; - - if (path_walk(path, nd)) { - printk(KERN_WARNING "%s: %s failed to find path %s\n", - __FILE__, __FUNCTION__, path); - rpc_put_mount(); - return -ENOENT; - } - return 0; -} - -static void -rpc_release_path(struct nameidata *nd) -{ - path_release(nd); - rpc_put_mount(); -} - static struct inode * rpc_get_inode(struct super_block *sb, int mode) { @@ -550,197 +518,149 @@ rpc_populate(struct dentry *parent, return -ENOMEM; } -static int -__rpc_mkdir(struct inode *dir, struct dentry *dentry) +struct dentry * +rpc_mkdir(struct dentry *parent, char *name, struct rpc_clnt *rpc_client) { + struct inode *dir; + struct dentry *dentry; struct inode *inode; + int error; + + if (!parent) + parent = rpc_mount->mnt_root; + + dir = parent->d_inode; + + error = rpc_get_mount(); + if (error) + return ERR_PTR(error); + + down(&dir->i_sem); + dentry = lookup_one_len(name, parent, strlen(name)); + if (IS_ERR(dentry)) + goto out_unlock; + if (dentry->d_inode) { + dentry = ERR_PTR(-EEXIST); + goto out_dput; + } inode = rpc_get_inode(dir->i_sb, S_IFDIR | S_IRUSR | S_IXUSR); if (!inode) - goto out_err; + goto out_dput; inode->i_ino = iunique(dir->i_sb, 100); - d_instantiate(dentry, inode); dir->i_nlink++; - inode_dir_notify(dir, DN_CREATE); - rpc_get_mount(); - return 0; -out_err: - printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n", - __FILE__, __FUNCTION__, dentry->d_name.name); - return -ENOMEM; -} - -static int -__rpc_rmdir(struct inode *dir, struct dentry *dentry) -{ - int error; - - shrink_dcache_parent(dentry); - if (dentry->d_inode) { - rpc_close_pipes(dentry->d_inode); - rpc_inode_setowner(dentry->d_inode, NULL); - } - if ((error = simple_rmdir(dir, dentry)) != 0) - return error; - if (!error) { - inode_dir_notify(dir, DN_DELETE); - d_drop(dentry); - rpc_put_mount(); - } - return 0; -} - -static struct dentry * -rpc_lookup_negative(char *path, struct nameidata *nd) -{ - struct dentry *dentry; - struct inode *dir; - int error; - - if ((error = rpc_lookup_parent(path, nd)) != 0) - return ERR_PTR(error); - dir = nd->dentry->d_inode; - down(&dir->i_sem); - dentry = lookup_hash(&nd->last, nd->dentry); - if (IS_ERR(dentry)) - goto out_err; - if (dentry->d_inode) { - dput(dentry); - dentry = ERR_PTR(-EEXIST); - goto out_err; - } - return dentry; -out_err: - up(&dir->i_sem); - rpc_release_path(nd); - return dentry; -} - - -struct dentry * -rpc_mkdir(char *path, struct rpc_clnt *rpc_client) -{ - struct nameidata nd; - struct dentry *dentry; - struct inode *dir; - int error; - - dentry = rpc_lookup_negative(path, &nd); - if (IS_ERR(dentry)) - return dentry; - dir = nd.dentry->d_inode; - if ((error = __rpc_mkdir(dir, dentry)) != 0) - goto err_dput; RPC_I(dentry->d_inode)->private = rpc_client; + + d_instantiate(dentry, inode); + dget(dentry); + up(&dir->i_sem); + + inode_dir_notify(dir, DN_CREATE); + error = rpc_populate(dentry, authfiles, RPCAUTH_info, RPCAUTH_EOF); if (error) - goto err_depopulate; -out: - up(&dir->i_sem); - rpc_release_path(&nd); + goto out_depopulate; + return dentry; -err_depopulate: - rpc_depopulate(dentry); - __rpc_rmdir(dir, dentry); -err_dput: + + out_depopulate: + rpc_rmdir(dentry); + out_dput: dput(dentry); - printk(KERN_WARNING "%s: %s() failed to create directory %s (errno = %d)\n", - __FILE__, __FUNCTION__, path, error); - dentry = ERR_PTR(error); - goto out; + out_unlock: + up(&dir->i_sem); + rpc_put_mount(); + return dentry; } -int -rpc_rmdir(char *path) +void +rpc_rmdir(struct dentry *dentry) { - struct nameidata nd; - struct dentry *dentry; - struct inode *dir; - int error; + struct dentry *parent = dentry->d_parent; - if ((error = rpc_lookup_parent(path, &nd)) != 0) - return error; - dir = nd.dentry->d_inode; - down(&dir->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out_release; - } rpc_depopulate(dentry); - error = __rpc_rmdir(dir, dentry); - dput(dentry); -out_release: - up(&dir->i_sem); - rpc_release_path(&nd); - return error; + + down(&parent->d_inode->i_sem); + if (dentry->d_inode) { + rpc_close_pipes(dentry->d_inode); + rpc_inode_setowner(dentry->d_inode, NULL); + simple_rmdir(parent->d_inode, dentry); + } + up(&parent->d_inode->i_sem); + + inode_dir_notify(parent->d_inode, DN_DELETE); + rpc_put_mount(); } struct dentry * -rpc_mkpipe(char *path, void *private, struct rpc_pipe_ops *ops, int flags) +rpc_mkpipe(struct dentry *parent, char *name, void *private, + struct rpc_pipe_ops *ops, int flags) { - struct nameidata nd; + struct inode *dir = parent->d_inode; struct dentry *dentry; - struct inode *dir, *inode; + struct inode *inode; struct rpc_inode *rpci; + int error; - dentry = rpc_lookup_negative(path, &nd); + error = rpc_get_mount(); + if (error) + return ERR_PTR(error); + + down(&parent->d_inode->i_sem); + dentry = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(dentry)) - return dentry; - dir = nd.dentry->d_inode; - inode = rpc_get_inode(dir->i_sb, S_IFSOCK | S_IRUSR | S_IWUSR); - if (!inode) - goto err_dput; + goto out_unlock; + if (dentry->d_inode) { + dentry = ERR_PTR(-EEXIST); + goto out_dput; + } + + inode = rpc_get_inode(parent->d_inode->i_sb, + S_IFSOCK | S_IRUSR | S_IWUSR); + if (!inode) { + dentry = ERR_PTR(-ENOMEM); + goto out_dput; + } + inode->i_ino = iunique(dir->i_sb, 100); inode->i_fop = &rpc_pipe_fops; - d_instantiate(dentry, inode); + rpci = RPC_I(inode); rpci->private = private; rpci->flags = flags; rpci->ops = ops; + + d_instantiate(dentry, inode); + dget(dentry); + up(&parent->d_inode->i_sem); + inode_dir_notify(dir, DN_CREATE); -out: - up(&dir->i_sem); - rpc_release_path(&nd); return dentry; -err_dput: + + out_dput: dput(dentry); - dentry = ERR_PTR(-ENOMEM); - printk(KERN_WARNING "%s: %s() failed to create pipe %s (errno = %d)\n", - __FILE__, __FUNCTION__, path, -ENOMEM); - goto out; + out_unlock: + up(&parent->d_inode->i_sem); + rpc_put_mount(); + return dentry; } -int -rpc_unlink(char *path) +void +rpc_unlink(struct dentry *dentry) { - struct nameidata nd; - struct dentry *dentry; - struct inode *dir; - int error; + struct dentry *parent = dentry->d_parent; - if ((error = rpc_lookup_parent(path, &nd)) != 0) - return error; - dir = nd.dentry->d_inode; - down(&dir->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out_release; - } - d_drop(dentry); + down(&parent->d_inode->i_sem); if (dentry->d_inode) { rpc_close_pipes(dentry->d_inode); rpc_inode_setowner(dentry->d_inode, NULL); - error = simple_unlink(dir, dentry); + simple_unlink(parent->d_inode, dentry); } - dput(dentry); - inode_dir_notify(dir, DN_DELETE); -out_release: - up(&dir->i_sem); - rpc_release_path(&nd); - return error; + up(&parent->d_inode->i_sem); + + inode_dir_notify(parent->d_inode, DN_DELETE); + rpc_put_mount(); } /* From 3063d8a16643190f9e12e9c7e9f1ca56f7e7934e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Aug 2005 16:25:57 -0700 Subject: [PATCH 036/126] NFS: Make /proc/mounts display the protocol used by NFSv4 Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 062911e7ceb5..358d8ef8c087 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -595,7 +595,6 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) { NFS_MOUNT_SOFT, ",soft", ",hard" }, { NFS_MOUNT_INTR, ",intr", "" }, { NFS_MOUNT_POSIX, ",posix", "" }, - { NFS_MOUNT_TCP, ",tcp", ",udp" }, { NFS_MOUNT_NOCTO, ",nocto", "" }, { NFS_MOUNT_NOAC, ",noac", "" }, { NFS_MOUNT_NONLM, ",nolock", ",lock" }, @@ -604,6 +603,8 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) }; struct proc_nfs_info *nfs_infop; struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + char buf[12]; + char *proto; seq_printf(m, ",v%d", nfss->rpc_ops->version); seq_printf(m, ",rsize=%d", nfss->rsize); @@ -622,6 +623,18 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) else seq_puts(m, nfs_infop->nostr); } + switch (nfss->client->cl_xprt->prot) { + case IPPROTO_TCP: + proto = "tcp"; + break; + case IPPROTO_UDP: + proto = "udp"; + break; + default: + snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot); + proto = buf; + } + seq_printf(m, ",proto=%s", proto); seq_puts(m, ",addr="); seq_escape(m, nfss->hostname, " \t\n\\"); return 0; From f134585a7343d71f9be7f0cf97e2145f21dd10c6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 23 Sep 2005 11:08:25 -0400 Subject: [PATCH 037/126] Revert "[PATCH] RPC,NFS: new rpc_pipefs patch" This reverts 17f4e6febca160a9f9dd4bdece9784577a2f4524 commit. --- fs/nfs/idmap.c | 10 +- include/linux/sunrpc/clnt.h | 2 +- include/linux/sunrpc/rpc_pipe_fs.h | 9 +- net/sunrpc/auth_gss/auth_gss.c | 9 +- net/sunrpc/clnt.c | 53 ++---- net/sunrpc/rpc_pipe.c | 282 ++++++++++++++++++----------- 6 files changed, 216 insertions(+), 149 deletions(-) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 1d0a5bf0d264..ffb8df91dc34 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -66,6 +66,7 @@ struct idmap_hashtable { }; struct idmap { + char idmap_path[48]; struct dentry *idmap_dentry; wait_queue_head_t idmap_wq; struct idmap_msg idmap_im; @@ -101,8 +102,11 @@ nfs_idmap_new(struct nfs4_client *clp) memset(idmap, 0, sizeof(*idmap)); - idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, - "idmap", idmap, &idmap_upcall_ops, 0); + snprintf(idmap->idmap_path, sizeof(idmap->idmap_path), + "%s/idmap", clp->cl_rpcclient->cl_pathname); + + idmap->idmap_dentry = rpc_mkpipe(idmap->idmap_path, + idmap, &idmap_upcall_ops, 0); if (IS_ERR(idmap->idmap_dentry)) { kfree(idmap); return; @@ -124,7 +128,7 @@ nfs_idmap_delete(struct nfs4_client *clp) if (!idmap) return; - rpc_unlink(idmap->idmap_dentry); + rpc_unlink(idmap->idmap_path); clp->cl_idmap = NULL; kfree(idmap); } diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index b5b51c196690..ab151bbb66df 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -59,7 +59,7 @@ struct rpc_clnt { int cl_nodelen; /* nodename length */ char cl_nodename[UNX_MAXNODENAME]; - struct dentry * __cl_parent_dentry; + char cl_pathname[30];/* Path in rpc_pipe_fs */ struct dentry * cl_dentry; /* inode */ struct rpc_clnt * cl_parent; /* Points to parent of clones */ struct rpc_rtt cl_rtt_default; diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h index 63878d05c9a9..63929349571f 100644 --- a/include/linux/sunrpc/rpc_pipe_fs.h +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -41,11 +41,10 @@ RPC_I(struct inode *inode) extern int rpc_queue_upcall(struct inode *, struct rpc_pipe_msg *); -extern struct dentry *rpc_mkdir(struct dentry *, char *, struct rpc_clnt *); -extern void rpc_rmdir(struct dentry *); -extern struct dentry *rpc_mkpipe(struct dentry *, char *, void *, - struct rpc_pipe_ops *, int flags); -extern void rpc_unlink(struct dentry *); +extern struct dentry *rpc_mkdir(char *, struct rpc_clnt *); +extern int rpc_rmdir(char *); +extern struct dentry *rpc_mkpipe(char *, void *, struct rpc_pipe_ops *, int flags); +extern int rpc_unlink(char *); #endif #endif diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index bd2555139fa9..d2b08f16c257 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -87,6 +87,7 @@ struct gss_auth { struct list_head upcalls; struct rpc_clnt *client; struct dentry *dentry; + char path[48]; spinlock_t lock; }; @@ -689,8 +690,10 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) if (err) goto err_put_mech; - gss_auth->dentry = rpc_mkpipe(clnt->cl_dentry, gss_auth->mech->gm_name, - clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); + snprintf(gss_auth->path, sizeof(gss_auth->path), "%s/%s", + clnt->cl_pathname, + gss_auth->mech->gm_name); + gss_auth->dentry = rpc_mkpipe(gss_auth->path, clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); if (IS_ERR(gss_auth->dentry)) { err = PTR_ERR(gss_auth->dentry); goto err_put_mech; @@ -715,7 +718,7 @@ gss_destroy(struct rpc_auth *auth) auth, auth->au_flavor); gss_auth = container_of(auth, struct gss_auth, rpc_auth); - rpc_unlink(gss_auth->dentry); + rpc_unlink(gss_auth->path); gss_mech_put(gss_auth->mech); rpcauth_free_credcache(auth); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 63bf591310e0..5a8f01d726e9 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -67,42 +67,26 @@ static u32 * call_verify(struct rpc_task *task); static int rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name) { - static unsigned int clntid; - char name[128]; + static uint32_t clntid; int error; if (dir_name == NULL) return 0; - - retry_parent: - clnt->__cl_parent_dentry = rpc_mkdir(NULL, dir_name, NULL); - if (IS_ERR(clnt->__cl_parent_dentry)) { - error = PTR_ERR(clnt->__cl_parent_dentry); - if (error == -EEXIST) - goto retry_parent; /* XXX(hch): WTF? */ - - printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n", - dir_name, error); - return error; - } - - - retry_child: - snprintf(name, sizeof(name), "clnt%x", clntid++); - name[sizeof(name) - 1] = '\0'; - - clnt->cl_dentry = rpc_mkdir(clnt->__cl_parent_dentry, name, clnt); - if (IS_ERR(clnt->cl_dentry)) { + for (;;) { + snprintf(clnt->cl_pathname, sizeof(clnt->cl_pathname), + "%s/clnt%x", dir_name, + (unsigned int)clntid++); + clnt->cl_pathname[sizeof(clnt->cl_pathname) - 1] = '\0'; + clnt->cl_dentry = rpc_mkdir(clnt->cl_pathname, clnt); + if (!IS_ERR(clnt->cl_dentry)) + return 0; error = PTR_ERR(clnt->cl_dentry); - if (error == -EEXIST) - goto retry_child; - printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n", - name, error); - rpc_rmdir(clnt->__cl_parent_dentry); - return error; + if (error != -EEXIST) { + printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n", + clnt->cl_pathname, error); + return error; + } } - - return 0; } /* @@ -190,8 +174,7 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname, return clnt; out_no_auth: - rpc_rmdir(clnt->cl_dentry); - rpc_rmdir(clnt->__cl_parent_dentry); + rpc_rmdir(clnt->cl_pathname); out_no_path: if (clnt->cl_server != clnt->cl_inline_name) kfree(clnt->cl_server); @@ -319,10 +302,8 @@ rpc_destroy_client(struct rpc_clnt *clnt) rpc_destroy_client(clnt->cl_parent); goto out_free; } - if (clnt->cl_dentry) - rpc_rmdir(clnt->cl_dentry); - if (clnt->__cl_parent_dentry) - rpc_rmdir(clnt->__cl_parent_dentry); + if (clnt->cl_pathname[0]) + rpc_rmdir(clnt->cl_pathname); if (clnt->cl_xprt) { xprt_destroy(clnt->cl_xprt); clnt->cl_xprt = NULL; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index b382809726d8..ded6c63f11ec 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -414,6 +414,38 @@ rpc_put_mount(void) simple_release_fs(&rpc_mount, &rpc_mount_count); } +static int +rpc_lookup_parent(char *path, struct nameidata *nd) +{ + if (path[0] == '\0') + return -ENOENT; + if (rpc_get_mount()) { + printk(KERN_WARNING "%s: %s failed to mount " + "pseudofilesystem \n", __FILE__, __FUNCTION__); + return -ENODEV; + } + nd->mnt = mntget(rpc_mount); + nd->dentry = dget(rpc_mount->mnt_root); + nd->last_type = LAST_ROOT; + nd->flags = LOOKUP_PARENT; + nd->depth = 0; + + if (path_walk(path, nd)) { + printk(KERN_WARNING "%s: %s failed to find path %s\n", + __FILE__, __FUNCTION__, path); + rpc_put_mount(); + return -ENOENT; + } + return 0; +} + +static void +rpc_release_path(struct nameidata *nd) +{ + path_release(nd); + rpc_put_mount(); +} + static struct inode * rpc_get_inode(struct super_block *sb, int mode) { @@ -518,149 +550,197 @@ rpc_populate(struct dentry *parent, return -ENOMEM; } -struct dentry * -rpc_mkdir(struct dentry *parent, char *name, struct rpc_clnt *rpc_client) +static int +__rpc_mkdir(struct inode *dir, struct dentry *dentry) { - struct inode *dir; - struct dentry *dentry; struct inode *inode; - int error; - - if (!parent) - parent = rpc_mount->mnt_root; - - dir = parent->d_inode; - - error = rpc_get_mount(); - if (error) - return ERR_PTR(error); - - down(&dir->i_sem); - dentry = lookup_one_len(name, parent, strlen(name)); - if (IS_ERR(dentry)) - goto out_unlock; - if (dentry->d_inode) { - dentry = ERR_PTR(-EEXIST); - goto out_dput; - } inode = rpc_get_inode(dir->i_sb, S_IFDIR | S_IRUSR | S_IXUSR); if (!inode) - goto out_dput; + goto out_err; inode->i_ino = iunique(dir->i_sb, 100); - dir->i_nlink++; - RPC_I(dentry->d_inode)->private = rpc_client; - d_instantiate(dentry, inode); - dget(dentry); - up(&dir->i_sem); - + dir->i_nlink++; inode_dir_notify(dir, DN_CREATE); - - error = rpc_populate(dentry, authfiles, - RPCAUTH_info, RPCAUTH_EOF); - if (error) - goto out_depopulate; - - return dentry; - - out_depopulate: - rpc_rmdir(dentry); - out_dput: - dput(dentry); - out_unlock: - up(&dir->i_sem); - rpc_put_mount(); - return dentry; + rpc_get_mount(); + return 0; +out_err: + printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n", + __FILE__, __FUNCTION__, dentry->d_name.name); + return -ENOMEM; } -void -rpc_rmdir(struct dentry *dentry) +static int +__rpc_rmdir(struct inode *dir, struct dentry *dentry) { - struct dentry *parent = dentry->d_parent; + int error; - rpc_depopulate(dentry); - - down(&parent->d_inode->i_sem); + shrink_dcache_parent(dentry); if (dentry->d_inode) { rpc_close_pipes(dentry->d_inode); rpc_inode_setowner(dentry->d_inode, NULL); - simple_rmdir(parent->d_inode, dentry); } - up(&parent->d_inode->i_sem); + if ((error = simple_rmdir(dir, dentry)) != 0) + return error; + if (!error) { + inode_dir_notify(dir, DN_DELETE); + d_drop(dentry); + rpc_put_mount(); + } + return 0; +} - inode_dir_notify(parent->d_inode, DN_DELETE); - rpc_put_mount(); +static struct dentry * +rpc_lookup_negative(char *path, struct nameidata *nd) +{ + struct dentry *dentry; + struct inode *dir; + int error; + + if ((error = rpc_lookup_parent(path, nd)) != 0) + return ERR_PTR(error); + dir = nd->dentry->d_inode; + down(&dir->i_sem); + dentry = lookup_hash(&nd->last, nd->dentry); + if (IS_ERR(dentry)) + goto out_err; + if (dentry->d_inode) { + dput(dentry); + dentry = ERR_PTR(-EEXIST); + goto out_err; + } + return dentry; +out_err: + up(&dir->i_sem); + rpc_release_path(nd); + return dentry; +} + + +struct dentry * +rpc_mkdir(char *path, struct rpc_clnt *rpc_client) +{ + struct nameidata nd; + struct dentry *dentry; + struct inode *dir; + int error; + + dentry = rpc_lookup_negative(path, &nd); + if (IS_ERR(dentry)) + return dentry; + dir = nd.dentry->d_inode; + if ((error = __rpc_mkdir(dir, dentry)) != 0) + goto err_dput; + RPC_I(dentry->d_inode)->private = rpc_client; + error = rpc_populate(dentry, authfiles, + RPCAUTH_info, RPCAUTH_EOF); + if (error) + goto err_depopulate; +out: + up(&dir->i_sem); + rpc_release_path(&nd); + return dentry; +err_depopulate: + rpc_depopulate(dentry); + __rpc_rmdir(dir, dentry); +err_dput: + dput(dentry); + printk(KERN_WARNING "%s: %s() failed to create directory %s (errno = %d)\n", + __FILE__, __FUNCTION__, path, error); + dentry = ERR_PTR(error); + goto out; +} + +int +rpc_rmdir(char *path) +{ + struct nameidata nd; + struct dentry *dentry; + struct inode *dir; + int error; + + if ((error = rpc_lookup_parent(path, &nd)) != 0) + return error; + dir = nd.dentry->d_inode; + down(&dir->i_sem); + dentry = lookup_hash(&nd.last, nd.dentry); + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + goto out_release; + } + rpc_depopulate(dentry); + error = __rpc_rmdir(dir, dentry); + dput(dentry); +out_release: + up(&dir->i_sem); + rpc_release_path(&nd); + return error; } struct dentry * -rpc_mkpipe(struct dentry *parent, char *name, void *private, - struct rpc_pipe_ops *ops, int flags) +rpc_mkpipe(char *path, void *private, struct rpc_pipe_ops *ops, int flags) { - struct inode *dir = parent->d_inode; + struct nameidata nd; struct dentry *dentry; - struct inode *inode; + struct inode *dir, *inode; struct rpc_inode *rpci; - int error; - error = rpc_get_mount(); - if (error) - return ERR_PTR(error); - - down(&parent->d_inode->i_sem); - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = rpc_lookup_negative(path, &nd); if (IS_ERR(dentry)) - goto out_unlock; - if (dentry->d_inode) { - dentry = ERR_PTR(-EEXIST); - goto out_dput; - } - - inode = rpc_get_inode(parent->d_inode->i_sb, - S_IFSOCK | S_IRUSR | S_IWUSR); - if (!inode) { - dentry = ERR_PTR(-ENOMEM); - goto out_dput; - } - + return dentry; + dir = nd.dentry->d_inode; + inode = rpc_get_inode(dir->i_sb, S_IFSOCK | S_IRUSR | S_IWUSR); + if (!inode) + goto err_dput; inode->i_ino = iunique(dir->i_sb, 100); inode->i_fop = &rpc_pipe_fops; - + d_instantiate(dentry, inode); rpci = RPC_I(inode); rpci->private = private; rpci->flags = flags; rpci->ops = ops; - - d_instantiate(dentry, inode); - dget(dentry); - up(&parent->d_inode->i_sem); - inode_dir_notify(dir, DN_CREATE); +out: + up(&dir->i_sem); + rpc_release_path(&nd); return dentry; - - out_dput: +err_dput: dput(dentry); - out_unlock: - up(&parent->d_inode->i_sem); - rpc_put_mount(); - return dentry; + dentry = ERR_PTR(-ENOMEM); + printk(KERN_WARNING "%s: %s() failed to create pipe %s (errno = %d)\n", + __FILE__, __FUNCTION__, path, -ENOMEM); + goto out; } -void -rpc_unlink(struct dentry *dentry) +int +rpc_unlink(char *path) { - struct dentry *parent = dentry->d_parent; + struct nameidata nd; + struct dentry *dentry; + struct inode *dir; + int error; - down(&parent->d_inode->i_sem); + if ((error = rpc_lookup_parent(path, &nd)) != 0) + return error; + dir = nd.dentry->d_inode; + down(&dir->i_sem); + dentry = lookup_hash(&nd.last, nd.dentry); + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + goto out_release; + } + d_drop(dentry); if (dentry->d_inode) { rpc_close_pipes(dentry->d_inode); rpc_inode_setowner(dentry->d_inode, NULL); - simple_unlink(parent->d_inode, dentry); + error = simple_unlink(dir, dentry); } - up(&parent->d_inode->i_sem); - - inode_dir_notify(parent->d_inode, DN_DELETE); - rpc_put_mount(); + dput(dentry); + inode_dir_notify(dir, DN_DELETE); +out_release: + up(&dir->i_sem); + rpc_release_path(&nd); + return error; } /* From 6cd7525a00f3b926e8bd2e402954ed3e09a8e924 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 22 Sep 2005 21:24:59 -0400 Subject: [PATCH 038/126] SUNRPC: fix bug in patch "portmapper doesn't need a reserved port" The in-kernel portmapper does in fact need a reserved port when registering new services, but not when performing bind queries. Ensure that we distinguish between the two cases. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/pmap_clnt.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c index d8e3f220002b..a398575f94b8 100644 --- a/net/sunrpc/pmap_clnt.c +++ b/net/sunrpc/pmap_clnt.c @@ -26,7 +26,7 @@ #define PMAP_GETPORT 3 static struct rpc_procinfo pmap_procedures[]; -static struct rpc_clnt * pmap_create(char *, struct sockaddr_in *, int); +static struct rpc_clnt * pmap_create(char *, struct sockaddr_in *, int, int); static void pmap_getport_done(struct rpc_task *); static struct rpc_program pmap_program; static DEFINE_SPINLOCK(pmap_lock); @@ -65,7 +65,7 @@ rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt) map->pm_binding = 1; spin_unlock(&pmap_lock); - pmap_clnt = pmap_create(clnt->cl_server, sap, map->pm_prot); + pmap_clnt = pmap_create(clnt->cl_server, sap, map->pm_prot, 0); if (IS_ERR(pmap_clnt)) { task->tk_status = PTR_ERR(pmap_clnt); goto bailout; @@ -112,7 +112,7 @@ rpc_getport_external(struct sockaddr_in *sin, __u32 prog, __u32 vers, int prot) NIPQUAD(sin->sin_addr.s_addr), prog, vers, prot); sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(sin->sin_addr.s_addr)); - pmap_clnt = pmap_create(hostname, sin, prot); + pmap_clnt = pmap_create(hostname, sin, prot, 0); if (IS_ERR(pmap_clnt)) return PTR_ERR(pmap_clnt); @@ -171,7 +171,7 @@ rpc_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - pmap_clnt = pmap_create("localhost", &sin, IPPROTO_UDP); + pmap_clnt = pmap_create("localhost", &sin, IPPROTO_UDP, 1); if (IS_ERR(pmap_clnt)) { error = PTR_ERR(pmap_clnt); dprintk("RPC: couldn't create pmap client. Error = %d\n", error); @@ -198,7 +198,7 @@ rpc_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) } static struct rpc_clnt * -pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto) +pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto, int privileged) { struct rpc_xprt *xprt; struct rpc_clnt *clnt; @@ -208,7 +208,8 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto) if (IS_ERR(xprt)) return (struct rpc_clnt *)xprt; xprt->addr.sin_port = htons(RPC_PMAP_PORT); - xprt->resvport = 0; + if (!privileged) + xprt->resvport = 0; /* printk("pmap: create clnt\n"); */ clnt = rpc_new_client(xprt, hostname, From ea635a517e350eb03ab5f01618417f31b82a9a4d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 6 Oct 2005 23:12:58 -0400 Subject: [PATCH 039/126] SUNRPC: Retry rpcbind requests if the server's portmapper isn't up After a server crash/reboot, rebinding should always retry, otherwise requests on "hard" mounts will fail when they shouldn't. Test plan: Run a lock-intensive workload against a server while rebooting the server repeatedly. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 5a8f01d726e9..a5f7029b1daa 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -759,7 +759,8 @@ call_bind_status(struct rpc_task *task) case -EACCES: dprintk("RPC: %4d remote rpcbind: RPC program/version unavailable\n", task->tk_pid); - break; + rpc_delay(task, 3*HZ); + goto retry_bind; case -ETIMEDOUT: dprintk("RPC: %4d rpcbind request timed out\n", task->tk_pid); From 5e5ce5be6f0161d2a069a4f8a1154fe639c5c02f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:11 -0700 Subject: [PATCH 040/126] RPC: allow call_encode() to delay transmission of an RPC call. Currently, call_encode will cause the entire RPC call to abort if it returns an error. This is unnecessarily rigid, and gets in the way of attempts to allow the NFSv4 layer to order RPC calls that carry sequence ids. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/clnt.c | 23 ++++++++++++----------- net/sunrpc/xprt.c | 8 ++++++++ 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 99cad3ead81d..068e1fb0868b 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -211,6 +211,7 @@ int xprt_reserve_xprt(struct rpc_task *task); int xprt_reserve_xprt_cong(struct rpc_task *task); int xprt_prepare_transmit(struct rpc_task *task); void xprt_transmit(struct rpc_task *task); +void xprt_abort_transmit(struct rpc_task *task); int xprt_adjust_timeout(struct rpc_rqst *req); void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task); void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index a5f7029b1daa..534274056329 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -678,13 +678,11 @@ call_allocate(struct rpc_task *task) static void call_encode(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct xdr_buf *sndbuf = &req->rq_snd_buf; struct xdr_buf *rcvbuf = &req->rq_rcv_buf; unsigned int bufsiz; kxdrproc_t encode; - int status; u32 *p; dprintk("RPC: %4d call_encode (status %d)\n", @@ -712,12 +710,9 @@ call_encode(struct rpc_task *task) rpc_exit(task, -EIO); return; } - if (encode && (status = rpcauth_wrap_req(task, encode, req, p, - task->tk_msg.rpc_argp)) < 0) { - printk(KERN_WARNING "%s: can't encode arguments: %d\n", - clnt->cl_protname, -status); - rpc_exit(task, status); - } + if (encode != NULL) + task->tk_status = rpcauth_wrap_req(task, encode, req, p, + task->tk_msg.rpc_argp); } /* @@ -865,10 +860,12 @@ call_transmit(struct rpc_task *task) if (task->tk_status != 0) return; /* Encode here so that rpcsec_gss can use correct sequence number. */ - if (!task->tk_rqstp->rq_bytes_sent) + if (task->tk_rqstp->rq_bytes_sent == 0) { call_encode(task); - if (task->tk_status < 0) - return; + /* Did the encode result in an error condition? */ + if (task->tk_status != 0) + goto out_nosend; + } xprt_transmit(task); if (task->tk_status < 0) return; @@ -876,6 +873,10 @@ call_transmit(struct rpc_task *task) task->tk_action = NULL; rpc_wake_up_task(task); } + return; +out_nosend: + /* release socket write lock before attempting to handle error */ + xprt_abort_transmit(task); } /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 215be0d0ef6b..1ba55dc38b7a 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -709,6 +709,14 @@ int xprt_prepare_transmit(struct rpc_task *task) return err; } +void +xprt_abort_transmit(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + xprt_release_write(xprt, task); +} + /** * xprt_transmit - send an RPC request on a transport * @task: controlling RPC task From cee54fc944422c44e476736c045a9e8053cb0644 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:12 -0700 Subject: [PATCH 041/126] NFSv4: Add functions to order RPC calls NFSv4 file state-changing functions such as OPEN, CLOSE, LOCK,... are all labelled with "sequence identifiers" in order to prevent the server from reordering RPC requests, as this could cause its file state to become out of sync with the client. Currently the NFS client code enforces this ordering locally using semaphores to restrict access to structures until the RPC call is done. This, of course, only works with synchronous RPC calls, since the user process must first grab the semaphore. By dropping semaphores, and instead teaching the RPC engine to hold the RPC calls until they are ready to be sent, we can extend this process to work nicely with asynchronous RPC calls too. This patch adds a new list called "rpc_sequence" that defines the order of the RPC calls to be sent. We add one such list for each state_owner. When an RPC call is ready to be sent, it checks if it is top of the rpc_sequence list. If so, it proceeds. If not, it goes back to sleep, and loops until it hits top of the list. Once the RPC call has completed, it can then bump the sequence id counter, and remove itself from the rpc_sequence list, and then wake up the next sleeper. Note that the state_owner sequence ids and lock_owner sequence ids are all indexed to the same rpc_sequence list, so OPEN, LOCK,... requests are all ordered w.r.t. each other. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 42 ++++++++++-- fs/nfs/nfs4proc.c | 111 ++++++++++++++++++++----------- fs/nfs/nfs4state.c | 143 +++++++++++++++++++++++++++++++++------- fs/nfs/nfs4xdr.c | 43 +++++++++--- include/linux/nfs_xdr.h | 15 +++-- 5 files changed, 272 insertions(+), 82 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index ec1a22d7b876..6ac6708484f5 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -92,6 +92,35 @@ struct nfs4_client { unsigned char cl_id_uniquifier; }; +/* + * struct rpc_sequence ensures that RPC calls are sent in the exact + * order that they appear on the list. + */ +struct rpc_sequence { + struct rpc_wait_queue wait; /* RPC call delay queue */ + spinlock_t lock; /* Protects the list */ + struct list_head list; /* Defines sequence of RPC calls */ +}; + +#define NFS_SEQID_CONFIRMED 1 +struct nfs_seqid_counter { + struct rpc_sequence *sequence; + int flags; + u32 counter; +}; + +struct nfs_seqid { + struct list_head list; + struct nfs_seqid_counter *sequence; + struct rpc_task *task; +}; + +static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status) +{ + if (seqid_mutating_err(-status)) + seqid->flags |= NFS_SEQID_CONFIRMED; +} + /* * NFS4 state_owners and lock_owners are simply labels for ordered * sequences of RPC calls. Their sole purpose is to provide once-only @@ -106,12 +135,13 @@ struct nfs4_state_owner { struct nfs4_client *so_client; u32 so_id; /* 32-bit identifier, unique */ struct semaphore so_sema; - u32 so_seqid; /* protected by so_sema */ atomic_t so_count; struct rpc_cred *so_cred; /* Associated cred */ struct list_head so_states; struct list_head so_delegations; + struct nfs_seqid_counter so_seqid; + struct rpc_sequence so_sequence; }; /* @@ -132,7 +162,7 @@ struct nfs4_lock_state { fl_owner_t ls_owner; /* POSIX lock owner */ #define NFS_LOCK_INITIALIZED 1 int ls_flags; - u32 ls_seqid; + struct nfs_seqid_counter ls_seqid; u32 ls_id; nfs4_stateid ls_stateid; atomic_t ls_count; @@ -224,12 +254,16 @@ extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_close_state(struct nfs4_state *, mode_t); extern struct nfs4_state *nfs4_find_state(struct inode *, struct rpc_cred *, mode_t mode); -extern void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp); extern void nfs4_schedule_state_recovery(struct nfs4_client *); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); -extern void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *ls); extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); +extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter); +extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); +extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); +extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); +extern void nfs_free_seqid(struct nfs_seqid *seqid); + extern const nfs4_stateid zero_stateid; /* nfs4xdr.c */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9701ca8c9428..9ba89e7cdd28 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -218,7 +218,6 @@ static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *st struct nfs_delegation *delegation = NFS_I(inode)->delegation; struct nfs_openargs o_arg = { .fh = NFS_FH(inode), - .seqid = sp->so_seqid, .id = sp->so_id, .open_flags = state->state, .clientid = server->nfs4_state->cl_clientid, @@ -245,8 +244,13 @@ static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *st } o_arg.u.delegation_type = delegation->type; } + o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); + if (o_arg.seqid == NULL) + return -ENOMEM; status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - nfs4_increment_seqid(status, sp); + /* Confirm the sequence as being established */ + nfs_confirm_seqid(&sp->so_seqid, status); + nfs_increment_open_seqid(status, o_arg.seqid); if (status == 0) { memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid)); if (o_res.delegation_type != 0) { @@ -256,6 +260,7 @@ static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *st nfs_async_inode_return_delegation(inode, &o_res.stateid); } } + nfs_free_seqid(o_arg.seqid); clear_bit(NFS_DELEGATED_STATE, &state->flags); /* Ensure we update the inode attributes */ NFS_CACHEINV(inode); @@ -307,16 +312,20 @@ static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state goto out; if (state->state == 0) goto out; - arg.seqid = sp->so_seqid; + arg.seqid = nfs_alloc_seqid(&sp->so_seqid); + status = -ENOMEM; + if (arg.seqid == NULL) + goto out; arg.open_flags = state->state; memcpy(arg.u.delegation.data, state->stateid.data, sizeof(arg.u.delegation.data)); status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - nfs4_increment_seqid(status, sp); + nfs_increment_open_seqid(status, arg.seqid); if (status >= 0) { memcpy(state->stateid.data, res.stateid.data, sizeof(state->stateid.data)); clear_bit(NFS_DELEGATED_STATE, &state->flags); } + nfs_free_seqid(arg.seqid); out: up(&sp->so_sema); dput(parent); @@ -345,11 +354,11 @@ int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) return err; } -static inline int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid) +static inline int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid) { struct nfs_open_confirmargs arg = { .fh = fh, - .seqid = sp->so_seqid, + .seqid = seqid, .stateid = *stateid, }; struct nfs_open_confirmres res; @@ -362,7 +371,9 @@ static inline int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nf int status; status = rpc_call_sync(clnt, &msg, RPC_TASK_NOINTR); - nfs4_increment_seqid(status, sp); + /* Confirm the sequence as being established */ + nfs_confirm_seqid(&sp->so_seqid, status); + nfs_increment_open_seqid(status, seqid); if (status >= 0) memcpy(stateid, &res.stateid, sizeof(*stateid)); return status; @@ -380,21 +391,21 @@ static int _nfs4_proc_open(struct inode *dir, struct nfs4_state_owner *sp, stru int status; /* Update sequence id. The caller must serialize! */ - o_arg->seqid = sp->so_seqid; o_arg->id = sp->so_id; o_arg->clientid = sp->so_client->cl_clientid; status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - nfs4_increment_seqid(status, sp); + nfs_increment_open_seqid(status, o_arg->seqid); if (status != 0) goto out; update_changeattr(dir, &o_res->cinfo); if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { status = _nfs4_proc_open_confirm(server->client, &o_res->fh, - sp, &o_res->stateid); + sp, &o_res->stateid, o_arg->seqid); if (status != 0) goto out; } + nfs_confirm_seqid(&sp->so_seqid, 0); if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) status = server->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr); out: @@ -465,6 +476,10 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st set_bit(NFS_DELEGATED_STATE, &state->flags); goto out; } + o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); + status = -ENOMEM; + if (o_arg.seqid == NULL) + goto out; status = _nfs4_proc_open(dir, sp, &o_arg, &o_res); if (status != 0) goto out_nodeleg; @@ -490,6 +505,7 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st nfs_inode_reclaim_delegation(inode, sp->so_cred, &o_res); } out_nodeleg: + nfs_free_seqid(o_arg.seqid); clear_bit(NFS_DELEGATED_STATE, &state->flags); out: dput(parent); @@ -667,6 +683,9 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st /* Serialization for the sequence id */ down(&sp->so_sema); + o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); + if (o_arg.seqid == NULL) + return -ENOMEM; status = _nfs4_proc_open(dir, sp, &o_arg, &o_res); if (status != 0) goto out_err; @@ -681,6 +700,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st update_open_stateid(state, &o_res.stateid, flags); if (o_res.delegation_type != 0) nfs_inode_set_delegation(inode, cred, &o_res); + nfs_free_seqid(o_arg.seqid); up(&sp->so_sema); nfs4_put_state_owner(sp); up_read(&clp->cl_sem); @@ -690,6 +710,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st if (sp != NULL) { if (state != NULL) nfs4_put_open_state(state); + nfs_free_seqid(o_arg.seqid); up(&sp->so_sema); nfs4_put_state_owner(sp); } @@ -718,7 +739,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry, * It is actually a sign of a bug on the client or on the server. * * If we receive a BAD_SEQID error in the particular case of - * doing an OPEN, we assume that nfs4_increment_seqid() will + * doing an OPEN, we assume that nfs_increment_open_seqid() will * have unhashed the old state_owner for us, and that we can * therefore safely retry using a new one. We should still warn * the user though... @@ -799,7 +820,7 @@ static void nfs4_close_done(struct rpc_task *task) /* hmm. we are done with the inode, and in the process of freeing * the state_owner. we keep this around to process errors */ - nfs4_increment_seqid(task->tk_status, sp); + nfs_increment_open_seqid(task->tk_status, calldata->arg.seqid); switch (task->tk_status) { case 0: memcpy(&state->stateid, &calldata->res.stateid, @@ -818,6 +839,7 @@ static void nfs4_close_done(struct rpc_task *task) } state->state = calldata->arg.open_flags; nfs4_put_open_state(state); + nfs_free_seqid(calldata->arg.seqid); up(&sp->so_sema); nfs4_put_state_owner(sp); up_read(&server->nfs4_state->cl_sem); @@ -865,7 +887,11 @@ int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode) calldata->state = state; calldata->arg.fh = NFS_FH(inode); /* Serialization for the sequence id */ - calldata->arg.seqid = state->owner->so_seqid; + calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); + if (calldata->arg.seqid == NULL) { + kfree(calldata); + return -ENOMEM; + } calldata->arg.open_flags = mode; memcpy(&calldata->arg.stateid, &state->stateid, sizeof(calldata->arg.stateid)); @@ -2729,15 +2755,19 @@ static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock /* We might have lost the locks! */ if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) goto out; - luargs.seqid = lsp->ls_seqid; - memcpy(&luargs.stateid, &lsp->ls_stateid, sizeof(luargs.stateid)); + luargs.seqid = nfs_alloc_seqid(&lsp->ls_seqid); + status = -ENOMEM; + if (luargs.seqid == NULL) + goto out; + memcpy(luargs.stateid.data, lsp->ls_stateid.data, sizeof(luargs.stateid.data)); arg.u.locku = &luargs; status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - nfs4_increment_lock_seqid(status, lsp); + nfs_increment_lock_seqid(status, luargs.seqid); if (status == 0) - memcpy(&lsp->ls_stateid, &res.u.stateid, - sizeof(lsp->ls_stateid)); + memcpy(lsp->ls_stateid.data, res.u.stateid.data, + sizeof(lsp->ls_stateid.data)); + nfs_free_seqid(luargs.seqid); out: up(&state->lock_sema); if (status == 0) @@ -2783,9 +2813,13 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r .reclaim = reclaim, .new_lock_owner = 0, }; - int status; + struct nfs_seqid *lock_seqid; + int status = -ENOMEM; - if (!(lsp->ls_flags & NFS_LOCK_INITIALIZED)) { + lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); + if (lock_seqid == NULL) + return -ENOMEM; + if (!(lsp->ls_seqid.flags & NFS_SEQID_CONFIRMED)) { struct nfs4_state_owner *owner = state->owner; struct nfs_open_to_lock otl = { .lock_owner = { @@ -2793,39 +2827,40 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r }, }; - otl.lock_seqid = lsp->ls_seqid; + otl.lock_seqid = lock_seqid; otl.lock_owner.id = lsp->ls_id; memcpy(&otl.open_stateid, &state->stateid, sizeof(otl.open_stateid)); largs.u.open_lock = &otl; largs.new_lock_owner = 1; arg.u.lock = &largs; down(&owner->so_sema); - otl.open_seqid = owner->so_seqid; - status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - /* increment open_owner seqid on success, and - * seqid mutating errors */ - nfs4_increment_seqid(status, owner); - up(&owner->so_sema); - if (status == 0) { - lsp->ls_flags |= NFS_LOCK_INITIALIZED; - lsp->ls_seqid++; + otl.open_seqid = nfs_alloc_seqid(&owner->so_seqid); + if (otl.open_seqid != NULL) { + status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); + /* increment seqid on success, and seqid mutating errors */ + nfs_increment_open_seqid(status, otl.open_seqid); + nfs_free_seqid(otl.open_seqid); } + up(&owner->so_sema); + if (status == 0) + nfs_confirm_seqid(&lsp->ls_seqid, 0); } else { - struct nfs_exist_lock el = { - .seqid = lsp->ls_seqid, - }; + struct nfs_exist_lock el; memcpy(&el.stateid, &lsp->ls_stateid, sizeof(el.stateid)); largs.u.exist_lock = ⪙ arg.u.lock = &largs; + el.seqid = lock_seqid; status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - /* increment seqid on success, and * seqid mutating errors*/ - nfs4_increment_lock_seqid(status, lsp); } + /* increment seqid on success, and seqid mutating errors*/ + nfs_increment_lock_seqid(status, lock_seqid); /* save the returned stateid. */ - if (status == 0) - memcpy(&lsp->ls_stateid, &res.u.stateid, sizeof(nfs4_stateid)); - else if (status == -NFS4ERR_DENIED) + if (status == 0) { + memcpy(lsp->ls_stateid.data, res.u.stateid.data, sizeof(lsp->ls_stateid.data)); + lsp->ls_flags |= NFS_LOCK_INITIALIZED; + } else if (status == -NFS4ERR_DENIED) status = -EAGAIN; + nfs_free_seqid(lock_seqid); return status; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index afe587d82f1e..f535c219cf3a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -264,13 +264,16 @@ nfs4_alloc_state_owner(void) { struct nfs4_state_owner *sp; - sp = kmalloc(sizeof(*sp),GFP_KERNEL); + sp = kzalloc(sizeof(*sp),GFP_KERNEL); if (!sp) return NULL; init_MUTEX(&sp->so_sema); - sp->so_seqid = 0; /* arbitrary */ INIT_LIST_HEAD(&sp->so_states); INIT_LIST_HEAD(&sp->so_delegations); + rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); + sp->so_seqid.sequence = &sp->so_sequence; + spin_lock_init(&sp->so_sequence.lock); + INIT_LIST_HEAD(&sp->so_sequence.list); atomic_set(&sp->so_count, 1); return sp; } @@ -553,12 +556,10 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f struct nfs4_lock_state *lsp; struct nfs4_client *clp = state->owner->so_client; - lsp = kmalloc(sizeof(*lsp), GFP_KERNEL); + lsp = kzalloc(sizeof(*lsp), GFP_KERNEL); if (lsp == NULL) return NULL; - lsp->ls_flags = 0; - lsp->ls_seqid = 0; /* arbitrary */ - memset(lsp->ls_stateid.data, 0, sizeof(lsp->ls_stateid.data)); + lsp->ls_seqid.sequence = &state->owner->so_sequence; atomic_set(&lsp->ls_count, 1); lsp->ls_owner = fl_owner; spin_lock(&clp->cl_lock); @@ -673,29 +674,102 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f nfs4_put_lock_state(lsp); } -/* -* Called with state->lock_sema and clp->cl_sem held. -*/ -void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *lsp) +struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) { - if (status == NFS_OK || seqid_mutating_err(-status)) - lsp->ls_seqid++; + struct rpc_sequence *sequence = counter->sequence; + struct nfs_seqid *new; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (new != NULL) { + new->sequence = counter; + new->task = NULL; + spin_lock(&sequence->lock); + list_add_tail(&new->list, &sequence->list); + spin_unlock(&sequence->lock); + } + return new; +} + +void nfs_free_seqid(struct nfs_seqid *seqid) +{ + struct rpc_sequence *sequence = seqid->sequence->sequence; + struct rpc_task *next = NULL; + + spin_lock(&sequence->lock); + list_del(&seqid->list); + if (!list_empty(&sequence->list)) { + next = list_entry(sequence->list.next, struct nfs_seqid, list)->task; + if (next) + rpc_wake_up_task(next); + } + spin_unlock(&sequence->lock); + kfree(seqid); } /* -* Called with sp->so_sema and clp->cl_sem held. -* -* Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or -* failed with a seqid incrementing error - -* see comments nfs_fs.h:seqid_mutating_error() -*/ -void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp) + * Called with sp->so_sema and clp->cl_sem held. + * + * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or + * failed with a seqid incrementing error - + * see comments nfs_fs.h:seqid_mutating_error() + */ +static inline void nfs_increment_seqid(int status, struct nfs_seqid *seqid) { - if (status == NFS_OK || seqid_mutating_err(-status)) - sp->so_seqid++; - /* If the server returns BAD_SEQID, unhash state_owner here */ - if (status == -NFS4ERR_BAD_SEQID) + switch (status) { + case 0: + break; + case -NFS4ERR_BAD_SEQID: + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_BADXDR: + case -NFS4ERR_RESOURCE: + case -NFS4ERR_NOFILEHANDLE: + /* Non-seqid mutating errors */ + return; + }; + /* + * Note: no locking needed as we are guaranteed to be first + * on the sequence list + */ + seqid->sequence->counter++; +} + +void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) +{ + if (status == -NFS4ERR_BAD_SEQID) { + struct nfs4_state_owner *sp = container_of(seqid->sequence, + struct nfs4_state_owner, so_seqid); nfs4_drop_state_owner(sp); + } + return nfs_increment_seqid(status, seqid); +} + +/* + * Called with ls->lock_sema and clp->cl_sem held. + * + * Increment the seqid if the LOCK/LOCKU succeeded, or + * failed with a seqid incrementing error - + * see comments nfs_fs.h:seqid_mutating_error() + */ +void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) +{ + return nfs_increment_seqid(status, seqid); +} + +int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) +{ + struct rpc_sequence *sequence = seqid->sequence->sequence; + int status = 0; + + spin_lock(&sequence->lock); + if (sequence->list.next != &seqid->list) { + seqid->task = task; + rpc_sleep_on(&sequence->wait, task, NULL, NULL); + status = -EAGAIN; + } + spin_unlock(&sequence->lock); + return status; } static int reclaimer(void *); @@ -791,8 +865,6 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n if (state->state == 0) continue; status = ops->recover_open(sp, state); - list_for_each_entry(lock, &state->lock_states, ls_locks) - lock->ls_flags &= ~NFS_LOCK_INITIALIZED; if (status >= 0) { status = nfs4_reclaim_locks(ops, state); if (status < 0) @@ -831,6 +903,26 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n return status; } +static void nfs4_state_mark_reclaim(struct nfs4_client *clp) +{ + struct nfs4_state_owner *sp; + struct nfs4_state *state; + struct nfs4_lock_state *lock; + + /* Reset all sequence ids to zero */ + list_for_each_entry(sp, &clp->cl_state_owners, so_list) { + sp->so_seqid.counter = 0; + sp->so_seqid.flags = 0; + list_for_each_entry(state, &sp->so_states, open_states) { + list_for_each_entry(lock, &state->lock_states, ls_locks) { + lock->ls_seqid.counter = 0; + lock->ls_seqid.flags = 0; + lock->ls_flags &= ~NFS_LOCK_INITIALIZED; + } + } + } +} + static int reclaimer(void *ptr) { struct reclaimer_args *args = (struct reclaimer_args *)ptr; @@ -864,6 +956,7 @@ static int reclaimer(void *ptr) default: ops = &nfs4_network_partition_recovery_ops; }; + nfs4_state_mark_reclaim(clp); status = __nfs4_init_client(clp); if (status) goto out_error; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 6c564ef9489e..fcd28a29a2f8 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -604,7 +604,7 @@ static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg) RESERVE_SPACE(8+sizeof(arg->stateid.data)); WRITE32(OP_CLOSE); - WRITE32(arg->seqid); + WRITE32(arg->seqid->sequence->counter); WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data)); return 0; @@ -732,9 +732,9 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lockargs *arg) struct nfs_open_to_lock *ol = opargs->u.open_lock; RESERVE_SPACE(40); - WRITE32(ol->open_seqid); + WRITE32(ol->open_seqid->sequence->counter); WRITEMEM(&ol->open_stateid, sizeof(ol->open_stateid)); - WRITE32(ol->lock_seqid); + WRITE32(ol->lock_seqid->sequence->counter); WRITE64(ol->lock_owner.clientid); WRITE32(4); WRITE32(ol->lock_owner.id); @@ -744,7 +744,7 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lockargs *arg) RESERVE_SPACE(20); WRITEMEM(&el->stateid, sizeof(el->stateid)); - WRITE32(el->seqid); + WRITE32(el->seqid->sequence->counter); } return 0; @@ -775,7 +775,7 @@ static int encode_locku(struct xdr_stream *xdr, const struct nfs_lockargs *arg) RESERVE_SPACE(44); WRITE32(OP_LOCKU); WRITE32(arg->type); - WRITE32(opargs->seqid); + WRITE32(opargs->seqid->sequence->counter); WRITEMEM(&opargs->stateid, sizeof(opargs->stateid)); WRITE64(arg->offset); WRITE64(arg->length); @@ -826,7 +826,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena */ RESERVE_SPACE(8); WRITE32(OP_OPEN); - WRITE32(arg->seqid); + WRITE32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->open_flags); RESERVE_SPACE(16); WRITE64(arg->clientid); @@ -941,7 +941,7 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con RESERVE_SPACE(8+sizeof(arg->stateid.data)); WRITE32(OP_OPEN_CONFIRM); WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data)); - WRITE32(arg->seqid); + WRITE32(arg->seqid->sequence->counter); return 0; } @@ -953,7 +953,7 @@ static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closea RESERVE_SPACE(8+sizeof(arg->stateid.data)); WRITE32(OP_OPEN_DOWNGRADE); WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data)); - WRITE32(arg->seqid); + WRITE32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->open_flags); return 0; } @@ -1416,6 +1416,9 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, uint32_t *p, struct nfs_clos }; int status; + status = nfs_wait_on_sequence(args->seqid, req->rq_task); + if (status != 0) + goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); @@ -1437,6 +1440,9 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, uint32_t *p, struct nfs_opena }; int status; + status = nfs_wait_on_sequence(args->seqid, req->rq_task); + if (status != 0) + goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); @@ -1464,6 +1470,9 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, uint32_t *p, struct n }; int status; + status = nfs_wait_on_sequence(args->seqid, req->rq_task); + if (status != 0) + goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); @@ -1485,6 +1494,9 @@ static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, uint32_t *p, struct nf }; int status; + status = nfs_wait_on_sequence(args->seqid, req->rq_task); + if (status != 0) + goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); @@ -1506,6 +1518,9 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, uint32_t *p, struct }; int status; + status = nfs_wait_on_sequence(args->seqid, req->rq_task); + if (status != 0) + goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); @@ -1525,8 +1540,17 @@ static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_locka struct compound_hdr hdr = { .nops = 2, }; + struct nfs_lock_opargs *opargs = args->u.lock; + struct nfs_seqid *seqid; int status; + if (opargs->new_lock_owner) + seqid = opargs->u.open_lock->lock_seqid; + else + seqid = opargs->u.exist_lock->seqid; + status = nfs_wait_on_sequence(seqid, req->rq_task); + if (status != 0) + goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); @@ -1569,6 +1593,9 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, uint32_t *p, struct nfs_lock }; int status; + status = nfs_wait_on_sequence(args->u.locku->seqid, req->rq_task); + if (status != 0) + goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index a2bf6914ff1b..d578912bf9a9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -96,12 +96,13 @@ struct nfs4_change_info { u64 after; }; +struct nfs_seqid; /* * Arguments to the open call. */ struct nfs_openargs { const struct nfs_fh * fh; - __u32 seqid; + struct nfs_seqid * seqid; int open_flags; __u64 clientid; __u32 id; @@ -136,7 +137,7 @@ struct nfs_openres { struct nfs_open_confirmargs { const struct nfs_fh * fh; nfs4_stateid stateid; - __u32 seqid; + struct nfs_seqid * seqid; }; struct nfs_open_confirmres { @@ -149,7 +150,7 @@ struct nfs_open_confirmres { struct nfs_closeargs { struct nfs_fh * fh; nfs4_stateid stateid; - __u32 seqid; + struct nfs_seqid * seqid; int open_flags; }; @@ -165,15 +166,15 @@ struct nfs_lowner { }; struct nfs_open_to_lock { - __u32 open_seqid; + struct nfs_seqid * open_seqid; nfs4_stateid open_stateid; - __u32 lock_seqid; + struct nfs_seqid * lock_seqid; struct nfs_lowner lock_owner; }; struct nfs_exist_lock { nfs4_stateid stateid; - __u32 seqid; + struct nfs_seqid * seqid; }; struct nfs_lock_opargs { @@ -186,7 +187,7 @@ struct nfs_lock_opargs { }; struct nfs_locku_opargs { - __u32 seqid; + struct nfs_seqid * seqid; nfs4_stateid stateid; }; From 9512135df14f8293b9bc5e8fb22d4279dee5ff66 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:12 -0700 Subject: [PATCH 042/126] NFSv4: Fix a potential CLOSE race Once the state_owner and lock_owner semaphores get removed, it will be possible for other OPEN requests to reopen the same file if they have lower sequence ids than our CLOSE call. This patch ensures that we recheck the file state once nfs_wait_on_sequence() has completed waiting. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 110 ++++++++++++++++++++++++++++------------ fs/nfs/nfs4state.c | 6 ++- fs/nfs/nfs4xdr.c | 14 ++--- include/linux/nfs_xdr.h | 2 +- 4 files changed, 87 insertions(+), 45 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9ba89e7cdd28..5154ddf6d9a5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -189,6 +189,21 @@ static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinf nfsi->change_attr = cinfo->after; } +/* Helper for asynchronous RPC calls */ +static int nfs4_call_async(struct rpc_clnt *clnt, rpc_action tk_begin, + rpc_action tk_exit, void *calldata) +{ + struct rpc_task *task; + + if (!(task = rpc_new_task(clnt, tk_exit, RPC_TASK_ASYNC))) + return -ENOMEM; + + task->tk_calldata = calldata; + task->tk_action = tk_begin; + rpc_execute(task); + return 0; +} + static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) { struct inode *inode = state->inode; @@ -810,11 +825,24 @@ struct nfs4_closedata { struct nfs_closeres res; }; +static void nfs4_free_closedata(struct nfs4_closedata *calldata) +{ + struct nfs4_state *state = calldata->state; + struct nfs4_state_owner *sp = state->owner; + struct nfs_server *server = NFS_SERVER(calldata->inode); + + nfs4_put_open_state(calldata->state); + nfs_free_seqid(calldata->arg.seqid); + up(&sp->so_sema); + nfs4_put_state_owner(sp); + up_read(&server->nfs4_state->cl_sem); + kfree(calldata); +} + static void nfs4_close_done(struct rpc_task *task) { struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata; struct nfs4_state *state = calldata->state; - struct nfs4_state_owner *sp = state->owner; struct nfs_server *server = NFS_SERVER(calldata->inode); /* hmm. we are done with the inode, and in the process of freeing @@ -838,25 +866,46 @@ static void nfs4_close_done(struct rpc_task *task) } } state->state = calldata->arg.open_flags; - nfs4_put_open_state(state); - nfs_free_seqid(calldata->arg.seqid); - up(&sp->so_sema); - nfs4_put_state_owner(sp); - up_read(&server->nfs4_state->cl_sem); - kfree(calldata); + nfs4_free_closedata(calldata); } -static inline int nfs4_close_call(struct rpc_clnt *clnt, struct nfs4_closedata *calldata) +static void nfs4_close_begin(struct rpc_task *task) { + struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata; + struct nfs4_state *state = calldata->state; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE], .rpc_argp = &calldata->arg, .rpc_resp = &calldata->res, - .rpc_cred = calldata->state->owner->so_cred, + .rpc_cred = state->owner->so_cred, }; - if (calldata->arg.open_flags != 0) + int mode = 0; + int status; + + status = nfs_wait_on_sequence(calldata->arg.seqid, task); + if (status != 0) + return; + /* Don't reorder reads */ + smp_rmb(); + /* Recalculate the new open mode in case someone reopened the file + * while we were waiting in line to be scheduled. + */ + if (state->nreaders != 0) + mode |= FMODE_READ; + if (state->nwriters != 0) + mode |= FMODE_WRITE; + if (test_bit(NFS_DELEGATED_STATE, &state->flags)) + state->state = mode; + if (mode == state->state) { + nfs4_free_closedata(calldata); + task->tk_exit = NULL; + rpc_exit(task, 0); + return; + } + if (mode != 0) msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; - return rpc_call_async(clnt, &msg, 0, nfs4_close_done, calldata); + calldata->arg.open_flags = mode; + rpc_call_setup(task, &msg, 0); } /* @@ -873,35 +922,30 @@ static inline int nfs4_close_call(struct rpc_clnt *clnt, struct nfs4_closedata * int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode) { struct nfs4_closedata *calldata; - int status; + int status = -ENOMEM; - /* Tell caller we're done */ - if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { - state->state = mode; - return 0; - } - calldata = (struct nfs4_closedata *)kmalloc(sizeof(*calldata), GFP_KERNEL); + calldata = kmalloc(sizeof(*calldata), GFP_KERNEL); if (calldata == NULL) - return -ENOMEM; + goto out; calldata->inode = inode; calldata->state = state; calldata->arg.fh = NFS_FH(inode); + calldata->arg.stateid = &state->stateid; /* Serialization for the sequence id */ calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); - if (calldata->arg.seqid == NULL) { - kfree(calldata); - return -ENOMEM; - } - calldata->arg.open_flags = mode; - memcpy(&calldata->arg.stateid, &state->stateid, - sizeof(calldata->arg.stateid)); - status = nfs4_close_call(NFS_SERVER(inode)->client, calldata); - /* - * Return -EINPROGRESS on success in order to indicate to the - * caller that an asynchronous RPC call has been launched, and - * that it will release the semaphores on completion. - */ - return (status == 0) ? -EINPROGRESS : status; + if (calldata->arg.seqid == NULL) + goto out_free_calldata; + + status = nfs4_call_async(NFS_SERVER(inode)->client, nfs4_close_begin, + nfs4_close_done, calldata); + if (status == 0) + goto out; + + nfs_free_seqid(calldata->arg.seqid); +out_free_calldata: + kfree(calldata); +out: + return status; } struct inode * diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f535c219cf3a..59c93f37e1b2 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -518,7 +518,11 @@ void nfs4_close_state(struct nfs4_state *state, mode_t mode) newstate |= FMODE_WRITE; if (state->state == newstate) goto out; - if (nfs4_do_close(inode, state, newstate) == -EINPROGRESS) + if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { + state->state = newstate; + goto out; + } + if (nfs4_do_close(inode, state, newstate) == 0) return; } out: diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index fcd28a29a2f8..934ec50ea6bf 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -602,10 +602,10 @@ static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg) { uint32_t *p; - RESERVE_SPACE(8+sizeof(arg->stateid.data)); + RESERVE_SPACE(8+sizeof(arg->stateid->data)); WRITE32(OP_CLOSE); WRITE32(arg->seqid->sequence->counter); - WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data)); + WRITEMEM(arg->stateid->data, sizeof(arg->stateid->data)); return 0; } @@ -950,9 +950,9 @@ static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closea { uint32_t *p; - RESERVE_SPACE(8+sizeof(arg->stateid.data)); + RESERVE_SPACE(8+sizeof(arg->stateid->data)); WRITE32(OP_OPEN_DOWNGRADE); - WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data)); + WRITEMEM(arg->stateid->data, sizeof(arg->stateid->data)); WRITE32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->open_flags); return 0; @@ -1416,9 +1416,6 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, uint32_t *p, struct nfs_clos }; int status; - status = nfs_wait_on_sequence(args->seqid, req->rq_task); - if (status != 0) - goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); @@ -1518,9 +1515,6 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, uint32_t *p, struct }; int status; - status = nfs_wait_on_sequence(args->seqid, req->rq_task); - if (status != 0) - goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index d578912bf9a9..cac0df950c66 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -149,7 +149,7 @@ struct nfs_open_confirmres { */ struct nfs_closeargs { struct nfs_fh * fh; - nfs4_stateid stateid; + nfs4_stateid * stateid; struct nfs_seqid * seqid; int open_flags; }; From e6dfa553cffcb9740f932311dff42f81d6ac63bb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:13 -0700 Subject: [PATCH 043/126] NFSv4: Remove obsolete state_owner and lock_owner semaphores OPEN, CLOSE, etc no longer need these semaphores to ensure ordering of requests. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 6 ------ fs/nfs/nfs4proc.c | 18 ------------------ fs/nfs/nfs4state.c | 15 ++++----------- 3 files changed, 4 insertions(+), 35 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 6ac6708484f5..d4fcb5d0ce6c 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -125,16 +125,11 @@ static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status * NFS4 state_owners and lock_owners are simply labels for ordered * sequences of RPC calls. Their sole purpose is to provide once-only * semantics by allowing the server to identify replayed requests. - * - * The ->so_sema is held during all state_owner seqid-mutating operations: - * OPEN, OPEN_DOWNGRADE, and CLOSE. Its purpose is to properly serialize - * so_seqid. */ struct nfs4_state_owner { struct list_head so_list; /* per-clientid list of state_owners */ struct nfs4_client *so_client; u32 so_id; /* 32-bit identifier, unique */ - struct semaphore so_sema; atomic_t so_count; struct rpc_cred *so_cred; /* Associated cred */ @@ -183,7 +178,6 @@ struct nfs4_state { struct inode *inode; /* Pointer to the inode */ unsigned long flags; /* Do we hold any locks? */ - struct semaphore lock_sema; /* Serializes file locking operations */ spinlock_t state_lock; /* Protects the lock_states list */ nfs4_stateid stateid; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5154ddf6d9a5..25bab6032dfe 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -224,7 +224,6 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, /* * OPEN_RECLAIM: * reclaim state on the server after a reboot. - * Assumes caller is holding the sp->so_sem */ static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) { @@ -322,7 +321,6 @@ static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state }; int status = 0; - down(&sp->so_sema); if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) goto out; if (state->state == 0) @@ -342,7 +340,6 @@ static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state } nfs_free_seqid(arg.seqid); out: - up(&sp->so_sema); dput(parent); return status; } @@ -595,7 +592,6 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__); goto out_err; } - down(&sp->so_sema); state = nfs4_get_open_state(inode, sp); if (state == NULL) goto out_err; @@ -620,7 +616,6 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred set_bit(NFS_DELEGATED_STATE, &state->flags); update_open_stateid(state, &delegation->stateid, open_flags); out_ok: - up(&sp->so_sema); nfs4_put_state_owner(sp); up_read(&nfsi->rwsem); up_read(&clp->cl_sem); @@ -631,7 +626,6 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred if (sp != NULL) { if (state != NULL) nfs4_put_open_state(state); - up(&sp->so_sema); nfs4_put_state_owner(sp); } up_read(&nfsi->rwsem); @@ -696,7 +690,6 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st } else o_arg.u.attrs = sattr; /* Serialization for the sequence id */ - down(&sp->so_sema); o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); if (o_arg.seqid == NULL) @@ -716,7 +709,6 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st if (o_res.delegation_type != 0) nfs_inode_set_delegation(inode, cred, &o_res); nfs_free_seqid(o_arg.seqid); - up(&sp->so_sema); nfs4_put_state_owner(sp); up_read(&clp->cl_sem); *res = state; @@ -726,7 +718,6 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st if (state != NULL) nfs4_put_open_state(state); nfs_free_seqid(o_arg.seqid); - up(&sp->so_sema); nfs4_put_state_owner(sp); } /* Note: clp->cl_sem must be released before nfs4_put_open_state()! */ @@ -833,7 +824,6 @@ static void nfs4_free_closedata(struct nfs4_closedata *calldata) nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); - up(&sp->so_sema); nfs4_put_state_owner(sp); up_read(&server->nfs4_state->cl_sem); kfree(calldata); @@ -2702,7 +2692,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock down_read(&clp->cl_sem); nlo.clientid = clp->cl_clientid; - down(&state->lock_sema); status = nfs4_set_lock_state(state, request); if (status != 0) goto out; @@ -2729,7 +2718,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock status = 0; } out: - up(&state->lock_sema); up_read(&clp->cl_sem); return status; } @@ -2791,7 +2779,6 @@ static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock int status; down_read(&clp->cl_sem); - down(&state->lock_sema); status = nfs4_set_lock_state(state, request); if (status != 0) goto out; @@ -2813,7 +2800,6 @@ static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock sizeof(lsp->ls_stateid.data)); nfs_free_seqid(luargs.seqid); out: - up(&state->lock_sema); if (status == 0) do_vfs_lock(request->fl_file, request); up_read(&clp->cl_sem); @@ -2877,7 +2863,6 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r largs.u.open_lock = &otl; largs.new_lock_owner = 1; arg.u.lock = &largs; - down(&owner->so_sema); otl.open_seqid = nfs_alloc_seqid(&owner->so_seqid); if (otl.open_seqid != NULL) { status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); @@ -2885,7 +2870,6 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r nfs_increment_open_seqid(status, otl.open_seqid); nfs_free_seqid(otl.open_seqid); } - up(&owner->so_sema); if (status == 0) nfs_confirm_seqid(&lsp->ls_seqid, 0); } else { @@ -2944,11 +2928,9 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock int status; down_read(&clp->cl_sem); - down(&state->lock_sema); status = nfs4_set_lock_state(state, request); if (status == 0) status = _nfs4_do_setlk(state, cmd, request, 0); - up(&state->lock_sema); if (status == 0) { /* Note: we always want to sleep here! */ request->fl_flags |= FL_SLEEP; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 59c93f37e1b2..86c08c165ce7 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -267,7 +267,6 @@ nfs4_alloc_state_owner(void) sp = kzalloc(sizeof(*sp),GFP_KERNEL); if (!sp) return NULL; - init_MUTEX(&sp->so_sema); INIT_LIST_HEAD(&sp->so_states); INIT_LIST_HEAD(&sp->so_delegations); rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); @@ -362,7 +361,6 @@ nfs4_alloc_open_state(void) memset(state->stateid.data, 0, sizeof(state->stateid.data)); atomic_set(&state->count, 1); INIT_LIST_HEAD(&state->lock_states); - init_MUTEX(&state->lock_sema); spin_lock_init(&state->state_lock); return state; } @@ -444,7 +442,6 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) state = __nfs4_find_state_byowner(inode, owner); if (state == NULL && new != NULL) { state = new; - /* Caller *must* be holding owner->so_sem */ /* Note: The reclaim code dictates that we add stateless * and read-only stateids to the end of the list */ list_add_tail(&state->open_states, &owner->so_states); @@ -464,7 +461,7 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) /* * Beware! Caller must be holding exactly one - * reference to clp->cl_sem and owner->so_sema! + * reference to clp->cl_sem! */ void nfs4_put_open_state(struct nfs4_state *state) { @@ -485,7 +482,6 @@ void nfs4_put_open_state(struct nfs4_state *state) /* * Beware! Caller must be holding no references to clp->cl_sem! - * of owner->so_sema! */ void nfs4_close_state(struct nfs4_state *state, mode_t mode) { @@ -496,7 +492,6 @@ void nfs4_close_state(struct nfs4_state *state, mode_t mode) atomic_inc(&owner->so_count); down_read(&clp->cl_sem); - down(&owner->so_sema); /* Protect against nfs4_find_state() */ spin_lock(&inode->i_lock); if (mode & FMODE_READ) @@ -527,7 +522,6 @@ void nfs4_close_state(struct nfs4_state *state, mode_t mode) } out: nfs4_put_open_state(state); - up(&owner->so_sema); nfs4_put_state_owner(owner); up_read(&clp->cl_sem); } @@ -553,7 +547,6 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) * Return a compatible lock_state. If no initialized lock_state structure * exists, return an uninitialized one. * - * The caller must be holding state->lock_sema */ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) { @@ -577,7 +570,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f * Return a compatible lock_state. If no initialized lock_state structure * exists, return an uninitialized one. * - * The caller must be holding state->lock_sema and clp->cl_sem + * The caller must be holding clp->cl_sem */ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) { @@ -711,7 +704,7 @@ void nfs_free_seqid(struct nfs_seqid *seqid) } /* - * Called with sp->so_sema and clp->cl_sem held. + * Called with clp->cl_sem held. * * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or * failed with a seqid incrementing error - @@ -750,7 +743,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) } /* - * Called with ls->lock_sema and clp->cl_sem held. + * Called with clp->cl_sem held. * * Increment the seqid if the LOCK/LOCKU succeeded, or * failed with a seqid incrementing error - From 83c9d41e456033000ccfadf535f3098d8739488d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:13 -0700 Subject: [PATCH 044/126] NFSv4: Remove nfs4_client->cl_sem from close() path We no longer need to worry about collisions between close() and the state recovery code, since the new close will automatically recheck the file state once it is done waiting on its sequence slot. Ditto for the nfs4_proc_locku() procedure. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 ----- fs/nfs/nfs4state.c | 9 +-------- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 25bab6032dfe..611052bacaff 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -820,12 +820,10 @@ static void nfs4_free_closedata(struct nfs4_closedata *calldata) { struct nfs4_state *state = calldata->state; struct nfs4_state_owner *sp = state->owner; - struct nfs_server *server = NFS_SERVER(calldata->inode); nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); - up_read(&server->nfs4_state->cl_sem); kfree(calldata); } @@ -2758,7 +2756,6 @@ static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock { struct inode *inode = state->inode; struct nfs_server *server = NFS_SERVER(inode); - struct nfs4_client *clp = server->nfs4_state; struct nfs_lockargs arg = { .fh = NFS_FH(inode), .type = nfs4_lck_type(cmd, request), @@ -2778,7 +2775,6 @@ static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock struct nfs_locku_opargs luargs; int status; - down_read(&clp->cl_sem); status = nfs4_set_lock_state(state, request); if (status != 0) goto out; @@ -2802,7 +2798,6 @@ static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock out: if (status == 0) do_vfs_lock(request->fl_file, request); - up_read(&clp->cl_sem); return status; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 86c08c165ce7..bb3574361958 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -481,17 +481,15 @@ void nfs4_put_open_state(struct nfs4_state *state) } /* - * Beware! Caller must be holding no references to clp->cl_sem! + * Close the current file. */ void nfs4_close_state(struct nfs4_state *state, mode_t mode) { struct inode *inode = state->inode; struct nfs4_state_owner *owner = state->owner; - struct nfs4_client *clp = owner->so_client; int newstate; atomic_inc(&owner->so_count); - down_read(&clp->cl_sem); /* Protect against nfs4_find_state() */ spin_lock(&inode->i_lock); if (mode & FMODE_READ) @@ -523,7 +521,6 @@ void nfs4_close_state(struct nfs4_state *state, mode_t mode) out: nfs4_put_open_state(state); nfs4_put_state_owner(owner); - up_read(&clp->cl_sem); } /* @@ -704,8 +701,6 @@ void nfs_free_seqid(struct nfs_seqid *seqid) } /* - * Called with clp->cl_sem held. - * * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or * failed with a seqid incrementing error - * see comments nfs_fs.h:seqid_mutating_error() @@ -743,8 +738,6 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) } /* - * Called with clp->cl_sem held. - * * Increment the seqid if the LOCK/LOCKU succeeded, or * failed with a seqid incrementing error - * see comments nfs_fs.h:seqid_mutating_error() From 0a8838f972883112f0a7b259141b24db17583c2d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:14 -0700 Subject: [PATCH 045/126] NFSv4: Add missing handling of OPEN_CONFIRM requests on CLAIM_DELEGATE_CUR. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 611052bacaff..f57dba815099 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -56,6 +56,7 @@ #define NFS4_POLL_RETRY_MIN (1*HZ) #define NFS4_POLL_RETRY_MAX (15*HZ) +static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, struct nfs_server *); static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); @@ -333,11 +334,21 @@ static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state memcpy(arg.u.delegation.data, state->stateid.data, sizeof(arg.u.delegation.data)); status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); nfs_increment_open_seqid(status, arg.seqid); + if (status != 0) + goto out_free; + if(res.rflags & NFS4_OPEN_RESULT_CONFIRM) { + status = _nfs4_proc_open_confirm(server->client, NFS_FH(inode), + sp, &res.stateid, arg.seqid); + if (status != 0) + goto out_free; + } + nfs_confirm_seqid(&sp->so_seqid, 0); if (status >= 0) { memcpy(state->stateid.data, res.stateid.data, sizeof(state->stateid.data)); clear_bit(NFS_DELEGATED_STATE, &state->flags); } +out_free: nfs_free_seqid(arg.seqid); out: dput(parent); @@ -366,7 +377,7 @@ int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) return err; } -static inline int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid) +static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid) { struct nfs_open_confirmargs arg = { .fh = fh, From faf5f49c2d9c0af2847837c232a432cc146e203b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:15 -0700 Subject: [PATCH 046/126] NFSv4: Make NFS clean up byte range locks asynchronously Currently we fail to do so if the process was signalled. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4proc.c | 165 +++++++++++++++++++++++++++------------- fs/nfs/nfs4state.c | 2 +- fs/nfs/nfs4xdr.c | 9 +-- include/linux/nfs_xdr.h | 2 +- 5 files changed, 118 insertions(+), 61 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index d4fcb5d0ce6c..2215cdee43ae 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -249,6 +249,7 @@ extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_close_state(struct nfs4_state *, mode_t); extern struct nfs4_state *nfs4_find_state(struct inode *, struct rpc_cred *, mode_t mode); extern void nfs4_schedule_state_recovery(struct nfs4_client *); +extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f57dba815099..612a9a14aed3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -58,9 +58,9 @@ static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); -static int nfs4_async_handle_error(struct rpc_task *, struct nfs_server *); +static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *); static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); -static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception); +static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus); extern struct rpc_procinfo nfs4_procedures[]; @@ -2422,7 +2422,7 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen } static int -nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server) +nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) { struct nfs4_client *clp = server->nfs4_state; @@ -2500,7 +2500,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) /* This is the error handling routine for processes that are allowed * to sleep. */ -int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception) +int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) { struct nfs4_client *clp = server->nfs4_state; int ret = errorcode; @@ -2763,66 +2763,125 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl) return res; } -static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) +struct nfs4_unlockdata { + struct nfs_lockargs arg; + struct nfs_locku_opargs luargs; + struct nfs_lockres res; + struct nfs4_lock_state *lsp; + struct nfs_open_context *ctx; + atomic_t refcount; + struct completion completion; +}; + +static void nfs4_locku_release_calldata(struct nfs4_unlockdata *calldata) { - struct inode *inode = state->inode; - struct nfs_server *server = NFS_SERVER(inode); - struct nfs_lockargs arg = { - .fh = NFS_FH(inode), - .type = nfs4_lck_type(cmd, request), - .offset = request->fl_start, - .length = nfs4_lck_length(request), - }; - struct nfs_lockres res = { - .server = server, - }; + if (atomic_dec_and_test(&calldata->refcount)) { + nfs_free_seqid(calldata->luargs.seqid); + nfs4_put_lock_state(calldata->lsp); + put_nfs_open_context(calldata->ctx); + kfree(calldata); + } +} + +static void nfs4_locku_complete(struct nfs4_unlockdata *calldata) +{ + complete(&calldata->completion); + nfs4_locku_release_calldata(calldata); +} + +static void nfs4_locku_done(struct rpc_task *task) +{ + struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)task->tk_calldata; + + nfs_increment_lock_seqid(task->tk_status, calldata->luargs.seqid); + switch (task->tk_status) { + case 0: + memcpy(calldata->lsp->ls_stateid.data, + calldata->res.u.stateid.data, + sizeof(calldata->lsp->ls_stateid.data)); + break; + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + nfs4_schedule_state_recovery(calldata->res.server->nfs4_state); + break; + default: + if (nfs4_async_handle_error(task, calldata->res.server) == -EAGAIN) { + rpc_restart_call(task); + return; + } + } + nfs4_locku_complete(calldata); +} + +static void nfs4_locku_begin(struct rpc_task *task) +{ + struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)task->tk_calldata; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU], - .rpc_argp = &arg, - .rpc_resp = &res, - .rpc_cred = state->owner->so_cred, + .rpc_argp = &calldata->arg, + .rpc_resp = &calldata->res, + .rpc_cred = calldata->lsp->ls_state->owner->so_cred, }; - struct nfs4_lock_state *lsp; - struct nfs_locku_opargs luargs; int status; - - status = nfs4_set_lock_state(state, request); - if (status != 0) - goto out; - lsp = request->fl_u.nfs4_fl.owner; - /* We might have lost the locks! */ - if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) - goto out; - luargs.seqid = nfs_alloc_seqid(&lsp->ls_seqid); - status = -ENOMEM; - if (luargs.seqid == NULL) - goto out; - memcpy(luargs.stateid.data, lsp->ls_stateid.data, sizeof(luargs.stateid.data)); - arg.u.locku = &luargs; - status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - nfs_increment_lock_seqid(status, luargs.seqid); - if (status == 0) - memcpy(lsp->ls_stateid.data, res.u.stateid.data, - sizeof(lsp->ls_stateid.data)); - nfs_free_seqid(luargs.seqid); -out: - if (status == 0) - do_vfs_lock(request->fl_file, request); - return status; + status = nfs_wait_on_sequence(calldata->luargs.seqid, task); + if (status != 0) + return; + if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) { + nfs4_locku_complete(calldata); + task->tk_exit = NULL; + rpc_exit(task, 0); + return; + } + rpc_call_setup(task, &msg, 0); } static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs4_exception exception = { }; - int err; + struct nfs4_unlockdata *calldata; + struct inode *inode = state->inode; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_lock_state *lsp; + int status; - do { - err = nfs4_handle_exception(NFS_SERVER(state->inode), - _nfs4_proc_unlck(state, cmd, request), - &exception); - } while (exception.retry); - return err; + status = nfs4_set_lock_state(state, request); + if (status != 0) + return status; + lsp = request->fl_u.nfs4_fl.owner; + /* We might have lost the locks! */ + if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) + return 0; + calldata = kmalloc(sizeof(*calldata), GFP_KERNEL); + if (calldata == NULL) + return -ENOMEM; + calldata->luargs.seqid = nfs_alloc_seqid(&lsp->ls_seqid); + if (calldata->luargs.seqid == NULL) { + kfree(calldata); + return -ENOMEM; + } + calldata->luargs.stateid = &lsp->ls_stateid; + calldata->arg.fh = NFS_FH(inode); + calldata->arg.type = nfs4_lck_type(cmd, request); + calldata->arg.offset = request->fl_start; + calldata->arg.length = nfs4_lck_length(request); + calldata->arg.u.locku = &calldata->luargs; + calldata->res.server = server; + calldata->lsp = lsp; + atomic_inc(&lsp->ls_count); + + /* Ensure we don't close file until we're done freeing locks! */ + calldata->ctx = get_nfs_open_context((struct nfs_open_context*)request->fl_file->private_data); + + atomic_set(&calldata->refcount, 2); + init_completion(&calldata->completion); + + status = nfs4_call_async(NFS_SERVER(inode)->client, nfs4_locku_begin, + nfs4_locku_done, calldata); + if (status == 0) + wait_for_completion_interruptible(&calldata->completion); + do_vfs_lock(request->fl_file, request); + nfs4_locku_release_calldata(calldata); + return status; } static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *request, int reclaim) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index bb3574361958..23834c8fb740 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -600,7 +600,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ * Release reference to lock_state, and free it if we see that * it is no longer in use */ -static void nfs4_put_lock_state(struct nfs4_lock_state *lsp) +void nfs4_put_lock_state(struct nfs4_lock_state *lsp) { struct nfs4_state *state; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 934ec50ea6bf..4706192cfb07 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -776,7 +776,7 @@ static int encode_locku(struct xdr_stream *xdr, const struct nfs_lockargs *arg) WRITE32(OP_LOCKU); WRITE32(arg->type); WRITE32(opargs->seqid->sequence->counter); - WRITEMEM(&opargs->stateid, sizeof(opargs->stateid)); + WRITEMEM(opargs->stateid->data, sizeof(opargs->stateid->data)); WRITE64(arg->offset); WRITE64(arg->length); @@ -1587,9 +1587,6 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, uint32_t *p, struct nfs_lock }; int status; - status = nfs_wait_on_sequence(args->u.locku->seqid, req->rq_task); - if (status != 0) - goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); @@ -2934,8 +2931,8 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_lockres *res) status = decode_op_hdr(xdr, OP_LOCKU); if (status == 0) { - READ_BUF(sizeof(nfs4_stateid)); - COPYMEM(&res->u.stateid, sizeof(res->u.stateid)); + READ_BUF(sizeof(res->u.stateid.data)); + COPYMEM(res->u.stateid.data, sizeof(res->u.stateid.data)); } return status; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index cac0df950c66..849f95c5fae4 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -188,7 +188,7 @@ struct nfs_lock_opargs { struct nfs_locku_opargs { struct nfs_seqid * seqid; - nfs4_stateid stateid; + nfs4_stateid * stateid; }; struct nfs_lockargs { From 06735b3454824bd561decbde46111f144e905923 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:15 -0700 Subject: [PATCH 047/126] NFSv4: Fix up handling of open_to_lock sequence ids Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 69 +++++++++++++++++++---------------------- fs/nfs/nfs4xdr.c | 32 ++++++++----------- include/linux/nfs_xdr.h | 21 +++---------- 3 files changed, 50 insertions(+), 72 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 612a9a14aed3..35da15342e05 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2889,11 +2889,23 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r struct inode *inode = state->inode; struct nfs_server *server = NFS_SERVER(inode); struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; + struct nfs_lock_opargs largs = { + .lock_stateid = &lsp->ls_stateid, + .open_stateid = &state->stateid, + .lock_owner = { + .clientid = server->nfs4_state->cl_clientid, + .id = lsp->ls_id, + }, + .reclaim = reclaim, + }; struct nfs_lockargs arg = { .fh = NFS_FH(inode), .type = nfs4_lck_type(cmd, request), .offset = request->fl_start, .length = nfs4_lck_length(request), + .u = { + .lock = &largs, + }, }; struct nfs_lockres res = { .server = server, @@ -2904,56 +2916,39 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r .rpc_resp = &res, .rpc_cred = state->owner->so_cred, }; - struct nfs_lock_opargs largs = { - .reclaim = reclaim, - .new_lock_owner = 0, - }; - struct nfs_seqid *lock_seqid; int status = -ENOMEM; - lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); - if (lock_seqid == NULL) + largs.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); + if (largs.lock_seqid == NULL) return -ENOMEM; if (!(lsp->ls_seqid.flags & NFS_SEQID_CONFIRMED)) { struct nfs4_state_owner *owner = state->owner; - struct nfs_open_to_lock otl = { - .lock_owner = { - .clientid = server->nfs4_state->cl_clientid, - }, - }; - otl.lock_seqid = lock_seqid; - otl.lock_owner.id = lsp->ls_id; - memcpy(&otl.open_stateid, &state->stateid, sizeof(otl.open_stateid)); - largs.u.open_lock = &otl; + largs.open_seqid = nfs_alloc_seqid(&owner->so_seqid); + if (largs.open_seqid == NULL) + goto out; largs.new_lock_owner = 1; - arg.u.lock = &largs; - otl.open_seqid = nfs_alloc_seqid(&owner->so_seqid); - if (otl.open_seqid != NULL) { - status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - /* increment seqid on success, and seqid mutating errors */ - nfs_increment_open_seqid(status, otl.open_seqid); - nfs_free_seqid(otl.open_seqid); - } - if (status == 0) - nfs_confirm_seqid(&lsp->ls_seqid, 0); - } else { - struct nfs_exist_lock el; - memcpy(&el.stateid, &lsp->ls_stateid, sizeof(el.stateid)); - largs.u.exist_lock = ⪙ - arg.u.lock = &largs; - el.seqid = lock_seqid; status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - } - /* increment seqid on success, and seqid mutating errors*/ - nfs_increment_lock_seqid(status, lock_seqid); + /* increment open seqid on success, and seqid mutating errors */ + if (largs.new_lock_owner != 0) { + nfs_increment_open_seqid(status, largs.open_seqid); + if (status == 0) + nfs_confirm_seqid(&lsp->ls_seqid, 0); + } + nfs_free_seqid(largs.open_seqid); + } else + status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); + /* increment lock seqid on success, and seqid mutating errors*/ + nfs_increment_lock_seqid(status, largs.lock_seqid); /* save the returned stateid. */ if (status == 0) { - memcpy(lsp->ls_stateid.data, res.u.stateid.data, sizeof(lsp->ls_stateid.data)); + memcpy(lsp->ls_stateid.data, res.u.stateid.data, + sizeof(lsp->ls_stateid.data)); lsp->ls_flags |= NFS_LOCK_INITIALIZED; } else if (status == -NFS4ERR_DENIED) status = -EAGAIN; - nfs_free_seqid(lock_seqid); +out: + nfs_free_seqid(largs.lock_seqid); return status; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4706192cfb07..c5c75235c5b8 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -729,22 +729,18 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lockargs *arg) WRITE64(arg->length); WRITE32(opargs->new_lock_owner); if (opargs->new_lock_owner){ - struct nfs_open_to_lock *ol = opargs->u.open_lock; - RESERVE_SPACE(40); - WRITE32(ol->open_seqid->sequence->counter); - WRITEMEM(&ol->open_stateid, sizeof(ol->open_stateid)); - WRITE32(ol->lock_seqid->sequence->counter); - WRITE64(ol->lock_owner.clientid); + WRITE32(opargs->open_seqid->sequence->counter); + WRITEMEM(opargs->open_stateid->data, sizeof(opargs->open_stateid->data)); + WRITE32(opargs->lock_seqid->sequence->counter); + WRITE64(opargs->lock_owner.clientid); WRITE32(4); - WRITE32(ol->lock_owner.id); + WRITE32(opargs->lock_owner.id); } else { - struct nfs_exist_lock *el = opargs->u.exist_lock; - RESERVE_SPACE(20); - WRITEMEM(&el->stateid, sizeof(el->stateid)); - WRITE32(el->seqid->sequence->counter); + WRITEMEM(opargs->lock_stateid->data, sizeof(opargs->lock_stateid->data)); + WRITE32(opargs->lock_seqid->sequence->counter); } return 0; @@ -1535,16 +1531,14 @@ static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_locka .nops = 2, }; struct nfs_lock_opargs *opargs = args->u.lock; - struct nfs_seqid *seqid; int status; - if (opargs->new_lock_owner) - seqid = opargs->u.open_lock->lock_seqid; - else - seqid = opargs->u.exist_lock->seqid; - status = nfs_wait_on_sequence(seqid, req->rq_task); + status = nfs_wait_on_sequence(opargs->lock_seqid, req->rq_task); if (status != 0) goto out; + /* Do we need to do an open_to_lock_owner? */ + if (opargs->lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED) + opargs->new_lock_owner = 0; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); @@ -2908,8 +2902,8 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lockres *res) status = decode_op_hdr(xdr, OP_LOCK); if (status == 0) { - READ_BUF(sizeof(nfs4_stateid)); - COPYMEM(&res->u.stateid, sizeof(res->u.stateid)); + READ_BUF(sizeof(res->u.stateid.data)); + COPYMEM(res->u.stateid.data, sizeof(res->u.stateid.data)); } else if (status == -NFS4ERR_DENIED) return decode_lock_denied(xdr, &res->u.denied); return status; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 849f95c5fae4..57efcc27f20b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -165,25 +165,14 @@ struct nfs_lowner { u32 id; }; -struct nfs_open_to_lock { - struct nfs_seqid * open_seqid; - nfs4_stateid open_stateid; - struct nfs_seqid * lock_seqid; - struct nfs_lowner lock_owner; -}; - -struct nfs_exist_lock { - nfs4_stateid stateid; - struct nfs_seqid * seqid; -}; - struct nfs_lock_opargs { + struct nfs_seqid * lock_seqid; + nfs4_stateid * lock_stateid; + struct nfs_seqid * open_seqid; + nfs4_stateid * open_stateid; + struct nfs_lowner lock_owner; __u32 reclaim; __u32 new_lock_owner; - union { - struct nfs_open_to_lock *open_lock; - struct nfs_exist_lock *exist_lock; - } u; }; struct nfs_locku_opargs { From 039c4d7a82d8268ec71f59679460b41d0dd9b225 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:16 -0700 Subject: [PATCH 048/126] NFS: Fix up a race in the NFS implementation of GETLK ...and fix a memory corruption bug due to improper use of memcpy() on a struct file_lock. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6bdcfa95de94..572d8593486f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -376,22 +376,31 @@ nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) { + struct file_lock *cfl; struct inode *inode = filp->f_mapping->host; int status = 0; lock_kernel(); - /* Use local locking if mounted with "-onolock" */ - if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) - status = NFS_PROTO(inode)->lock(filp, cmd, fl); - else { - struct file_lock *cfl = posix_test_lock(filp, fl); - - fl->fl_type = F_UNLCK; - if (cfl != NULL) - memcpy(fl, cfl, sizeof(*fl)); + /* Try local locking first */ + cfl = posix_test_lock(filp, fl); + if (cfl != NULL) { + locks_copy_lock(fl, cfl); + goto out; } + + if (nfs_have_delegation(inode, FMODE_READ)) + goto out_noconflict; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) + goto out_noconflict; + + status = NFS_PROTO(inode)->lock(filp, cmd, fl); +out: unlock_kernel(); return status; +out_noconflict: + fl->fl_type = F_UNLCK; + goto out; } static int do_vfs_lock(struct file *file, struct file_lock *fl) From 834f2a4a1554dc5b2598038b3fe8703defcbe467 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:16 -0700 Subject: [PATCH 049/126] VFS: Allow the filesystem to return a full file pointer on open intent This is needed by NFSv4 for atomicity reasons: our open command is in fact a lookup+open, so we need to be able to propagate open context information from lookup() into the resulting struct file's private_data field. Signed-off-by: Trond Myklebust --- fs/exec.c | 12 +++--- fs/namei.c | 93 +++++++++++++++++++++++++++++++++++++++---- fs/open.c | 79 +++++++++++++++++++++++++++++------- include/linux/namei.h | 8 ++++ 4 files changed, 165 insertions(+), 27 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index a04a575ad433..d2208f7c87db 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -126,8 +126,7 @@ asmlinkage long sys_uselib(const char __user * library) struct nameidata nd; int error; - nd.intent.open.flags = FMODE_READ; - error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); + error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ); if (error) goto out; @@ -139,7 +138,7 @@ asmlinkage long sys_uselib(const char __user * library) if (error) goto exit; - file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); + file = nameidata_to_filp(&nd, O_RDONLY); error = PTR_ERR(file); if (IS_ERR(file)) goto out; @@ -167,6 +166,7 @@ asmlinkage long sys_uselib(const char __user * library) out: return error; exit: + release_open_intent(&nd); path_release(&nd); goto out; } @@ -490,8 +490,7 @@ struct file *open_exec(const char *name) int err; struct file *file; - nd.intent.open.flags = FMODE_READ; - err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); + err = path_lookup_open(name, LOOKUP_FOLLOW, &nd, FMODE_READ); file = ERR_PTR(err); if (!err) { @@ -504,7 +503,7 @@ struct file *open_exec(const char *name) err = -EACCES; file = ERR_PTR(err); if (!err) { - file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); + file = nameidata_to_filp(&nd, O_RDONLY); if (!IS_ERR(file)) { err = deny_write_access(file); if (err) { @@ -516,6 +515,7 @@ struct file *open_exec(const char *name) return file; } } + release_open_intent(&nd); path_release(&nd); } goto out; diff --git a/fs/namei.c b/fs/namei.c index aa62dbda93ac..0d1dff7d3d95 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -317,6 +318,18 @@ void path_release_on_umount(struct nameidata *nd) mntput_no_expire(nd->mnt); } +/** + * release_open_intent - free up open intent resources + * @nd: pointer to nameidata + */ +void release_open_intent(struct nameidata *nd) +{ + if (nd->intent.open.file->f_dentry == NULL) + put_filp(nd->intent.open.file); + else + fput(nd->intent.open.file); +} + /* * Internal lookup() using the new generic dcache. * SMP-safe @@ -1052,6 +1065,70 @@ int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata return retval; } +static int __path_lookup_intent_open(const char *name, unsigned int lookup_flags, + struct nameidata *nd, int open_flags, int create_mode) +{ + struct file *filp = get_empty_filp(); + int err; + + if (filp == NULL) + return -ENFILE; + nd->intent.open.file = filp; + nd->intent.open.flags = open_flags; + nd->intent.open.create_mode = create_mode; + err = path_lookup(name, lookup_flags|LOOKUP_OPEN, nd); + if (IS_ERR(nd->intent.open.file)) { + if (err == 0) { + err = PTR_ERR(nd->intent.open.file); + path_release(nd); + } + } else if (err != 0) + release_open_intent(nd); + return err; +} + +/** + * path_lookup_open - lookup a file path with open intent + * @name: pointer to file name + * @lookup_flags: lookup intent flags + * @nd: pointer to nameidata + * @open_flags: open intent flags + */ +int path_lookup_open(const char *name, unsigned int lookup_flags, + struct nameidata *nd, int open_flags) +{ + return __path_lookup_intent_open(name, lookup_flags, nd, + open_flags, 0); +} + +/** + * path_lookup_create - lookup a file path with open + create intent + * @name: pointer to file name + * @lookup_flags: lookup intent flags + * @nd: pointer to nameidata + * @open_flags: open intent flags + * @create_mode: create intent flags + */ +int path_lookup_create(const char *name, unsigned int lookup_flags, + struct nameidata *nd, int open_flags, int create_mode) +{ + return __path_lookup_intent_open(name, lookup_flags|LOOKUP_CREATE, nd, + open_flags, create_mode); +} + +int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags, + struct nameidata *nd, int open_flags) +{ + char *tmp = getname(name); + int err = PTR_ERR(tmp); + + if (!IS_ERR(tmp)) { + err = __path_lookup_intent_open(tmp, lookup_flags, nd, open_flags, 0); + putname(tmp); + } + return err; +} + /* * Restricted form of lookup. Doesn't follow links, single-component only, * needs parent already locked. Doesn't follow mounts. @@ -1416,27 +1493,27 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) */ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) { - int acc_mode, error = 0; + int acc_mode, error; struct path path; struct dentry *dir; int count = 0; acc_mode = ACC_MODE(flag); + /* O_TRUNC implies we need access checks for write permissions */ + if (flag & O_TRUNC) + acc_mode |= MAY_WRITE; + /* Allow the LSM permission hook to distinguish append access from general write access. */ if (flag & O_APPEND) acc_mode |= MAY_APPEND; - /* Fill in the open() intent data */ - nd->intent.open.flags = flag; - nd->intent.open.create_mode = mode; - /* * The simplest case - just a plain lookup. */ if (!(flag & O_CREAT)) { - error = path_lookup(pathname, lookup_flags(flag)|LOOKUP_OPEN, nd); + error = path_lookup_open(pathname, lookup_flags(flag), nd, flag); if (error) return error; goto ok; @@ -1445,7 +1522,7 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) /* * Create - we need to know the parent. */ - error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd); + error = path_lookup_create(pathname, LOOKUP_PARENT, nd, flag, mode); if (error) return error; @@ -1520,6 +1597,8 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) exit_dput: dput_path(&path, nd); exit: + if (!IS_ERR(nd->intent.open.file)) + release_open_intent(nd); path_release(nd); return error; diff --git a/fs/open.c b/fs/open.c index f0d90cf0495c..8d06ec911fd9 100644 --- a/fs/open.c +++ b/fs/open.c @@ -739,7 +739,8 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group) } static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, - int flags, struct file *f) + int flags, struct file *f, + int (*open)(struct inode *, struct file *)) { struct inode *inode; int error; @@ -761,11 +762,14 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, f->f_op = fops_get(inode->i_fop); file_move(f, &inode->i_sb->s_files); - if (f->f_op && f->f_op->open) { - error = f->f_op->open(inode,f); + if (!open && f->f_op) + open = f->f_op->open; + if (open) { + error = open(inode, f); if (error) goto cleanup_all; } + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); @@ -814,28 +818,75 @@ struct file *filp_open(const char * filename, int flags, int mode) { int namei_flags, error; struct nameidata nd; - struct file *f; namei_flags = flags; if ((namei_flags+1) & O_ACCMODE) namei_flags++; - if (namei_flags & O_TRUNC) - namei_flags |= 2; - - error = -ENFILE; - f = get_empty_filp(); - if (f == NULL) - return ERR_PTR(error); error = open_namei(filename, namei_flags, mode, &nd); if (!error) - return __dentry_open(nd.dentry, nd.mnt, flags, f); + return nameidata_to_filp(&nd, flags); - put_filp(f); return ERR_PTR(error); } EXPORT_SYMBOL(filp_open); +/** + * lookup_instantiate_filp - instantiates the open intent filp + * @nd: pointer to nameidata + * @dentry: pointer to dentry + * @open: open callback + * + * Helper for filesystems that want to use lookup open intents and pass back + * a fully instantiated struct file to the caller. + * This function is meant to be called from within a filesystem's + * lookup method. + * Note that in case of error, nd->intent.open.file is destroyed, but the + * path information remains valid. + * If the open callback is set to NULL, then the standard f_op->open() + * filesystem callback is substituted. + */ +struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, + int (*open)(struct inode *, struct file *)) +{ + if (IS_ERR(nd->intent.open.file)) + goto out; + if (IS_ERR(dentry)) + goto out_err; + nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->mnt), + nd->intent.open.flags - 1, + nd->intent.open.file, + open); +out: + return nd->intent.open.file; +out_err: + release_open_intent(nd); + nd->intent.open.file = (struct file *)dentry; + goto out; +} +EXPORT_SYMBOL_GPL(lookup_instantiate_filp); + +/** + * nameidata_to_filp - convert a nameidata to an open filp. + * @nd: pointer to nameidata + * @flags: open flags + * + * Note that this function destroys the original nameidata + */ +struct file *nameidata_to_filp(struct nameidata *nd, int flags) +{ + struct file *filp; + + /* Pick up the filp from the open intent */ + filp = nd->intent.open.file; + /* Has the filesystem initialised the file for us? */ + if (filp->f_dentry == NULL) + filp = __dentry_open(nd->dentry, nd->mnt, flags, filp, NULL); + else + path_release(nd); + return filp; +} + struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) { int error; @@ -846,7 +897,7 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) if (f == NULL) return ERR_PTR(error); - return __dentry_open(dentry, mnt, flags, f); + return __dentry_open(dentry, mnt, flags, f, NULL); } EXPORT_SYMBOL(dentry_open); diff --git a/include/linux/namei.h b/include/linux/namei.h index 7db67b008cac..1c975d0d9e94 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -8,6 +8,7 @@ struct vfsmount; struct open_intent { int flags; int create_mode; + struct file *file; }; enum { MAX_NESTED_LINKS = 5 }; @@ -65,6 +66,13 @@ extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); extern void path_release(struct nameidata *); extern void path_release_on_umount(struct nameidata *); +extern int __user_path_lookup_open(const char __user *, unsigned lookup_flags, struct nameidata *nd, int open_flags); +extern int path_lookup_open(const char *, unsigned lookup_flags, struct nameidata *, int open_flags); +extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, + int (*open)(struct inode *, struct file *)); +extern struct file *nameidata_to_filp(struct nameidata *nd, int flags); +extern void release_open_intent(struct nameidata *); + extern struct dentry * lookup_one_len(const char *, struct dentry *, int); extern struct dentry * lookup_hash(struct qstr *, struct dentry *); From 02a913a73b52071e93f4b76db3e86138d19efffd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:17 -0700 Subject: [PATCH 050/126] NFSv4: Eliminate nfsv4 open race... Make NFSv4 return the fully initialized file pointer with the stateid that it created in the lookup w/intent. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 29 ++++----- fs/nfs/nfs3proc.c | 2 +- fs/nfs/nfs4_fs.h | 4 +- fs/nfs/nfs4proc.c | 137 ++++++++++++++++------------------------ fs/nfs/proc.c | 2 +- include/linux/nfs_xdr.h | 2 +- 6 files changed, 74 insertions(+), 102 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c70eabd6d179..a6e251f21fd8 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -914,7 +914,6 @@ static int is_atomic_open(struct inode *dir, struct nameidata *nd) static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct dentry *res = NULL; - struct inode *inode = NULL; int error; /* Check that we are indeed trying to open this file */ @@ -928,8 +927,10 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry dentry->d_op = NFS_PROTO(dir)->dentry_ops; /* Let vfs_create() deal with O_EXCL */ - if (nd->intent.open.flags & O_EXCL) - goto no_entry; + if (nd->intent.open.flags & O_EXCL) { + d_add(dentry, NULL); + goto out; + } /* Open the file on the server */ lock_kernel(); @@ -943,18 +944,18 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry if (nd->intent.open.flags & O_CREAT) { nfs_begin_data_update(dir); - inode = nfs4_atomic_open(dir, dentry, nd); + res = nfs4_atomic_open(dir, dentry, nd); nfs_end_data_update(dir); } else - inode = nfs4_atomic_open(dir, dentry, nd); + res = nfs4_atomic_open(dir, dentry, nd); unlock_kernel(); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); + if (IS_ERR(res)) { + error = PTR_ERR(res); switch (error) { /* Make a negative dentry */ case -ENOENT: - inode = NULL; - break; + res = NULL; + goto out; /* This turned out not to be a regular file */ case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) @@ -962,13 +963,9 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry /* case -EISDIR: */ /* case -EINVAL: */ default: - res = ERR_PTR(error); goto out; } - } -no_entry: - res = d_add_unique(dentry, inode); - if (res != NULL) + } else if (res != NULL) dentry = res; nfs_renew_times(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); @@ -1012,7 +1009,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) */ lock_kernel(); verifier = nfs_save_change_attribute(dir); - ret = nfs4_open_revalidate(dir, dentry, openflags); + ret = nfs4_open_revalidate(dir, dentry, openflags, nd); if (!ret) nfs_set_verifier(dentry, verifier); unlock_kernel(); @@ -1135,7 +1132,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, lock_kernel(); nfs_begin_data_update(dir); - error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); nfs_end_data_update(dir); if (error != 0) goto out_err; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index edc95514046d..df80477c5af4 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -299,7 +299,7 @@ static int nfs3_proc_commit(struct nfs_write_data *cdata) */ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags) + int flags, struct nameidata *nd) { struct nfs_fh fhandle; struct nfs_fattr fattr; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 2215cdee43ae..8a3788199052 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -215,8 +215,8 @@ extern int nfs4_proc_setclientid_confirm(struct nfs4_client *); extern int nfs4_proc_async_renew(struct nfs4_client *); extern int nfs4_proc_renew(struct nfs4_client *); extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode); -extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); -extern int nfs4_open_revalidate(struct inode *, struct dentry *, int); +extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); +extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 35da15342e05..c9ecb8119632 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -47,6 +47,7 @@ #include #include #include +#include #include "nfs4_fs.h" #include "delegation.h" @@ -947,12 +948,26 @@ int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode) return status; } -struct inode * +static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) +{ + struct file *filp; + + filp = lookup_instantiate_filp(nd, dentry, NULL); + if (!IS_ERR(filp)) { + struct nfs_open_context *ctx; + ctx = (struct nfs_open_context *)filp->private_data; + ctx->state = state; + } else + nfs4_close_state(state, nd->intent.open.flags); +} + +struct dentry * nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct iattr attr; struct rpc_cred *cred; struct nfs4_state *state; + struct dentry *res; if (nd->flags & LOOKUP_CREATE) { attr.ia_mode = nd->intent.open.create_mode; @@ -966,16 +981,23 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); if (IS_ERR(cred)) - return (struct inode *)cred; + return (struct dentry *)cred; state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred); put_rpccred(cred); - if (IS_ERR(state)) - return (struct inode *)state; - return state->inode; + if (IS_ERR(state)) { + if (PTR_ERR(state) == -ENOENT) + d_add(dentry, NULL); + return (struct dentry *)state; + } + res = d_add_unique(dentry, state->inode); + if (res != NULL) + dentry = res; + nfs4_intent_set_file(nd, dentry, state); + return res; } int -nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags) +nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd) { struct rpc_cred *cred; struct nfs4_state *state; @@ -988,18 +1010,30 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags) if (IS_ERR(state)) state = nfs4_do_open(dir, dentry, openflags, NULL, cred); put_rpccred(cred); - if (state == ERR_PTR(-ENOENT) && dentry->d_inode == 0) - return 1; - if (IS_ERR(state)) - return 0; + if (IS_ERR(state)) { + switch (PTR_ERR(state)) { + case -EPERM: + case -EACCES: + case -EDQUOT: + case -ENOSPC: + case -EROFS: + lookup_instantiate_filp(nd, (struct dentry *)state, NULL); + return 1; + case -ENOENT: + if (dentry->d_inode == NULL) + return 1; + } + goto out_drop; + } inode = state->inode; + iput(inode); if (inode == dentry->d_inode) { - iput(inode); + nfs4_intent_set_file(nd, dentry, state); return 1; } - d_drop(dentry); nfs4_close_state(state, openflags); - iput(inode); +out_drop: + d_drop(dentry); return 0; } @@ -1500,7 +1534,7 @@ static int nfs4_proc_commit(struct nfs_write_data *cdata) static int nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags) + int flags, struct nameidata *nd) { struct nfs4_state *state; struct rpc_cred *cred; @@ -1522,13 +1556,13 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, struct nfs_fattr fattr; status = nfs4_do_setattr(NFS_SERVER(dir), &fattr, NFS_FH(state->inode), sattr, state); - if (status == 0) { + if (status == 0) nfs_setattr_update_inode(state->inode, sattr); - goto out; - } - } else if (flags != 0) - goto out; - nfs4_close_state(state, flags); + } + if (status == 0 && nd != NULL && (nd->flags & LOOKUP_OPEN)) + nfs4_intent_set_file(nd, dentry, state); + else + nfs4_close_state(state, flags); out: return status; } @@ -2175,65 +2209,6 @@ nfs4_proc_renew(struct nfs4_client *clp) return 0; } -/* - * We will need to arrange for the VFS layer to provide an atomic open. - * Until then, this open method is prone to inefficiency and race conditions - * due to the lookup, potential create, and open VFS calls from sys_open() - * placed on the wire. - */ -static int -nfs4_proc_file_open(struct inode *inode, struct file *filp) -{ - struct dentry *dentry = filp->f_dentry; - struct nfs_open_context *ctx; - struct nfs4_state *state = NULL; - struct rpc_cred *cred; - int status = -ENOMEM; - - dprintk("nfs4_proc_file_open: starting on (%.*s/%.*s)\n", - (int)dentry->d_parent->d_name.len, - dentry->d_parent->d_name.name, - (int)dentry->d_name.len, dentry->d_name.name); - - - /* Find our open stateid */ - cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0); - if (IS_ERR(cred)) - return PTR_ERR(cred); - ctx = alloc_nfs_open_context(dentry, cred); - put_rpccred(cred); - if (unlikely(ctx == NULL)) - return -ENOMEM; - status = -EIO; /* ERACE actually */ - state = nfs4_find_state(inode, cred, filp->f_mode); - if (unlikely(state == NULL)) - goto no_state; - ctx->state = state; - nfs4_close_state(state, filp->f_mode); - ctx->mode = filp->f_mode; - nfs_file_set_open_context(filp, ctx); - put_nfs_open_context(ctx); - if (filp->f_mode & FMODE_WRITE) - nfs_begin_data_update(inode); - return 0; -no_state: - printk(KERN_WARNING "NFS: v4 raced in function %s\n", __FUNCTION__); - put_nfs_open_context(ctx); - return status; -} - -/* - * Release our state - */ -static int -nfs4_proc_file_release(struct inode *inode, struct file *filp) -{ - if (filp->f_mode & FMODE_WRITE) - nfs_end_data_update(inode); - nfs_file_clear_open_context(filp); - return 0; -} - static inline int nfs4_server_supports_acls(struct nfs_server *server) { return (server->caps & NFS_CAP_ACLS) @@ -3145,8 +3120,8 @@ struct nfs_rpc_ops nfs_v4_clientops = { .read_setup = nfs4_proc_read_setup, .write_setup = nfs4_proc_write_setup, .commit_setup = nfs4_proc_commit_setup, - .file_open = nfs4_proc_file_open, - .file_release = nfs4_proc_file_release, + .file_open = nfs_open, + .file_release = nfs_release, .lock = nfs4_proc_lock, .clear_acl_cache = nfs4_zap_acl_attr, }; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index be23c3fb9260..8fef86523d7f 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -216,7 +216,7 @@ static int nfs_proc_write(struct nfs_write_data *wdata) static int nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags) + int flags, struct nameidata *nd) { struct nfs_fh fhandle; struct nfs_fattr fattr; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 57efcc27f20b..60086dac11d5 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -712,7 +712,7 @@ struct nfs_rpc_ops { int (*write) (struct nfs_write_data *); int (*commit) (struct nfs_write_data *); int (*create) (struct inode *, struct dentry *, - struct iattr *, int); + struct iattr *, int, struct nameidata *); int (*remove) (struct inode *, struct qstr *); int (*unlink_setup) (struct rpc_message *, struct dentry *, struct qstr *); From 6f926b5ba75c568296ec227e7d782db4ddbdca5c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:18 -0700 Subject: [PATCH 051/126] [NFS]: Check that the server returns a valid regular file to our OPEN request Since it appears that some servers don't... Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 +++- fs/nfs/nfs4proc.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a6e251f21fd8..ac331d2d4c4a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -957,10 +957,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = NULL; goto out; /* This turned out not to be a regular file */ + case -EISDIR: + case -ENOTDIR: + goto no_open; case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) goto no_open; - /* case -EISDIR: */ /* case -EINVAL: */ default: goto out; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c9ecb8119632..3db1c9f0b09c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -419,6 +419,22 @@ static int _nfs4_proc_open(struct inode *dir, struct nfs4_state_owner *sp, stru o_arg->clientid = sp->so_client->cl_clientid; status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); + if (status == 0) { + /* OPEN on anything except a regular file is disallowed in NFSv4 */ + switch (o_res->f_attr->mode & S_IFMT) { + case S_IFREG: + break; + case S_IFLNK: + status = -ELOOP; + break; + case S_IFDIR: + status = -EISDIR; + break; + default: + status = -ENOTDIR; + } + } + nfs_increment_open_seqid(status, o_arg->seqid); if (status != 0) goto out; From cdce5d6b94b6182f6d8a5b7b52923933e98cbc92 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:18 -0700 Subject: [PATCH 052/126] VFS: Make link_path_walk set LOOKUP_CONTINUE before calling permission(). This will allow nfs_permission() to perform additional optimizations when walking the path, by folding the ACCESS(MAY_EXEC) call on the directory into the lookup revalidation. Signed-off-by: Trond Myklebust --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 0d1dff7d3d95..aaaa81036234 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -763,6 +763,7 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd) struct qstr this; unsigned int c; + nd->flags |= LOOKUP_CONTINUE; err = exec_permission_lite(inode, nd); if (err == -EAGAIN) { err = permission(inode, MAY_EXEC, nd); @@ -815,7 +816,6 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd) if (err < 0) break; } - nd->flags |= LOOKUP_CONTINUE; /* This does the actual lookups.. */ err = do_lookup(nd, &this, &next); if (err) From cae7a073a4c5484cc5713eab606bf54b46724ab3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:19 -0700 Subject: [PATCH 053/126] NFSv4: Return delegation upon rename or removal of file. Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 2 +- fs/nfs/delegation.h | 16 +++++++++++++++- fs/nfs/dir.c | 3 +++ fs/nfs/inode.c | 3 +-- 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 4a36839f0bbd..44135af9894c 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -142,7 +142,7 @@ static void nfs_msync_inode(struct inode *inode) /* * Basic procedure for returning a delegation to the server */ -int nfs_inode_return_delegation(struct inode *inode) +int __nfs_inode_return_delegation(struct inode *inode) { struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state; struct nfs_inode *nfsi = NFS_I(inode); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 3f6c45a29d6a..8017846b561f 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -25,7 +25,7 @@ struct nfs_delegation { int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); -int nfs_inode_return_delegation(struct inode *inode); +int __nfs_inode_return_delegation(struct inode *inode); int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle); @@ -47,11 +47,25 @@ static inline int nfs_have_delegation(struct inode *inode, int flags) return 1; return 0; } + +static inline int nfs_inode_return_delegation(struct inode *inode) +{ + int err = 0; + + if (NFS_I(inode)->delegation != NULL) + err = __nfs_inode_return_delegation(inode); + return err; +} #else static inline int nfs_have_delegation(struct inode *inode, int flags) { return 0; } + +static inline int nfs_inode_return_delegation(struct inode *inode) +{ + return 0; +} #endif #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ac331d2d4c4a..72f50c0117b1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -801,6 +801,7 @@ static int nfs_dentry_delete(struct dentry *dentry) */ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) { + nfs_inode_return_delegation(inode); if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { lock_kernel(); inode->i_nlink--; @@ -1329,6 +1330,7 @@ static int nfs_safe_remove(struct dentry *dentry) nfs_begin_data_update(dir); if (inode != NULL) { + nfs_inode_return_delegation(inode); nfs_begin_data_update(inode); error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); /* The VFS may want to delete this inode */ @@ -1547,6 +1549,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, nfs_wb_all(old_inode); shrink_dcache_parent(old_dentry); } + nfs_inode_return_delegation(old_inode); if (new_inode) d_delete(new_dentry); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d7abaa00473a..6b0f7fe6bd1d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1671,8 +1671,7 @@ static void nfs4_clear_inode(struct inode *inode) struct nfs_inode *nfsi = NFS_I(inode); /* If we are holding a delegation, return it! */ - if (nfsi->delegation != NULL) - nfs_inode_return_delegation(inode); + nfs_inode_return_delegation(inode); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); /* Now clear out any remaining state */ From 642ac54923e0291ae2c8e42edde18d8c9c0a3c84 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:19 -0700 Subject: [PATCH 054/126] NFSv4: Return delegations in case we're changing ACLs Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 5 +++++ fs/nfs/nfs4proc.c | 1 + 2 files changed, 6 insertions(+) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6b0f7fe6bd1d..65d5ab45ddc5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -853,6 +853,11 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) filemap_fdatawait(inode->i_mapping); nfs_wb_all(inode); } + /* + * Return any delegations if we're going to change ACLs + */ + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) + nfs_inode_return_delegation(inode); error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); if (error == 0) nfs_refresh_inode(inode, &fattr); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3db1c9f0b09c..c10dcd12af51 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2405,6 +2405,7 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen if (!nfs4_server_supports_acls(server)) return -EOPNOTSUPP; + nfs_inode_return_delegation(inode); buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); ret = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0); if (ret == 0) From b8e5c4c2978fa666287525a9de2fa648a7581262 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:20 -0700 Subject: [PATCH 055/126] NFSv4: If a delegated open fails, ensure that we return the delegation Unless of course the open fails due to permission issues. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c10dcd12af51..a8be9af610ac 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -658,6 +658,8 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred } up_read(&nfsi->rwsem); up_read(&clp->cl_sem); + if (err != -EACCES) + nfs_inode_return_delegation(inode); return err; } From 550f57470c6506f5ef3c708335dea6bd48bf3dc4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:21 -0700 Subject: [PATCH 056/126] NFSv4: Ensure that we recover from the OPEN + OPEN_CONFIRM BAD_STATEID race If the server is in the unconfirmed OPEN state for a given open owner and receives a second OPEN for the same open owner, it will cancel the state of the first request and set up an OPEN_CONFIRM for the second. This can cause a race that is discussed in rfc3530 on page 181. The following patch allows the client to recover by retrying the original open request. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a8be9af610ac..ff378126ccd7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -785,6 +785,16 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry, exception.retry = 1; continue; } + /* + * BAD_STATEID on OPEN means that the server cancelled our + * state before it received the OPEN_CONFIRM. + * Recover by retrying the request as per the discussion + * on Page 181 of RFC3530. + */ + if (status == -NFS4ERR_BAD_STATEID) { + exception.retry = 1; + continue; + } res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), status, &exception)); } while (exception.retry); From 4c780a4688b421baa896b59778c05d7e068e479f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:21 -0700 Subject: [PATCH 057/126] Fix Connectathon locking test failure We currently fail Connectathon test 6.10 in the case of 32-bit locks due to incorrect error checking. Also add support for l->l_len < 0 to 64-bit locks. Signed-off-by: Trond Myklebust --- fs/locks.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 7eb1d77b9204..a1e8b2248014 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -316,21 +316,22 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, /* POSIX-1996 leaves the case l->l_len < 0 undefined; POSIX-2001 defines it. */ start += l->l_start; - end = start + l->l_len - 1; - if (l->l_len < 0) { - end = start - 1; - start += l->l_len; - } - if (start < 0) return -EINVAL; - if (l->l_len > 0 && end < 0) - return -EOVERFLOW; - + fl->fl_end = OFFSET_MAX; + if (l->l_len > 0) { + end = start + l->l_len - 1; + fl->fl_end = end; + } else if (l->l_len < 0) { + end = start - 1; + fl->fl_end = end; + start += l->l_len; + if (start < 0) + return -EINVAL; + } fl->fl_start = start; /* we record the absolute position */ - fl->fl_end = end; - if (l->l_len == 0) - fl->fl_end = OFFSET_MAX; + if (fl->fl_end < fl->fl_start) + return -EOVERFLOW; fl->fl_owner = current->files; fl->fl_pid = current->tgid; @@ -362,14 +363,21 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, return -EINVAL; } - if (((start += l->l_start) < 0) || (l->l_len < 0)) + start += l->l_start; + if (start < 0) return -EINVAL; - fl->fl_end = start + l->l_len - 1; - if (l->l_len > 0 && fl->fl_end < 0) - return -EOVERFLOW; + fl->fl_end = OFFSET_MAX; + if (l->l_len > 0) { + fl->fl_end = start + l->l_len - 1; + } else if (l->l_len < 0) { + fl->fl_end = start - 1; + start += l->l_len; + if (start < 0) + return -EINVAL; + } fl->fl_start = start; /* we record the absolute position */ - if (l->l_len == 0) - fl->fl_end = OFFSET_MAX; + if (fl->fl_end < fl->fl_start) + return -EOVERFLOW; fl->fl_owner = current->files; fl->fl_pid = current->tgid; From 6fe43f9e3701f7a9f2be151a5e6cfe94b87e92f9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:22 -0700 Subject: [PATCH 058/126] NFS: Fix rename of directory onto empty directory If someone tries to rename a directory onto an empty directory, we currently fail and return EBUSY. This patch ensures that we try the rename if both source and target are directories, and that we fail with a correct error of EISDIR if the source is not a directory. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 72f50c0117b1..eb50c19fc253 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1511,9 +1511,11 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, */ if (!new_inode) goto go_ahead; - if (S_ISDIR(new_inode->i_mode)) - goto out; - else if (atomic_read(&new_dentry->d_count) > 2) { + if (S_ISDIR(new_inode->i_mode)) { + error = -EISDIR; + if (!S_ISDIR(old_inode->i_mode)) + goto out; + } else if (atomic_read(&new_dentry->d_count) > 2) { int err; /* copy the target dentry's name */ dentry = d_alloc(new_dentry->d_parent, From 7f709a48fa798cfa0f2f777c8752e12995054f78 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 23:19:39 -0700 Subject: [PATCH 059/126] NFSv4: Fix an oopsable condition in nfs_free_seqid Storing a pointer to the struct rpc_task in the nfs_seqid is broken since the nfs_seqid may be freed well after the task has been destroyed. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 1 - fs/nfs/nfs4state.c | 9 +-------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 8a3788199052..45bff1d1a513 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -112,7 +112,6 @@ struct nfs_seqid_counter { struct nfs_seqid { struct list_head list; struct nfs_seqid_counter *sequence; - struct rpc_task *task; }; static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 23834c8fb740..da0861db57fb 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -676,7 +676,6 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) new = kmalloc(sizeof(*new), GFP_KERNEL); if (new != NULL) { new->sequence = counter; - new->task = NULL; spin_lock(&sequence->lock); list_add_tail(&new->list, &sequence->list); spin_unlock(&sequence->lock); @@ -687,15 +686,10 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) void nfs_free_seqid(struct nfs_seqid *seqid) { struct rpc_sequence *sequence = seqid->sequence->sequence; - struct rpc_task *next = NULL; spin_lock(&sequence->lock); list_del(&seqid->list); - if (!list_empty(&sequence->list)) { - next = list_entry(sequence->list.next, struct nfs_seqid, list)->task; - if (next) - rpc_wake_up_task(next); - } + rpc_wake_up(&sequence->wait); spin_unlock(&sequence->lock); kfree(seqid); } @@ -754,7 +748,6 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) spin_lock(&sequence->lock); if (sequence->list.next != &seqid->list) { - seqid->task = task; rpc_sleep_on(&sequence->wait, task, NULL, NULL); status = -EAGAIN; } From 747c5534c9a6da4aa87e7cdc2209ea98ea27f381 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Tue, 18 Oct 2005 23:19:40 -0700 Subject: [PATCH 060/126] RPC: stops the release_pipe() funtion from being called twice This patch stops the release_pipe() funtion from being called twice by invalidating the ops pointer in the rpc_inode when rpc_pipe_release() is called. Signed-off-by: Steve Dickson Signed-off-by: Trond Myklebust --- net/sunrpc/rpc_pipe.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index ded6c63f11ec..649d609e7d94 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -177,6 +177,8 @@ rpc_pipe_release(struct inode *inode, struct file *filp) __rpc_purge_upcall(inode, -EPIPE); if (rpci->ops->release_pipe) rpci->ops->release_pipe(inode); + if (!rpci->nreaders && !rpci->nwriters) + rpci->ops = NULL; out: up(&inode->i_sem); return 0; From 8c233cf9c2ad6f49df753bdce84fddbf00bf6a75 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:54:27 -0400 Subject: [PATCH 061/126] NFSv4: handle no acl attr Stop handing garbage to userspace in the case where a weird server clears the acl bit in the getattr return (despite the fact that they've already claimed acl support.) Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index c5c75235c5b8..cd762648fa9a 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3255,7 +3255,8 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, if (attrlen <= *acl_len) xdr_read_pages(xdr, attrlen); *acl_len = attrlen; - } + } else + status = -EOPNOTSUPP; out: return status; From 1d95db8e1688ed54e143a597c5570631a42fa594 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:54:32 -0400 Subject: [PATCH 062/126] NFSv4: Fix acl buffer size resp_len is passed in as buffer size to decode routine; make sure it's set right in case where userspace provides less than a page's worth of buffer. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ff378126ccd7..2d9357cf3fc5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2357,7 +2357,7 @@ static inline ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size return -ENOMEM; args.acl_pages[0] = localpage; args.acl_pgbase = 0; - args.acl_len = PAGE_SIZE; + resp_len = args.acl_len = PAGE_SIZE; } else { resp_buf = buf; buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); From 293f1eb551a77fe5c8956a559a3c0baea95cd9bc Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:54:37 -0400 Subject: [PATCH 063/126] SUNRPC: Add support for privacy to generic gss-api code. Add support for privacy to generic gss-api code. This is dead code until we have both a mechanism that supports privacy and code in the client or server that uses it. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/sunrpc/gss_api.h | 22 ++++++++++++++++++++++ net/sunrpc/auth_gss/gss_mech_switch.c | 22 ++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h index 689262f63059..e896752ffbf9 100644 --- a/include/linux/sunrpc/gss_api.h +++ b/include/linux/sunrpc/gss_api.h @@ -48,6 +48,17 @@ u32 gss_verify_mic( struct xdr_buf *message, struct xdr_netobj *mic_token, u32 *qstate); +u32 gss_wrap( + struct gss_ctx *ctx_id, + u32 qop, + int offset, + struct xdr_buf *outbuf, + struct page **inpages); +u32 gss_unwrap( + struct gss_ctx *ctx_id, + u32 *qop, + int offset, + struct xdr_buf *inbuf); u32 gss_delete_sec_context( struct gss_ctx **ctx_id); @@ -93,6 +104,17 @@ struct gss_api_ops { struct xdr_buf *message, struct xdr_netobj *mic_token, u32 *qstate); + u32 (*gss_wrap)( + struct gss_ctx *ctx_id, + u32 qop, + int offset, + struct xdr_buf *outbuf, + struct page **inpages); + u32 (*gss_unwrap)( + struct gss_ctx *ctx_id, + u32 *qop, + int offset, + struct xdr_buf *buf); void (*gss_delete_sec_context)( void *internal_ctx_id); }; diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 58aeaddd8c79..06d97cb3481a 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -276,6 +276,28 @@ gss_verify_mic(struct gss_ctx *context_handle, qstate); } +u32 +gss_wrap(struct gss_ctx *ctx_id, + u32 qop, + int offset, + struct xdr_buf *buf, + struct page **inpages) +{ + return ctx_id->mech_type->gm_ops + ->gss_wrap(ctx_id, qop, offset, buf, inpages); +} + +u32 +gss_unwrap(struct gss_ctx *ctx_id, + u32 *qop, + int offset, + struct xdr_buf *buf) +{ + return ctx_id->mech_type->gm_ops + ->gss_unwrap(ctx_id, qop, offset, buf); +} + + /* gss_delete_sec_context: free all resources associated with context_handle. * Note this differs from the RFC 2744-specified prototype in that we don't * bother returning an output token, since it would never be used anyway. */ From ead5e1c26fdcd969cf40c49cb0589d56879d240d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:54:43 -0400 Subject: [PATCH 064/126] SUNRPC: Provide a callback to allow free pages allocated during xdr encoding For privacy, we need to allocate pages to store the encrypted data (passed in pages can't be used without the risk of corrupting data in the page cache). So we need a way to free that memory after the request has been transmitted. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 5 ++++- net/sunrpc/xprt.c | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 068e1fb0868b..3b8b6e823c70 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -73,7 +73,10 @@ struct rpc_rqst { int rq_cong; /* has incremented xprt->cong */ int rq_received; /* receive completed */ u32 rq_seqno; /* gss seq no. used on req. */ - + int rq_enc_pages_num; + struct page **rq_enc_pages; /* scratch pages for use by + gss privacy code */ + void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */ struct list_head rq_list; struct xdr_buf rq_private_buf; /* The receive buffer diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1ba55dc38b7a..6dda3860351f 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -839,6 +839,7 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) req->rq_task = task; req->rq_xprt = xprt; req->rq_xid = xprt_alloc_xid(xprt); + req->rq_release_snd_buf = NULL; dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid, req, ntohl(req->rq_xid)); } @@ -867,6 +868,8 @@ void xprt_release(struct rpc_task *task) xprt->last_used + xprt->idle_timeout); spin_unlock_bh(&xprt->transport_lock); task->tk_rqstp = NULL; + if (req->rq_release_snd_buf) + req->rq_release_snd_buf(req); memset(req, 0, sizeof(*req)); /* mark unused */ dprintk("RPC: %4d release request %p\n", task->tk_pid, req); From f3680312a737355ddf35c1b68af25e384d7ef0a8 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:54:48 -0400 Subject: [PATCH 065/126] SUNRPC: Retry wrap in case of memory allocation failure. For privacy we need to allocate extra pages to hold encrypted page data when wrapping requests. This allocation may fail, and we handle that case by waiting and retrying. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 534274056329..702ede309b06 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -710,9 +710,16 @@ call_encode(struct rpc_task *task) rpc_exit(task, -EIO); return; } - if (encode != NULL) - task->tk_status = rpcauth_wrap_req(task, encode, req, p, - task->tk_msg.rpc_argp); + if (encode == NULL) + return; + + task->tk_status = rpcauth_wrap_req(task, encode, req, p, + task->tk_msg.rpc_argp); + if (task->tk_status == -ENOMEM) { + /* XXX: Is this sane? */ + rpc_delay(task, 3*HZ); + task->tk_status = -EAGAIN; + } } /* From 24b2605becc10ca63c4c30808fa59a8abbf68727 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:54:53 -0400 Subject: [PATCH 066/126] RPCSEC_GSS: cleanup au_rslack calculation Various xdr encode routines use au_rslack to guess where the reply argument will end up, so we can set up the xdr_buf to recieve data into the right place for zero copy. Currently we calculate the au_rslack estimate when we check the verifier. Normally this only depends on the verifier size. In the integrity case we add a few bytes to allow for a length and sequence number. It's a bit simpler to calculate only the verifier size when we check the verifier, and delay the full calculation till we unwrap. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/sunrpc/auth.h | 7 ++++++- net/sunrpc/auth_gss/auth_gss.c | 20 ++++++-------------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 04ebc24db348..b68c11a2d6dd 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -66,7 +66,12 @@ struct rpc_cred_cache { struct rpc_auth { unsigned int au_cslack; /* call cred size estimate */ - unsigned int au_rslack; /* reply verf size guess */ + /* guess at number of u32's auth adds before + * reply data; normally the verifier size: */ + unsigned int au_rslack; + /* for gss, used to calculate au_rslack: */ + unsigned int au_verfsize; + unsigned int au_flags; /* various flags */ struct rpc_authops * au_ops; /* operations */ rpc_authflavor_t au_flavor; /* pseudoflavor (note may diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index d2b08f16c257..dc95b797ca65 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -886,8 +886,6 @@ static u32 * gss_validate(struct rpc_task *task, u32 *p) { struct rpc_cred *cred = task->tk_msg.rpc_cred; - struct gss_cred *gss_cred = container_of(cred, struct gss_cred, - gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); u32 seq, qop_state; struct kvec iov; @@ -915,18 +913,9 @@ gss_validate(struct rpc_task *task, u32 *p) cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; if (maj_stat) goto out_bad; - switch (gss_cred->gc_service) { - case RPC_GSS_SVC_NONE: - /* verifier data, flavor, length: */ - task->tk_auth->au_rslack = XDR_QUADLEN(len) + 2; - break; - case RPC_GSS_SVC_INTEGRITY: - /* verifier data, flavor, length, length, sequence number: */ - task->tk_auth->au_rslack = XDR_QUADLEN(len) + 4; - break; - case RPC_GSS_SVC_PRIVACY: - goto out_bad; - } + /* We leave it to unwrap to calculate au_rslack. For now we just + * calculate the length of the verifier: */ + task->tk_auth->au_verfsize = XDR_QUADLEN(len) + 2; gss_put_ctx(ctx); dprintk("RPC: %4u GSS gss_validate: gss_verify_mic succeeded.\n", task->tk_pid); @@ -1067,6 +1056,7 @@ gss_unwrap_resp(struct rpc_task *task, struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + u32 *savedp = p; int status = -EIO; if (ctx->gc_proc != RPC_GSS_PROC_DATA) @@ -1082,6 +1072,8 @@ gss_unwrap_resp(struct rpc_task *task, case RPC_GSS_SVC_PRIVACY: break; } + /* take into account extra slack for integrity and privacy cases: */ + task->tk_auth->au_rslack = task->tk_auth->au_verfsize + (p - savedp); out_decode: status = decode(rqstp, p, obj); out: From 2d2da60c63b67174add32f06e8d54c3a0c5cd9cf Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:54:58 -0400 Subject: [PATCH 067/126] RPCSEC_GSS: client-side privacy support Add the code to the client side to handle privacy. This is dead code until we actually add privacy support to krb5. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 149 ++++++++++++++++++++++++++++++++- 1 file changed, 148 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index dc95b797ca65..5e4872058ec7 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -975,6 +976,114 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, return 0; } +static void +priv_release_snd_buf(struct rpc_rqst *rqstp) +{ + int i; + + for (i=0; i < rqstp->rq_enc_pages_num; i++) + __free_page(rqstp->rq_enc_pages[i]); + kfree(rqstp->rq_enc_pages); +} + +static int +alloc_enc_pages(struct rpc_rqst *rqstp) +{ + struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; + int first, last, i; + + if (snd_buf->page_len == 0) { + rqstp->rq_enc_pages_num = 0; + return 0; + } + + first = snd_buf->page_base >> PAGE_CACHE_SHIFT; + last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT; + rqstp->rq_enc_pages_num = last - first + 1 + 1; + rqstp->rq_enc_pages + = kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *), + GFP_NOFS); + if (!rqstp->rq_enc_pages) + goto out; + for (i=0; i < rqstp->rq_enc_pages_num; i++) { + rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS); + if (rqstp->rq_enc_pages[i] == NULL) + goto out_free; + } + rqstp->rq_release_snd_buf = priv_release_snd_buf; + return 0; +out_free: + for (i--; i >= 0; i--) { + __free_page(rqstp->rq_enc_pages[i]); + } +out: + return -EAGAIN; +} + +static inline int +gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + kxdrproc_t encode, struct rpc_rqst *rqstp, u32 *p, void *obj) +{ + struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; + u32 offset; + u32 maj_stat; + int status; + u32 *opaque_len; + struct page **inpages; + int first; + int pad; + struct kvec *iov; + char *tmp; + + opaque_len = p++; + offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; + *p++ = htonl(rqstp->rq_seqno); + + status = encode(rqstp, p, obj); + if (status) + return status; + + status = alloc_enc_pages(rqstp); + if (status) + return status; + first = snd_buf->page_base >> PAGE_CACHE_SHIFT; + inpages = snd_buf->pages + first; + snd_buf->pages = rqstp->rq_enc_pages; + snd_buf->page_base -= first << PAGE_CACHE_SHIFT; + /* Give the tail its own page, in case we need extra space in the + * head when wrapping: */ + if (snd_buf->page_len || snd_buf->tail[0].iov_len) { + tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]); + memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); + snd_buf->tail[0].iov_base = tmp; + } + maj_stat = gss_wrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, offset, + snd_buf, inpages); + /* RPC_SLACK_SPACE should prevent this ever happening: */ + BUG_ON(snd_buf->len > snd_buf->buflen); + status = -EIO; + /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was + * done anyway, so it's safe to put the request on the wire: */ + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + else if (maj_stat) + return status; + + *opaque_len = htonl(snd_buf->len - offset); + /* guess whether we're in the head or the tail: */ + if (snd_buf->page_len || snd_buf->tail[0].iov_len) + iov = snd_buf->tail; + else + iov = snd_buf->head; + p = iov->iov_base + iov->iov_len; + pad = 3 - ((snd_buf->len - offset - 1) & 3); + memset(p, 0, pad); + iov->iov_len += pad; + snd_buf->len += pad; + + return 0; +} + static int gss_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp, u32 *p, void *obj) @@ -1002,6 +1111,8 @@ gss_wrap_req(struct rpc_task *task, rqstp, p, obj); break; case RPC_GSS_SVC_PRIVACY: + status = gss_wrap_req_priv(cred, ctx, encode, + rqstp, p, obj); break; } out: @@ -1048,6 +1159,36 @@ gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, return 0; } +static inline int +gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + struct rpc_rqst *rqstp, u32 **p) +{ + struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; + u32 offset; + u32 opaque_len; + u32 maj_stat; + int status = -EIO; + + opaque_len = ntohl(*(*p)++); + offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base; + if (offset + opaque_len > rcv_buf->len) + return status; + /* remove padding: */ + rcv_buf->len = offset + opaque_len; + + maj_stat = gss_unwrap(ctx->gc_gss_ctx, NULL, + offset, rcv_buf); + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + if (maj_stat != GSS_S_COMPLETE) + return status; + if (ntohl(*(*p)++) != rqstp->rq_seqno) + return status; + + return 0; +} + + static int gss_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, u32 *p, void *obj) @@ -1057,6 +1198,8 @@ gss_unwrap_resp(struct rpc_task *task, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); u32 *savedp = p; + struct kvec *head = ((struct rpc_rqst *)rqstp)->rq_rcv_buf.head; + int savedlen = head->iov_len; int status = -EIO; if (ctx->gc_proc != RPC_GSS_PROC_DATA) @@ -1070,10 +1213,14 @@ gss_unwrap_resp(struct rpc_task *task, goto out; break; case RPC_GSS_SVC_PRIVACY: + status = gss_unwrap_resp_priv(cred, ctx, rqstp, &p); + if (status) + goto out; break; } /* take into account extra slack for integrity and privacy cases: */ - task->tk_auth->au_rslack = task->tk_auth->au_verfsize + (p - savedp); + task->tk_auth->au_rslack = task->tk_auth->au_verfsize + (p - savedp) + + (savedlen - head->iov_len); out_decode: status = decode(rqstp, p, obj); out: From f7b3af64c653c73feb060a9f94f2df9ab4bba4c3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:55:03 -0400 Subject: [PATCH 068/126] RPCSEC_GSS: Simplify rpcsec_gss crypto code Factor out some code that will be shared by privacy crypto routines Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/gss_krb5_crypto.c | 106 +++++++++++++++++++------- 1 file changed, 77 insertions(+), 29 deletions(-) diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index ee6ae74cd1b2..2baf93f8b8f5 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -139,6 +139,82 @@ buf_to_sg(struct scatterlist *sg, char *ptr, int len) { sg->length = len; } +static int +process_xdr_buf(struct xdr_buf *buf, int offset, int len, + int (*actor)(struct scatterlist *, void *), void *data) +{ + int i, page_len, thislen, page_offset, ret = 0; + struct scatterlist sg[1]; + + if (offset >= buf->head[0].iov_len) { + offset -= buf->head[0].iov_len; + } else { + thislen = buf->head[0].iov_len - offset; + if (thislen > len) + thislen = len; + buf_to_sg(sg, buf->head[0].iov_base + offset, thislen); + ret = actor(sg, data); + if (ret) + goto out; + offset = 0; + len -= thislen; + } + if (len == 0) + goto out; + + if (offset >= buf->page_len) { + offset -= buf->page_len; + } else { + page_len = buf->page_len - offset; + if (page_len > len) + page_len = len; + len -= page_len; + page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1); + i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT; + thislen = PAGE_CACHE_SIZE - page_offset; + do { + if (thislen > page_len) + thislen = page_len; + sg->page = buf->pages[i]; + sg->offset = page_offset; + sg->length = thislen; + ret = actor(sg, data); + if (ret) + goto out; + page_len -= thislen; + i++; + page_offset = 0; + thislen = PAGE_CACHE_SIZE; + } while (page_len != 0); + offset = 0; + } + if (len == 0) + goto out; + + if (offset < buf->tail[0].iov_len) { + thislen = buf->tail[0].iov_len - offset; + if (thislen > len) + thislen = len; + buf_to_sg(sg, buf->tail[0].iov_base + offset, thislen); + ret = actor(sg, data); + len -= thislen; + } + if (len != 0) + ret = -EINVAL; +out: + return ret; +} + +static int +checksummer(struct scatterlist *sg, void *data) +{ + struct crypto_tfm *tfm = (struct crypto_tfm *)data; + + crypto_digest_update(tfm, sg, 1); + + return 0; +} + /* checksum the plaintext data and hdrlen bytes of the token header */ s32 make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, @@ -148,8 +224,6 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, struct crypto_tfm *tfm = NULL; /* XXX add to ctx? */ struct scatterlist sg[1]; u32 code = GSS_S_FAILURE; - int len, thislen, offset; - int i; switch (cksumtype) { case CKSUMTYPE_RSA_MD5: @@ -169,33 +243,7 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, crypto_digest_init(tfm); buf_to_sg(sg, header, hdrlen); crypto_digest_update(tfm, sg, 1); - if (body->head[0].iov_len) { - buf_to_sg(sg, body->head[0].iov_base, body->head[0].iov_len); - crypto_digest_update(tfm, sg, 1); - } - - len = body->page_len; - if (len != 0) { - offset = body->page_base & (PAGE_CACHE_SIZE - 1); - i = body->page_base >> PAGE_CACHE_SHIFT; - thislen = PAGE_CACHE_SIZE - offset; - do { - if (thislen > len) - thislen = len; - sg->page = body->pages[i]; - sg->offset = offset; - sg->length = thislen; - crypto_digest_update(tfm, sg, 1); - len -= thislen; - i++; - offset = 0; - thislen = PAGE_CACHE_SIZE; - } while(len != 0); - } - if (body->tail[0].iov_len) { - buf_to_sg(sg, body->tail[0].iov_base, body->tail[0].iov_len); - crypto_digest_update(tfm, sg, 1); - } + process_xdr_buf(body, 0, body->len, checksummer, tfm); crypto_digest_final(tfm, cksum->data); code = 0; out: From bfa91516b57483fc9c81d8d90325fd2c3c16ac48 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:55:08 -0400 Subject: [PATCH 069/126] RPCSEC_GSS: krb5 pre-privacy cleanup The code this was originally derived from processed wrap and mic tokens using the same functions. This required some contortions, and more would be required with the addition of xdr_buf's, so it's better to separate out the two code paths. In preparation for adding privacy support, remove the last vestiges of the old wrap token code. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/sunrpc/gss_krb5.h | 5 ++-- net/sunrpc/auth_gss/gss_krb5_mech.c | 5 ++-- net/sunrpc/auth_gss/gss_krb5_seal.c | 38 +++++---------------------- net/sunrpc/auth_gss/gss_krb5_unseal.c | 30 +++++---------------- 4 files changed, 16 insertions(+), 62 deletions(-) diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h index ffe31d2eb9ec..cb35833e2ae3 100644 --- a/include/linux/sunrpc/gss_krb5.h +++ b/include/linux/sunrpc/gss_krb5.h @@ -121,13 +121,12 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, u32 krb5_make_token(struct krb5_ctx *context_handle, int qop_req, struct xdr_buf *input_message_buffer, - struct xdr_netobj *output_message_buffer, int toktype); + struct xdr_netobj *output_message_buffer); u32 krb5_read_token(struct krb5_ctx *context_handle, struct xdr_netobj *input_token_buffer, - struct xdr_buf *message_buffer, - int *qop_state, int toktype); + struct xdr_buf *message_buffer, int *qop_state); u32 krb5_encrypt(struct crypto_tfm * key, diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 462c5b86b073..8b9066fdfda5 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -199,8 +199,7 @@ gss_verify_mic_kerberos(struct gss_ctx *ctx, int qop_state; struct krb5_ctx *kctx = ctx->internal_ctx_id; - maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state, - KG_TOK_MIC_MSG); + maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state); if (!maj_stat && qop_state) *qstate = qop_state; @@ -216,7 +215,7 @@ gss_get_mic_kerberos(struct gss_ctx *ctx, u32 err = 0; struct krb5_ctx *kctx = ctx->internal_ctx_id; - err = krb5_make_token(kctx, qop, message, mic_token, KG_TOK_MIC_MSG); + err = krb5_make_token(kctx, qop, message, mic_token); dprintk("RPC: gss_get_mic_kerberos returning %d\n",err); diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index afeeb8715a77..2511834e6e52 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -70,22 +70,12 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif -static inline int -gss_krb5_padding(int blocksize, int length) { - /* Most of the code is block-size independent but in practice we - * use only 8: */ - BUG_ON(blocksize != 8); - return 8 - (length & 7); -} - u32 krb5_make_token(struct krb5_ctx *ctx, int qop_req, - struct xdr_buf *text, struct xdr_netobj *token, - int toktype) + struct xdr_buf *text, struct xdr_netobj *token) { s32 checksum_type; struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; - int blocksize = 0, tmsglen; unsigned char *ptr, *krb5_hdr, *msg_start; s32 now; @@ -111,21 +101,13 @@ krb5_make_token(struct krb5_ctx *ctx, int qop_req, goto out_err; } - if (toktype == KG_TOK_WRAP_MSG) { - blocksize = crypto_tfm_alg_blocksize(ctx->enc); - tmsglen = blocksize + text->len - + gss_krb5_padding(blocksize, blocksize + text->len); - } else { - tmsglen = 0; - } - - token->len = g_token_size(&ctx->mech_used, 22 + tmsglen); + token->len = g_token_size(&ctx->mech_used, 22); ptr = token->data; - g_make_token_header(&ctx->mech_used, 22 + tmsglen, &ptr); + g_make_token_header(&ctx->mech_used, 22, &ptr); - *ptr++ = (unsigned char) ((toktype>>8)&0xff); - *ptr++ = (unsigned char) (toktype&0xff); + *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff); + *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff); /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ krb5_hdr = ptr - 2; @@ -133,17 +115,9 @@ krb5_make_token(struct krb5_ctx *ctx, int qop_req, *(u16 *)(krb5_hdr + 2) = htons(ctx->signalg); memset(krb5_hdr + 4, 0xff, 4); - if (toktype == KG_TOK_WRAP_MSG) - *(u16 *)(krb5_hdr + 4) = htons(ctx->sealalg); - if (toktype == KG_TOK_WRAP_MSG) { - /* XXX removing support for now */ + if (make_checksum(checksum_type, krb5_hdr, 8, text, &md5cksum)) goto out_err; - } else { /* Sign only. */ - if (make_checksum(checksum_type, krb5_hdr, 8, text, - &md5cksum)) - goto out_err; - } switch (ctx->signalg) { case SGN_ALG_DES_MAC_MD5: diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index 8767fc53183d..19eba3df6607 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -68,20 +68,13 @@ #endif -/* message_buffer is an input if toktype is MIC and an output if it is WRAP: - * If toktype is MIC: read_token is a mic token, and message_buffer is the - * data that the mic was supposedly taken over. - * If toktype is WRAP: read_token is a wrap token, and message_buffer is used - * to return the decrypted data. - */ +/* read_token is a mic token, and message_buffer is the data that the mic was + * supposedly taken over. */ -/* XXX will need to change prototype and/or just split into a separate function - * when we add privacy (because read_token will be in pages too). */ u32 krb5_read_token(struct krb5_ctx *ctx, struct xdr_netobj *read_token, - struct xdr_buf *message_buffer, - int *qop_state, int toktype) + struct xdr_buf *message_buffer, int *qop_state) { int signalg; int sealalg; @@ -100,16 +93,12 @@ krb5_read_token(struct krb5_ctx *ctx, read_token->len)) goto out; - if ((*ptr++ != ((toktype>>8)&0xff)) || (*ptr++ != (toktype&0xff))) + if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) || + (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) ) goto out; /* XXX sanity-check bodysize?? */ - if (toktype == KG_TOK_WRAP_MSG) { - /* XXX gone */ - goto out; - } - /* get the sign and seal algorithms */ signalg = ptr[0] + (ptr[1] << 8); @@ -120,14 +109,7 @@ krb5_read_token(struct krb5_ctx *ctx, if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) goto out; - if (((toktype != KG_TOK_WRAP_MSG) && (sealalg != 0xffff)) || - ((toktype == KG_TOK_WRAP_MSG) && (sealalg == 0xffff))) - goto out; - - /* in the current spec, there is only one valid seal algorithm per - key type, so a simple comparison is ok */ - - if ((toktype == KG_TOK_WRAP_MSG) && !(sealalg == ctx->sealalg)) + if (sealalg != 0xffff) goto out; /* there are several mappings of seal algorithms to sign algorithms, From 14ae162c24d985593d5b19437d7f3d8fd0062b59 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:55:13 -0400 Subject: [PATCH 070/126] RPCSEC_GSS: Add support for privacy to krb5 rpcsec_gss mechanism. Add support for privacy to the krb5 rpcsec_gss mechanism. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/sunrpc/gss_krb5.h | 18 +- net/sunrpc/auth_gss/Makefile | 2 +- net/sunrpc/auth_gss/gss_krb5_crypto.c | 156 ++++++++++- net/sunrpc/auth_gss/gss_krb5_mech.c | 7 + net/sunrpc/auth_gss/gss_krb5_seal.c | 4 +- net/sunrpc/auth_gss/gss_krb5_unseal.c | 2 +- net/sunrpc/auth_gss/gss_krb5_wrap.c | 370 ++++++++++++++++++++++++++ 7 files changed, 552 insertions(+), 7 deletions(-) create mode 100644 net/sunrpc/auth_gss/gss_krb5_wrap.c diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h index cb35833e2ae3..7f93c2d5ebdb 100644 --- a/include/linux/sunrpc/gss_krb5.h +++ b/include/linux/sunrpc/gss_krb5.h @@ -116,7 +116,7 @@ enum seal_alg { s32 make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, - struct xdr_netobj *cksum); + int body_offset, struct xdr_netobj *cksum); u32 krb5_make_token(struct krb5_ctx *context_handle, int qop_req, @@ -128,6 +128,15 @@ krb5_read_token(struct krb5_ctx *context_handle, struct xdr_netobj *input_token_buffer, struct xdr_buf *message_buffer, int *qop_state); +u32 +gss_wrap_kerberos(struct gss_ctx *ctx_id, u32 qop, int offset, + struct xdr_buf *outbuf, struct page **pages); + +u32 +gss_unwrap_kerberos(struct gss_ctx *ctx_id, u32 *qop, int offset, + struct xdr_buf *buf); + + u32 krb5_encrypt(struct crypto_tfm * key, void *iv, void *in, void *out, int length); @@ -136,6 +145,13 @@ u32 krb5_decrypt(struct crypto_tfm * key, void *iv, void *in, void *out, int length); +int +gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *outbuf, int offset, + struct page **pages); + +int +gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *inbuf, int offset); + s32 krb5_make_seq_num(struct crypto_tfm * key, int direction, diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile index fe1b874084bc..f3431a7e33da 100644 --- a/net/sunrpc/auth_gss/Makefile +++ b/net/sunrpc/auth_gss/Makefile @@ -10,7 +10,7 @@ auth_rpcgss-objs := auth_gss.o gss_generic_token.o \ obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ - gss_krb5_seqnum.o + gss_krb5_seqnum.o gss_krb5_wrap.o obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 2baf93f8b8f5..3f3d5437f02d 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -218,7 +218,7 @@ checksummer(struct scatterlist *sg, void *data) /* checksum the plaintext data and hdrlen bytes of the token header */ s32 make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, - struct xdr_netobj *cksum) + int body_offset, struct xdr_netobj *cksum) { char *cksumname; struct crypto_tfm *tfm = NULL; /* XXX add to ctx? */ @@ -243,7 +243,8 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, crypto_digest_init(tfm); buf_to_sg(sg, header, hdrlen); crypto_digest_update(tfm, sg, 1); - process_xdr_buf(body, 0, body->len, checksummer, tfm); + process_xdr_buf(body, body_offset, body->len - body_offset, + checksummer, tfm); crypto_digest_final(tfm, cksum->data); code = 0; out: @@ -252,3 +253,154 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, } EXPORT_SYMBOL(make_checksum); + +struct encryptor_desc { + u8 iv[8]; /* XXX hard-coded blocksize */ + struct crypto_tfm *tfm; + int pos; + struct xdr_buf *outbuf; + struct page **pages; + struct scatterlist infrags[4]; + struct scatterlist outfrags[4]; + int fragno; + int fraglen; +}; + +static int +encryptor(struct scatterlist *sg, void *data) +{ + struct encryptor_desc *desc = data; + struct xdr_buf *outbuf = desc->outbuf; + struct page *in_page; + int thislen = desc->fraglen + sg->length; + int fraglen, ret; + int page_pos; + + /* Worst case is 4 fragments: head, end of page 1, start + * of page 2, tail. Anything more is a bug. */ + BUG_ON(desc->fragno > 3); + desc->infrags[desc->fragno] = *sg; + desc->outfrags[desc->fragno] = *sg; + + page_pos = desc->pos - outbuf->head[0].iov_len; + if (page_pos >= 0 && page_pos < outbuf->page_len) { + /* pages are not in place: */ + int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT; + in_page = desc->pages[i]; + } else { + in_page = sg->page; + } + desc->infrags[desc->fragno].page = in_page; + desc->fragno++; + desc->fraglen += sg->length; + desc->pos += sg->length; + + fraglen = thislen & 7; /* XXX hardcoded blocksize */ + thislen -= fraglen; + + if (thislen == 0) + return 0; + + ret = crypto_cipher_encrypt_iv(desc->tfm, desc->outfrags, desc->infrags, + thislen, desc->iv); + if (ret) + return ret; + if (fraglen) { + desc->outfrags[0].page = sg->page; + desc->outfrags[0].offset = sg->offset + sg->length - fraglen; + desc->outfrags[0].length = fraglen; + desc->infrags[0] = desc->outfrags[0]; + desc->infrags[0].page = in_page; + desc->fragno = 1; + desc->fraglen = fraglen; + } else { + desc->fragno = 0; + desc->fraglen = 0; + } + return 0; +} + +int +gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset, + struct page **pages) +{ + int ret; + struct encryptor_desc desc; + + BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0); + + memset(desc.iv, 0, sizeof(desc.iv)); + desc.tfm = tfm; + desc.pos = offset; + desc.outbuf = buf; + desc.pages = pages; + desc.fragno = 0; + desc.fraglen = 0; + + ret = process_xdr_buf(buf, offset, buf->len - offset, encryptor, &desc); + return ret; +} + +EXPORT_SYMBOL(gss_encrypt_xdr_buf); + +struct decryptor_desc { + u8 iv[8]; /* XXX hard-coded blocksize */ + struct crypto_tfm *tfm; + struct scatterlist frags[4]; + int fragno; + int fraglen; +}; + +static int +decryptor(struct scatterlist *sg, void *data) +{ + struct decryptor_desc *desc = data; + int thislen = desc->fraglen + sg->length; + int fraglen, ret; + + /* Worst case is 4 fragments: head, end of page 1, start + * of page 2, tail. Anything more is a bug. */ + BUG_ON(desc->fragno > 3); + desc->frags[desc->fragno] = *sg; + desc->fragno++; + desc->fraglen += sg->length; + + fraglen = thislen & 7; /* XXX hardcoded blocksize */ + thislen -= fraglen; + + if (thislen == 0) + return 0; + + ret = crypto_cipher_decrypt_iv(desc->tfm, desc->frags, desc->frags, + thislen, desc->iv); + if (ret) + return ret; + if (fraglen) { + desc->frags[0].page = sg->page; + desc->frags[0].offset = sg->offset + sg->length - fraglen; + desc->frags[0].length = fraglen; + desc->fragno = 1; + desc->fraglen = fraglen; + } else { + desc->fragno = 0; + desc->fraglen = 0; + } + return 0; +} + +int +gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset) +{ + struct decryptor_desc desc; + + /* XXXJBF: */ + BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0); + + memset(desc.iv, 0, sizeof(desc.iv)); + desc.tfm = tfm; + desc.fragno = 0; + desc.fraglen = 0; + return process_xdr_buf(buf, offset, buf->len - offset, decryptor, &desc); +} + +EXPORT_SYMBOL(gss_decrypt_xdr_buf); diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 8b9066fdfda5..37a9ad97ccd4 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -226,6 +226,8 @@ static struct gss_api_ops gss_kerberos_ops = { .gss_import_sec_context = gss_import_sec_context_kerberos, .gss_get_mic = gss_get_mic_kerberos, .gss_verify_mic = gss_verify_mic_kerberos, + .gss_wrap = gss_wrap_kerberos, + .gss_unwrap = gss_unwrap_kerberos, .gss_delete_sec_context = gss_delete_sec_context_kerberos, }; @@ -240,6 +242,11 @@ static struct pf_desc gss_kerberos_pfs[] = { .service = RPC_GSS_SVC_INTEGRITY, .name = "krb5i", }, + [2] = { + .pseudoflavor = RPC_AUTH_GSS_KRB5P, + .service = RPC_GSS_SVC_PRIVACY, + .name = "krb5p", + }, }; static struct gss_api_mech gss_kerberos_mech = { diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 2511834e6e52..fb852d9ab06f 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -116,8 +116,8 @@ krb5_make_token(struct krb5_ctx *ctx, int qop_req, *(u16 *)(krb5_hdr + 2) = htons(ctx->signalg); memset(krb5_hdr + 4, 0xff, 4); - if (make_checksum(checksum_type, krb5_hdr, 8, text, &md5cksum)) - goto out_err; + if (make_checksum(checksum_type, krb5_hdr, 8, text, 0, &md5cksum)) + goto out_err; switch (ctx->signalg) { case SGN_ALG_DES_MAC_MD5: diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index 19eba3df6607..c3d6d1bc100c 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -136,7 +136,7 @@ krb5_read_token(struct krb5_ctx *ctx, switch (signalg) { case SGN_ALG_DES_MAC_MD5: ret = make_checksum(checksum_type, ptr - 2, 8, - message_buffer, &md5cksum); + message_buffer, 0, &md5cksum); if (ret) goto out; diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c new file mode 100644 index 000000000000..ddcde6e42b23 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -0,0 +1,370 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static inline int +gss_krb5_padding(int blocksize, int length) +{ + /* Most of the code is block-size independent but currently we + * use only 8: */ + BUG_ON(blocksize != 8); + return 8 - (length & 7); +} + +static inline void +gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize) +{ + int padding = gss_krb5_padding(blocksize, buf->len - offset); + char *p; + struct kvec *iov; + + if (buf->page_len || buf->tail[0].iov_len) + iov = &buf->tail[0]; + else + iov = &buf->head[0]; + p = iov->iov_base + iov->iov_len; + iov->iov_len += padding; + buf->len += padding; + memset(p, padding, padding); +} + +static inline int +gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize) +{ + u8 *ptr; + u8 pad; + int len = buf->len; + + if (len <= buf->head[0].iov_len) { + pad = *(u8 *)(buf->head[0].iov_base + len - 1); + if (pad > buf->head[0].iov_len) + return -EINVAL; + buf->head[0].iov_len -= pad; + goto out; + } else + len -= buf->head[0].iov_len; + if (len <= buf->page_len) { + int last = (buf->page_base + len - 1) + >>PAGE_CACHE_SHIFT; + int offset = (buf->page_base + len - 1) + & (PAGE_CACHE_SIZE - 1); + ptr = kmap_atomic(buf->pages[last], KM_SKB_SUNRPC_DATA); + pad = *(ptr + offset); + kunmap_atomic(ptr, KM_SKB_SUNRPC_DATA); + goto out; + } else + len -= buf->page_len; + BUG_ON(len > buf->tail[0].iov_len); + pad = *(u8 *)(buf->tail[0].iov_base + len - 1); +out: + /* XXX: NOTE: we do not adjust the page lengths--they represent + * a range of data in the real filesystem page cache, and we need + * to know that range so the xdr code can properly place read data. + * However adjusting the head length, as we do above, is harmless. + * In the case of a request that fits into a single page, the server + * also uses length and head length together to determine the original + * start of the request to copy the request for deferal; so it's + * easier on the server if we adjust head and tail length in tandem. + * It's not really a problem that we don't fool with the page and + * tail lengths, though--at worst badly formed xdr might lead the + * server to attempt to parse the padding. + * XXX: Document all these weird requirements for gss mechanism + * wrap/unwrap functions. */ + if (pad > blocksize) + return -EINVAL; + if (buf->len > pad) + buf->len -= pad; + else + return -EINVAL; + return 0; +} + +static inline void +make_confounder(char *p, int blocksize) +{ + static u64 i = 0; + u64 *q = (u64 *)p; + + /* rfc1964 claims this should be "random". But all that's really + * necessary is that it be unique. And not even that is necessary in + * our case since our "gssapi" implementation exists only to support + * rpcsec_gss, so we know that the only buffers we will ever encrypt + * already begin with a unique sequence number. Just to hedge my bets + * I'll make a half-hearted attempt at something unique, but ensuring + * uniqueness would mean worrying about atomicity and rollover, and I + * don't care enough. */ + + BUG_ON(blocksize != 8); + *q = i++; +} + +/* Assumptions: the head and tail of inbuf are ours to play with. + * The pages, however, may be real pages in the page cache and we replace + * them with scratch pages from **pages before writing to them. */ +/* XXX: obviously the above should be documentation of wrap interface, + * and shouldn't be in this kerberos-specific file. */ + +/* XXX factor out common code with seal/unseal. */ + +u32 +gss_wrap_kerberos(struct gss_ctx *ctx, u32 qop, int offset, + struct xdr_buf *buf, struct page **pages) +{ + struct krb5_ctx *kctx = ctx->internal_ctx_id; + s32 checksum_type; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; + int blocksize = 0, plainlen; + unsigned char *ptr, *krb5_hdr, *msg_start; + s32 now; + int headlen; + struct page **tmp_pages; + + dprintk("RPC: gss_wrap_kerberos\n"); + + now = get_seconds(); + + if (qop != 0) + goto out_err; + + switch (kctx->signalg) { + case SGN_ALG_DES_MAC_MD5: + checksum_type = CKSUMTYPE_RSA_MD5; + break; + default: + dprintk("RPC: gss_krb5_seal: kctx->signalg %d not" + " supported\n", kctx->signalg); + goto out_err; + } + if (kctx->sealalg != SEAL_ALG_NONE && kctx->sealalg != SEAL_ALG_DES) { + dprintk("RPC: gss_krb5_seal: kctx->sealalg %d not supported\n", + kctx->sealalg); + goto out_err; + } + + blocksize = crypto_tfm_alg_blocksize(kctx->enc); + gss_krb5_add_padding(buf, offset, blocksize); + BUG_ON((buf->len - offset) % blocksize); + plainlen = blocksize + buf->len - offset; + + headlen = g_token_size(&kctx->mech_used, 22 + plainlen) - + (buf->len - offset); + + ptr = buf->head[0].iov_base + offset; + /* shift data to make room for header. */ + /* XXX Would be cleverer to encrypt while copying. */ + /* XXX bounds checking, slack, etc. */ + memmove(ptr + headlen, ptr, buf->head[0].iov_len - offset); + buf->head[0].iov_len += headlen; + buf->len += headlen; + BUG_ON((buf->len - offset - headlen) % blocksize); + + g_make_token_header(&kctx->mech_used, 22 + plainlen, &ptr); + + + *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff); + *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff); + + /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ + krb5_hdr = ptr - 2; + msg_start = krb5_hdr + 24; + /* XXXJBF: */ BUG_ON(buf->head[0].iov_base + offset + headlen != msg_start + blocksize); + + *(u16 *)(krb5_hdr + 2) = htons(kctx->signalg); + memset(krb5_hdr + 4, 0xff, 4); + *(u16 *)(krb5_hdr + 4) = htons(kctx->sealalg); + + make_confounder(msg_start, blocksize); + + /* XXXJBF: UGH!: */ + tmp_pages = buf->pages; + buf->pages = pages; + if (make_checksum(checksum_type, krb5_hdr, 8, buf, + offset + headlen - blocksize, &md5cksum)) + goto out_err; + buf->pages = tmp_pages; + + switch (kctx->signalg) { + case SGN_ALG_DES_MAC_MD5: + if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, + md5cksum.data, md5cksum.len)) + goto out_err; + memcpy(krb5_hdr + 16, + md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH, + KRB5_CKSUM_LENGTH); + + dprintk("RPC: make_seal_token: cksum data: \n"); + print_hexl((u32 *) (krb5_hdr + 16), KRB5_CKSUM_LENGTH, 0); + break; + default: + BUG(); + } + + kfree(md5cksum.data); + + /* XXX would probably be more efficient to compute checksum + * and encrypt at the same time: */ + if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff, + kctx->seq_send, krb5_hdr + 16, krb5_hdr + 8))) + goto out_err; + + if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize, + pages)) + goto out_err; + + kctx->seq_send++; + + return ((kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE); +out_err: + if (md5cksum.data) kfree(md5cksum.data); + return GSS_S_FAILURE; +} + +u32 +gss_unwrap_kerberos(struct gss_ctx *ctx, u32 *qop, int offset, + struct xdr_buf *buf) +{ + struct krb5_ctx *kctx = ctx->internal_ctx_id; + int signalg; + int sealalg; + s32 checksum_type; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; + s32 now; + int direction; + s32 seqnum; + unsigned char *ptr; + int bodysize; + u32 ret = GSS_S_DEFECTIVE_TOKEN; + void *data_start, *orig_start; + int data_len; + int blocksize; + + dprintk("RPC: gss_unwrap_kerberos\n"); + + ptr = (u8 *)buf->head[0].iov_base + offset; + if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr, + buf->len - offset)) + goto out; + + if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) || + (*ptr++ != (KG_TOK_WRAP_MSG &0xff)) ) + goto out; + + /* XXX sanity-check bodysize?? */ + + /* get the sign and seal algorithms */ + + signalg = ptr[0] + (ptr[1] << 8); + sealalg = ptr[2] + (ptr[3] << 8); + + /* Sanity checks */ + + if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) + goto out; + + if (sealalg == 0xffff) + goto out; + + /* in the current spec, there is only one valid seal algorithm per + key type, so a simple comparison is ok */ + + if (sealalg != kctx->sealalg) + goto out; + + /* there are several mappings of seal algorithms to sign algorithms, + but few enough that we can try them all. */ + + if ((kctx->sealalg == SEAL_ALG_NONE && signalg > 1) || + (kctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) || + (kctx->sealalg == SEAL_ALG_DES3KD && + signalg != SGN_ALG_HMAC_SHA1_DES3_KD)) + goto out; + + if (gss_decrypt_xdr_buf(kctx->enc, buf, + ptr + 22 - (unsigned char *)buf->head[0].iov_base)) + goto out; + + /* compute the checksum of the message */ + + /* initialize the the cksum */ + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + checksum_type = CKSUMTYPE_RSA_MD5; + break; + default: + ret = GSS_S_DEFECTIVE_TOKEN; + goto out; + } + + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + ret = make_checksum(checksum_type, ptr - 2, 8, buf, + ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum); + if (ret) + goto out; + + ret = krb5_encrypt(kctx->seq, NULL, md5cksum.data, + md5cksum.data, md5cksum.len); + if (ret) + goto out; + + if (memcmp(md5cksum.data + 8, ptr + 14, 8)) { + ret = GSS_S_BAD_SIG; + goto out; + } + break; + default: + ret = GSS_S_DEFECTIVE_TOKEN; + goto out; + } + + /* it got through unscathed. Make sure the context is unexpired */ + + if (qop) + *qop = GSS_C_QOP_DEFAULT; + + now = get_seconds(); + + ret = GSS_S_CONTEXT_EXPIRED; + if (now > kctx->endtime) + goto out; + + /* do sequencing checks */ + + ret = GSS_S_BAD_SIG; + if ((ret = krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction, + &seqnum))) + goto out; + + if ((kctx->initiate && direction != 0xff) || + (!kctx->initiate && direction != 0)) + goto out; + + /* Copy the data back to the right position. XXX: Would probably be + * better to copy and encrypt at the same time. */ + + blocksize = crypto_tfm_alg_blocksize(kctx->enc); + data_start = ptr + 22 + blocksize; + orig_start = buf->head[0].iov_base + offset; + data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start; + memmove(orig_start, data_start, data_len); + buf->head[0].iov_len -= (data_start - orig_start); + buf->len -= (data_start - orig_start); + + ret = GSS_S_DEFECTIVE_TOKEN; + if (gss_krb5_remove_padding(buf, blocksize)) + goto out; + + ret = GSS_S_COMPLETE; +out: + if (md5cksum.data) kfree(md5cksum.data); + return ret; +} From 00fd6e14255fe7a249315746386d640bc4e9e758 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:55:18 -0400 Subject: [PATCH 071/126] RPCSEC_GSS remove all qop parameters Not only are the qop parameters that are passed around throughout the gssapi unused by any currently implemented mechanism, but there appears to be some doubt as to whether they will ever be used. Let's just kill them off for now. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/sunrpc/gss_api.h | 13 ++----------- include/linux/sunrpc/gss_err.h | 10 ---------- include/linux/sunrpc/gss_krb5.h | 8 ++++---- include/linux/sunrpc/gss_spkm3.h | 4 ++-- net/sunrpc/auth_gss/auth_gss.c | 20 +++++++------------- net/sunrpc/auth_gss/gss_krb5_mech.c | 12 ++++-------- net/sunrpc/auth_gss/gss_krb5_seal.c | 5 +---- net/sunrpc/auth_gss/gss_krb5_unseal.c | 5 +---- net/sunrpc/auth_gss/gss_krb5_wrap.c | 11 ++--------- net/sunrpc/auth_gss/gss_mech_switch.c | 14 ++++---------- net/sunrpc/auth_gss/gss_spkm3_mech.c | 21 ++++++++------------- net/sunrpc/auth_gss/gss_spkm3_seal.c | 4 +--- net/sunrpc/auth_gss/gss_spkm3_unseal.c | 2 +- net/sunrpc/auth_gss/svcauth_gss.c | 9 ++++----- 14 files changed, 41 insertions(+), 97 deletions(-) diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h index e896752ffbf9..9b8bcf125c18 100644 --- a/include/linux/sunrpc/gss_api.h +++ b/include/linux/sunrpc/gss_api.h @@ -40,23 +40,19 @@ int gss_import_sec_context( struct gss_ctx **ctx_id); u32 gss_get_mic( struct gss_ctx *ctx_id, - u32 qop, struct xdr_buf *message, struct xdr_netobj *mic_token); u32 gss_verify_mic( struct gss_ctx *ctx_id, struct xdr_buf *message, - struct xdr_netobj *mic_token, - u32 *qstate); + struct xdr_netobj *mic_token); u32 gss_wrap( struct gss_ctx *ctx_id, - u32 qop, int offset, struct xdr_buf *outbuf, struct page **inpages); u32 gss_unwrap( struct gss_ctx *ctx_id, - u32 *qop, int offset, struct xdr_buf *inbuf); u32 gss_delete_sec_context( @@ -67,7 +63,6 @@ char *gss_service_to_auth_domain_name(struct gss_api_mech *, u32 service); struct pf_desc { u32 pseudoflavor; - u32 qop; u32 service; char *name; char *auth_domain_name; @@ -96,23 +91,19 @@ struct gss_api_ops { struct gss_ctx *ctx_id); u32 (*gss_get_mic)( struct gss_ctx *ctx_id, - u32 qop, struct xdr_buf *message, struct xdr_netobj *mic_token); u32 (*gss_verify_mic)( struct gss_ctx *ctx_id, struct xdr_buf *message, - struct xdr_netobj *mic_token, - u32 *qstate); + struct xdr_netobj *mic_token); u32 (*gss_wrap)( struct gss_ctx *ctx_id, - u32 qop, int offset, struct xdr_buf *outbuf, struct page **inpages); u32 (*gss_unwrap)( struct gss_ctx *ctx_id, - u32 *qop, int offset, struct xdr_buf *buf); void (*gss_delete_sec_context)( diff --git a/include/linux/sunrpc/gss_err.h b/include/linux/sunrpc/gss_err.h index 92608a2e574c..a6807867bd21 100644 --- a/include/linux/sunrpc/gss_err.h +++ b/include/linux/sunrpc/gss_err.h @@ -65,16 +65,6 @@ typedef unsigned int OM_uint32; #define GSS_C_MECH_CODE 2 -/* - * Define the default Quality of Protection for per-message services. Note - * that an implementation that offers multiple levels of QOP may either reserve - * a value (for example zero, as assumed here) to mean "default protection", or - * alternatively may simply equate GSS_C_QOP_DEFAULT to a specific explicit - * QOP value. However a value of 0 should always be interpreted by a GSSAPI - * implementation as a request for the default protection level. - */ -#define GSS_C_QOP_DEFAULT 0 - /* * Expiration time of 2^32-1 seconds means infinite lifetime for a * credential or security context diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h index 7f93c2d5ebdb..a7bda4edb853 100644 --- a/include/linux/sunrpc/gss_krb5.h +++ b/include/linux/sunrpc/gss_krb5.h @@ -119,21 +119,21 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, int body_offset, struct xdr_netobj *cksum); u32 -krb5_make_token(struct krb5_ctx *context_handle, int qop_req, +krb5_make_token(struct krb5_ctx *context_handle, struct xdr_buf *input_message_buffer, struct xdr_netobj *output_message_buffer); u32 krb5_read_token(struct krb5_ctx *context_handle, struct xdr_netobj *input_token_buffer, - struct xdr_buf *message_buffer, int *qop_state); + struct xdr_buf *message_buffer); u32 -gss_wrap_kerberos(struct gss_ctx *ctx_id, u32 qop, int offset, +gss_wrap_kerberos(struct gss_ctx *ctx_id, int offset, struct xdr_buf *outbuf, struct page **pages); u32 -gss_unwrap_kerberos(struct gss_ctx *ctx_id, u32 *qop, int offset, +gss_unwrap_kerberos(struct gss_ctx *ctx_id, int offset, struct xdr_buf *buf); diff --git a/include/linux/sunrpc/gss_spkm3.h b/include/linux/sunrpc/gss_spkm3.h index b5c9968c3c17..0beb2cf00a84 100644 --- a/include/linux/sunrpc/gss_spkm3.h +++ b/include/linux/sunrpc/gss_spkm3.h @@ -41,9 +41,9 @@ struct spkm3_ctx { #define SPKM_WRAP_TOK 5 #define SPKM_DEL_TOK 6 -u32 spkm3_make_token(struct spkm3_ctx *ctx, int qop_req, struct xdr_buf * text, struct xdr_netobj * token, int toktype); +u32 spkm3_make_token(struct spkm3_ctx *ctx, struct xdr_buf * text, struct xdr_netobj * token, int toktype); -u32 spkm3_read_token(struct spkm3_ctx *ctx, struct xdr_netobj *read_token, struct xdr_buf *message_buffer, int *qop_state, int toktype); +u32 spkm3_read_token(struct spkm3_ctx *ctx, struct xdr_netobj *read_token, struct xdr_buf *message_buffer, int toktype); #define CKSUMTYPE_RSA_MD5 0x0007 diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 5e4872058ec7..f44f46f1d8e0 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -854,9 +854,7 @@ gss_marshal(struct rpc_task *task, u32 *p) *p++ = htonl(RPC_AUTH_GSS); mic.data = (u8 *)(p + 1); - maj_stat = gss_get_mic(ctx->gc_gss_ctx, - GSS_C_QOP_DEFAULT, - &verf_buf, &mic); + maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) { cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; } else if (maj_stat != 0) { @@ -888,7 +886,7 @@ gss_validate(struct rpc_task *task, u32 *p) { struct rpc_cred *cred = task->tk_msg.rpc_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); - u32 seq, qop_state; + u32 seq; struct kvec iov; struct xdr_buf verf_buf; struct xdr_netobj mic; @@ -909,7 +907,7 @@ gss_validate(struct rpc_task *task, u32 *p) mic.data = (u8 *)p; mic.len = len; - maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state); + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; if (maj_stat) @@ -961,8 +959,7 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, p = iov->iov_base + iov->iov_len; mic.data = (u8 *)(p + 1); - maj_stat = gss_get_mic(ctx->gc_gss_ctx, - GSS_C_QOP_DEFAULT, &integ_buf, &mic); + maj_stat = gss_get_mic(ctx->gc_gss_ctx, &integ_buf, &mic); status = -EIO; /* XXX? */ if (maj_stat == GSS_S_CONTEXT_EXPIRED) cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; @@ -1057,8 +1054,7 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); snd_buf->tail[0].iov_base = tmp; } - maj_stat = gss_wrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, offset, - snd_buf, inpages); + maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages); /* RPC_SLACK_SPACE should prevent this ever happening: */ BUG_ON(snd_buf->len > snd_buf->buflen); status = -EIO; @@ -1150,8 +1146,7 @@ gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset)) return status; - maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, - &mic, NULL); + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; if (maj_stat != GSS_S_COMPLETE) @@ -1176,8 +1171,7 @@ gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, /* remove padding: */ rcv_buf->len = offset + opaque_len; - maj_stat = gss_unwrap(ctx->gc_gss_ctx, NULL, - offset, rcv_buf); + maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, rcv_buf); if (maj_stat == GSS_S_CONTEXT_EXPIRED) cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; if (maj_stat != GSS_S_COMPLETE) diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 37a9ad97ccd4..9ffac2c50b94 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -193,15 +193,12 @@ gss_delete_sec_context_kerberos(void *internal_ctx) { static u32 gss_verify_mic_kerberos(struct gss_ctx *ctx, struct xdr_buf *message, - struct xdr_netobj *mic_token, - u32 *qstate) { + struct xdr_netobj *mic_token) +{ u32 maj_stat = 0; - int qop_state; struct krb5_ctx *kctx = ctx->internal_ctx_id; - maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state); - if (!maj_stat && qop_state) - *qstate = qop_state; + maj_stat = krb5_read_token(kctx, mic_token, message); dprintk("RPC: gss_verify_mic_kerberos returning %d\n", maj_stat); return maj_stat; @@ -209,13 +206,12 @@ gss_verify_mic_kerberos(struct gss_ctx *ctx, static u32 gss_get_mic_kerberos(struct gss_ctx *ctx, - u32 qop, struct xdr_buf *message, struct xdr_netobj *mic_token) { u32 err = 0; struct krb5_ctx *kctx = ctx->internal_ctx_id; - err = krb5_make_token(kctx, qop, message, mic_token); + err = krb5_make_token(kctx, message, mic_token); dprintk("RPC: gss_get_mic_kerberos returning %d\n",err); diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index fb852d9ab06f..15227c727c8b 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -71,7 +71,7 @@ #endif u32 -krb5_make_token(struct krb5_ctx *ctx, int qop_req, +krb5_make_token(struct krb5_ctx *ctx, struct xdr_buf *text, struct xdr_netobj *token) { s32 checksum_type; @@ -83,9 +83,6 @@ krb5_make_token(struct krb5_ctx *ctx, int qop_req, now = get_seconds(); - if (qop_req != 0) - goto out_err; - switch (ctx->signalg) { case SGN_ALG_DES_MAC_MD5: checksum_type = CKSUMTYPE_RSA_MD5; diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index c3d6d1bc100c..bcf978627a71 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -74,7 +74,7 @@ u32 krb5_read_token(struct krb5_ctx *ctx, struct xdr_netobj *read_token, - struct xdr_buf *message_buffer, int *qop_state) + struct xdr_buf *message_buffer) { int signalg; int sealalg; @@ -157,9 +157,6 @@ krb5_read_token(struct krb5_ctx *ctx, /* it got through unscathed. Make sure the context is unexpired */ - if (qop_state) - *qop_state = GSS_C_QOP_DEFAULT; - now = get_seconds(); ret = GSS_S_CONTEXT_EXPIRED; diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index ddcde6e42b23..af777cf9f251 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -116,7 +116,7 @@ make_confounder(char *p, int blocksize) /* XXX factor out common code with seal/unseal. */ u32 -gss_wrap_kerberos(struct gss_ctx *ctx, u32 qop, int offset, +gss_wrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf, struct page **pages) { struct krb5_ctx *kctx = ctx->internal_ctx_id; @@ -132,9 +132,6 @@ gss_wrap_kerberos(struct gss_ctx *ctx, u32 qop, int offset, now = get_seconds(); - if (qop != 0) - goto out_err; - switch (kctx->signalg) { case SGN_ALG_DES_MAC_MD5: checksum_type = CKSUMTYPE_RSA_MD5; @@ -229,8 +226,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, u32 qop, int offset, } u32 -gss_unwrap_kerberos(struct gss_ctx *ctx, u32 *qop, int offset, - struct xdr_buf *buf) +gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf) { struct krb5_ctx *kctx = ctx->internal_ctx_id; int signalg; @@ -328,9 +324,6 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, u32 *qop, int offset, /* it got through unscathed. Make sure the context is unexpired */ - if (qop) - *qop = GSS_C_QOP_DEFAULT; - now = get_seconds(); ret = GSS_S_CONTEXT_EXPIRED; diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 06d97cb3481a..b048bf672da2 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -250,13 +250,11 @@ gss_import_sec_context(const void *input_token, size_t bufsize, u32 gss_get_mic(struct gss_ctx *context_handle, - u32 qop, struct xdr_buf *message, struct xdr_netobj *mic_token) { return context_handle->mech_type->gm_ops ->gss_get_mic(context_handle, - qop, message, mic_token); } @@ -266,35 +264,31 @@ gss_get_mic(struct gss_ctx *context_handle, u32 gss_verify_mic(struct gss_ctx *context_handle, struct xdr_buf *message, - struct xdr_netobj *mic_token, - u32 *qstate) + struct xdr_netobj *mic_token) { return context_handle->mech_type->gm_ops ->gss_verify_mic(context_handle, message, - mic_token, - qstate); + mic_token); } u32 gss_wrap(struct gss_ctx *ctx_id, - u32 qop, int offset, struct xdr_buf *buf, struct page **inpages) { return ctx_id->mech_type->gm_ops - ->gss_wrap(ctx_id, qop, offset, buf, inpages); + ->gss_wrap(ctx_id, offset, buf, inpages); } u32 gss_unwrap(struct gss_ctx *ctx_id, - u32 *qop, int offset, struct xdr_buf *buf) { return ctx_id->mech_type->gm_ops - ->gss_unwrap(ctx_id, qop, offset, buf); + ->gss_unwrap(ctx_id, offset, buf); } diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c index 6c97d61baa9b..39b3edc14694 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_mech.c +++ b/net/sunrpc/auth_gss/gss_spkm3_mech.c @@ -224,18 +224,13 @@ gss_delete_sec_context_spkm3(void *internal_ctx) { static u32 gss_verify_mic_spkm3(struct gss_ctx *ctx, struct xdr_buf *signbuf, - struct xdr_netobj *checksum, - u32 *qstate) { + struct xdr_netobj *checksum) +{ u32 maj_stat = 0; - int qop_state = 0; struct spkm3_ctx *sctx = ctx->internal_ctx_id; dprintk("RPC: gss_verify_mic_spkm3 calling spkm3_read_token\n"); - maj_stat = spkm3_read_token(sctx, checksum, signbuf, &qop_state, - SPKM_MIC_TOK); - - if (!maj_stat && qop_state) - *qstate = qop_state; + maj_stat = spkm3_read_token(sctx, checksum, signbuf, SPKM_MIC_TOK); dprintk("RPC: gss_verify_mic_spkm3 returning %d\n", maj_stat); return maj_stat; @@ -243,15 +238,15 @@ gss_verify_mic_spkm3(struct gss_ctx *ctx, static u32 gss_get_mic_spkm3(struct gss_ctx *ctx, - u32 qop, struct xdr_buf *message_buffer, - struct xdr_netobj *message_token) { + struct xdr_netobj *message_token) +{ u32 err = 0; struct spkm3_ctx *sctx = ctx->internal_ctx_id; dprintk("RPC: gss_get_mic_spkm3\n"); - err = spkm3_make_token(sctx, qop, message_buffer, + err = spkm3_make_token(sctx, message_buffer, message_token, SPKM_MIC_TOK); return err; } @@ -264,8 +259,8 @@ static struct gss_api_ops gss_spkm3_ops = { }; static struct pf_desc gss_spkm3_pfs[] = { - {RPC_AUTH_GSS_SPKM, 0, RPC_GSS_SVC_NONE, "spkm3"}, - {RPC_AUTH_GSS_SPKMI, 0, RPC_GSS_SVC_INTEGRITY, "spkm3i"}, + {RPC_AUTH_GSS_SPKM, RPC_GSS_SVC_NONE, "spkm3"}, + {RPC_AUTH_GSS_SPKMI, RPC_GSS_SVC_INTEGRITY, "spkm3i"}, }; static struct gss_api_mech gss_spkm3_mech = { diff --git a/net/sunrpc/auth_gss/gss_spkm3_seal.c b/net/sunrpc/auth_gss/gss_spkm3_seal.c index 25339868d462..148201e929d0 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_seal.c +++ b/net/sunrpc/auth_gss/gss_spkm3_seal.c @@ -51,7 +51,7 @@ */ u32 -spkm3_make_token(struct spkm3_ctx *ctx, int qop_req, +spkm3_make_token(struct spkm3_ctx *ctx, struct xdr_buf * text, struct xdr_netobj * token, int toktype) { @@ -68,8 +68,6 @@ spkm3_make_token(struct spkm3_ctx *ctx, int qop_req, dprintk("RPC: spkm3_make_token\n"); now = jiffies; - if (qop_req != 0) - goto out_err; if (ctx->ctx_id.len != 16) { dprintk("RPC: spkm3_make_token BAD ctx_id.len %d\n", diff --git a/net/sunrpc/auth_gss/gss_spkm3_unseal.c b/net/sunrpc/auth_gss/gss_spkm3_unseal.c index 65ce81bf0bc4..c3c0d9586103 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_unseal.c +++ b/net/sunrpc/auth_gss/gss_spkm3_unseal.c @@ -52,7 +52,7 @@ u32 spkm3_read_token(struct spkm3_ctx *ctx, struct xdr_netobj *read_token, /* checksum */ struct xdr_buf *message_buffer, /* signbuf */ - int *qop_state, int toktype) + int toktype) { s32 code; struct xdr_netobj wire_cksum = {.len =0, .data = NULL}; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index e3308195374e..e4ada15ed856 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -566,8 +566,7 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, if (rqstp->rq_deferred) /* skip verification of revisited request */ return SVC_OK; - if (gss_verify_mic(ctx_id, &rpchdr, &checksum, NULL) - != GSS_S_COMPLETE) { + if (gss_verify_mic(ctx_id, &rpchdr, &checksum) != GSS_S_COMPLETE) { *authp = rpcsec_gsserr_credproblem; return SVC_DENIED; } @@ -604,7 +603,7 @@ gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) xdr_buf_from_iov(&iov, &verf_data); p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len; mic.data = (u8 *)(p + 1); - maj_stat = gss_get_mic(ctx_id, 0, &verf_data, &mic); + maj_stat = gss_get_mic(ctx_id, &verf_data, &mic); if (maj_stat != GSS_S_COMPLETE) return -1; *p++ = htonl(mic.len); @@ -710,7 +709,7 @@ unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) goto out; if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len)) goto out; - maj_stat = gss_verify_mic(ctx, &integ_buf, &mic, NULL); + maj_stat = gss_verify_mic(ctx, &integ_buf, &mic); if (maj_stat != GSS_S_COMPLETE) goto out; if (ntohl(svc_getu32(&buf->head[0])) != seq) @@ -1012,7 +1011,7 @@ svcauth_gss_release(struct svc_rqst *rqstp) resv = &resbuf->tail[0]; } mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; - if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic)) + if (gss_get_mic(gsd->rsci->mechctx, &integ_buf, &mic)) goto out_err; svc_putu32(resv, htonl(mic.len)); memset(mic.data + mic.len, 0, From a0857d03b21fa54653c9d2fe7a315381176015b4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:55:23 -0400 Subject: [PATCH 072/126] RPCSEC_GSS: krb5 cleanup Remove some senseless wrappers. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/sunrpc/gss_krb5.h | 12 ++++-------- net/sunrpc/auth_gss/gss_krb5_mech.c | 28 --------------------------- net/sunrpc/auth_gss/gss_krb5_seal.c | 5 +++-- net/sunrpc/auth_gss/gss_krb5_unseal.c | 6 +++--- 4 files changed, 10 insertions(+), 41 deletions(-) diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h index a7bda4edb853..2c3601d31045 100644 --- a/include/linux/sunrpc/gss_krb5.h +++ b/include/linux/sunrpc/gss_krb5.h @@ -118,15 +118,11 @@ s32 make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, int body_offset, struct xdr_netobj *cksum); -u32 -krb5_make_token(struct krb5_ctx *context_handle, - struct xdr_buf *input_message_buffer, - struct xdr_netobj *output_message_buffer); +u32 gss_get_mic_kerberos(struct gss_ctx *, struct xdr_buf *, + struct xdr_netobj *); -u32 -krb5_read_token(struct krb5_ctx *context_handle, - struct xdr_netobj *input_token_buffer, - struct xdr_buf *message_buffer); +u32 gss_verify_mic_kerberos(struct gss_ctx *, struct xdr_buf *, + struct xdr_netobj *); u32 gss_wrap_kerberos(struct gss_ctx *ctx_id, int offset, diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 9ffac2c50b94..5f1f806a0b11 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -190,34 +190,6 @@ gss_delete_sec_context_kerberos(void *internal_ctx) { kfree(kctx); } -static u32 -gss_verify_mic_kerberos(struct gss_ctx *ctx, - struct xdr_buf *message, - struct xdr_netobj *mic_token) -{ - u32 maj_stat = 0; - struct krb5_ctx *kctx = ctx->internal_ctx_id; - - maj_stat = krb5_read_token(kctx, mic_token, message); - - dprintk("RPC: gss_verify_mic_kerberos returning %d\n", maj_stat); - return maj_stat; -} - -static u32 -gss_get_mic_kerberos(struct gss_ctx *ctx, - struct xdr_buf *message, - struct xdr_netobj *mic_token) { - u32 err = 0; - struct krb5_ctx *kctx = ctx->internal_ctx_id; - - err = krb5_make_token(kctx, message, mic_token); - - dprintk("RPC: gss_get_mic_kerberos returning %d\n",err); - - return err; -} - static struct gss_api_ops gss_kerberos_ops = { .gss_import_sec_context = gss_import_sec_context_kerberos, .gss_get_mic = gss_get_mic_kerberos, diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 15227c727c8b..13f8ae979454 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -71,9 +71,10 @@ #endif u32 -krb5_make_token(struct krb5_ctx *ctx, - struct xdr_buf *text, struct xdr_netobj *token) +gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, + struct xdr_netobj *token) { + struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; s32 checksum_type; struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; unsigned char *ptr, *krb5_hdr, *msg_start; diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index bcf978627a71..2030475d98ed 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -72,10 +72,10 @@ * supposedly taken over. */ u32 -krb5_read_token(struct krb5_ctx *ctx, - struct xdr_netobj *read_token, - struct xdr_buf *message_buffer) +gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, + struct xdr_buf *message_buffer, struct xdr_netobj *read_token) { + struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; int signalg; int sealalg; s32 checksum_type; From 4e51336a00bdcb42960acca52c23e90e9f4e6959 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Oct 2005 14:22:41 -0700 Subject: [PATCH 073/126] NFSv4: Final tweak to sequence id Sacrifice queueing fairness for performance. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4state.c | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 45bff1d1a513..5c0dd26d0985 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -110,8 +110,8 @@ struct nfs_seqid_counter { }; struct nfs_seqid { - struct list_head list; struct nfs_seqid_counter *sequence; + struct list_head list; }; static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index da0861db57fb..c59ef90e956b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -670,15 +670,12 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) { - struct rpc_sequence *sequence = counter->sequence; struct nfs_seqid *new; new = kmalloc(sizeof(*new), GFP_KERNEL); if (new != NULL) { new->sequence = counter; - spin_lock(&sequence->lock); - list_add_tail(&new->list, &sequence->list); - spin_unlock(&sequence->lock); + INIT_LIST_HEAD(&new->list); } return new; } @@ -687,10 +684,12 @@ void nfs_free_seqid(struct nfs_seqid *seqid) { struct rpc_sequence *sequence = seqid->sequence->sequence; - spin_lock(&sequence->lock); - list_del(&seqid->list); - rpc_wake_up(&sequence->wait); - spin_unlock(&sequence->lock); + if (!list_empty(&seqid->list)) { + spin_lock(&sequence->lock); + list_del(&seqid->list); + spin_unlock(&sequence->lock); + } + rpc_wake_up_next(&sequence->wait); kfree(seqid); } @@ -746,12 +745,16 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) struct rpc_sequence *sequence = seqid->sequence->sequence; int status = 0; + if (sequence->list.next == &seqid->list) + goto out; spin_lock(&sequence->lock); - if (sequence->list.next != &seqid->list) { + if (!list_empty(&sequence->list)) { rpc_sleep_on(&sequence->wait, task, NULL, NULL); status = -EAGAIN; - } + } else + list_add(&seqid->list, &sequence->list); spin_unlock(&sequence->lock); +out: return status; } From ec073428281b401f1142cb84b277a5b00c7994c9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Oct 2005 14:22:47 -0700 Subject: [PATCH 074/126] NFSv4: Fix up locking for nfs4_state_owner Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4proc.c | 2 ++ fs/nfs/nfs4state.c | 20 +++++++++++++++----- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 5c0dd26d0985..78a53f5a9f18 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -126,6 +126,7 @@ static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status * semantics by allowing the server to identify replayed requests. */ struct nfs4_state_owner { + spinlock_t so_lock; struct list_head so_list; /* per-clientid list of state_owners */ struct nfs4_client *so_client; u32 so_id; /* 32-bit identifier, unique */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2d9357cf3fc5..9c1da34036aa 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -212,6 +212,7 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, open_flags &= (FMODE_READ|FMODE_WRITE); /* Protect against nfs4_find_state() */ + spin_lock(&state->owner->so_lock); spin_lock(&inode->i_lock); state->state |= open_flags; /* NB! List reordering - see the reclaim code for why. */ @@ -221,6 +222,7 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, state->nreaders++; memcpy(&state->stateid, stateid, sizeof(state->stateid)); spin_unlock(&inode->i_lock); + spin_unlock(&state->owner->so_lock); } /* diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index c59ef90e956b..2d5a6a2b9dec 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -267,6 +267,7 @@ nfs4_alloc_state_owner(void) sp = kzalloc(sizeof(*sp),GFP_KERNEL); if (!sp) return NULL; + spin_lock_init(&sp->so_lock); INIT_LIST_HEAD(&sp->so_states); INIT_LIST_HEAD(&sp->so_delegations); rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); @@ -438,20 +439,23 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) if (state) goto out; new = nfs4_alloc_open_state(); + spin_lock(&owner->so_lock); spin_lock(&inode->i_lock); state = __nfs4_find_state_byowner(inode, owner); if (state == NULL && new != NULL) { state = new; - /* Note: The reclaim code dictates that we add stateless - * and read-only stateids to the end of the list */ - list_add_tail(&state->open_states, &owner->so_states); state->owner = owner; atomic_inc(&owner->so_count); list_add(&state->inode_states, &nfsi->open_states); state->inode = igrab(inode); spin_unlock(&inode->i_lock); + /* Note: The reclaim code dictates that we add stateless + * and read-only stateids to the end of the list */ + list_add_tail(&state->open_states, &owner->so_states); + spin_unlock(&owner->so_lock); } else { spin_unlock(&inode->i_lock); + spin_unlock(&owner->so_lock); if (new) nfs4_free_open_state(new); } @@ -468,12 +472,14 @@ void nfs4_put_open_state(struct nfs4_state *state) struct inode *inode = state->inode; struct nfs4_state_owner *owner = state->owner; - if (!atomic_dec_and_lock(&state->count, &inode->i_lock)) + if (!atomic_dec_and_lock(&state->count, &owner->so_lock)) return; + spin_lock(&inode->i_lock); if (!list_empty(&state->inode_states)) list_del(&state->inode_states); - spin_unlock(&inode->i_lock); list_del(&state->open_states); + spin_unlock(&inode->i_lock); + spin_unlock(&owner->so_lock); iput(inode); BUG_ON (state->state != 0); nfs4_free_open_state(state); @@ -491,6 +497,7 @@ void nfs4_close_state(struct nfs4_state *state, mode_t mode) atomic_inc(&owner->so_count); /* Protect against nfs4_find_state() */ + spin_lock(&owner->so_lock); spin_lock(&inode->i_lock); if (mode & FMODE_READ) state->nreaders--; @@ -503,6 +510,7 @@ void nfs4_close_state(struct nfs4_state *state, mode_t mode) list_move_tail(&state->open_states, &owner->so_states); } spin_unlock(&inode->i_lock); + spin_unlock(&owner->so_lock); newstate = 0; if (state->state != 0) { if (state->nreaders) @@ -899,6 +907,7 @@ static void nfs4_state_mark_reclaim(struct nfs4_client *clp) list_for_each_entry(sp, &clp->cl_state_owners, so_list) { sp->so_seqid.counter = 0; sp->so_seqid.flags = 0; + spin_lock(&sp->so_lock); list_for_each_entry(state, &sp->so_states, open_states) { list_for_each_entry(lock, &state->lock_states, ls_locks) { lock->ls_seqid.counter = 0; @@ -906,6 +915,7 @@ static void nfs4_state_mark_reclaim(struct nfs4_client *clp) lock->ls_flags &= ~NFS_LOCK_INITIALIZED; } } + spin_unlock(&sp->so_lock); } } From a362f463a6d316d14daed0f817e151835ce97ff7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 27 Oct 2005 09:07:33 -0700 Subject: [PATCH 075/126] Revert "remove false BUG_ON() from run_posix_cpu_timers()" This reverts commit 3de463c7d9d58f8cf3395268230cb20a4c15bffa. Roland has another patch that allows us to leave the BUG_ON() in place by just making sure that the condition it tests for really is always true. That goes in next. Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++++++ kernel/posix-cpu-timers.c | 36 ++++++++++++++++++------------------ 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 4897977a1f4b..3b25b182d2be 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -825,6 +825,14 @@ fastcall NORET_TYPE void do_exit(long code) tsk->flags |= PF_EXITING; + /* + * Make sure we don't try to process any timer firings + * while we are already exiting. + */ + tsk->it_virt_expires = cputime_zero; + tsk->it_prof_expires = cputime_zero; + tsk->it_sched_expires = 0; + if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, current->pid, diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 383ba22f0b62..ea1aca5e7c2b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1293,30 +1293,30 @@ void run_posix_cpu_timers(struct task_struct *tsk) #undef UNEXPIRED + BUG_ON(tsk->exit_state); + /* * Double-check with locks held. */ read_lock(&tasklist_lock); - if (likely(tsk->signal != NULL)) { - spin_lock(&tsk->sighand->siglock); + spin_lock(&tsk->sighand->siglock); - /* - * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] - * all the timers that are firing, and put them on the firing list. - */ - check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); + /* + * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] + * all the timers that are firing, and put them on the firing list. + */ + check_thread_timers(tsk, &firing); + check_process_timers(tsk, &firing); - /* - * We must release these locks before taking any timer's lock. - * There is a potential race with timer deletion here, as the - * siglock now protects our private firing list. We have set - * the firing flag in each timer, so that a deletion attempt - * that gets the timer lock before we do will give it up and - * spin until we've taken care of that timer below. - */ - spin_unlock(&tsk->sighand->siglock); - } + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + spin_unlock(&tsk->sighand->siglock); read_unlock(&tasklist_lock); /* From 72ab373a5688a78cbdaf3bf96012e597d5399bb7 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 27 Oct 2005 03:16:42 -0700 Subject: [PATCH 076/126] [PATCH] Yet more posix-cpu-timer fixes This just makes sure that a thread's expiry times can't get reset after it clears them in do_exit. This is what allowed us to re-introduce the stricter BUG_ON() check in a362f463a6d316d14daed0f817e151835ce97ff7. Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index ea1aca5e7c2b..bf374fceb39c 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -497,7 +497,7 @@ static void process_timer_rebalance(struct task_struct *p, left = cputime_div(cputime_sub(expires.cpu, val.cpu), nthreads); do { - if (!unlikely(t->exit_state)) { + if (!unlikely(t->flags & PF_EXITING)) { ticks = cputime_add(prof_ticks(t), left); if (cputime_eq(t->it_prof_expires, cputime_zero) || @@ -512,7 +512,7 @@ static void process_timer_rebalance(struct task_struct *p, left = cputime_div(cputime_sub(expires.cpu, val.cpu), nthreads); do { - if (!unlikely(t->exit_state)) { + if (!unlikely(t->flags & PF_EXITING)) { ticks = cputime_add(virt_ticks(t), left); if (cputime_eq(t->it_virt_expires, cputime_zero) || @@ -527,7 +527,7 @@ static void process_timer_rebalance(struct task_struct *p, nsleft = expires.sched - val.sched; do_div(nsleft, nthreads); do { - if (!unlikely(t->exit_state)) { + if (!unlikely(t->flags & PF_EXITING)) { ns = t->sched_time + nsleft; if (t->it_sched_expires == 0 || t->it_sched_expires > ns) { @@ -566,6 +566,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) struct cpu_timer_list *next; unsigned long i; + if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING)) + return; + head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? p->cpu_timers : p->signal->cpu_timers); head += CPUCLOCK_WHICH(timer->it_clock); @@ -1204,7 +1207,7 @@ static void check_process_timers(struct task_struct *tsk, do { t = next_thread(t); - } while (unlikely(t->exit_state)); + } while (unlikely(t->flags & PF_EXITING)); } while (t != tsk); } } From 2ad41065d9fe518759b695fc2640cf9c07261dd2 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 27 Oct 2005 18:47:46 +1000 Subject: [PATCH 077/126] [TCP]: Clear stale pred_flags when snd_wnd changes This bug is responsible for causing the infamous "Treason uncloaked" messages that's been popping up everywhere since the printk was added. It has usually been blamed on foreign operating systems. However, some of those reports implicate Linux as both systems are running Linux or the TCP connection is going across the loopback interface. In fact, there really is a bug in the Linux TCP header prediction code that's been there since at least 2.1.8. This bug was tracked down with help from Dale Blount. The effect of this bug ranges from harmless "Treason uncloaked" messages to hung/aborted TCP connections. The details of the bug and fix is as follows. When snd_wnd is updated, we only update pred_flags if tcp_fast_path_check succeeds. When it fails (for example, when our rcvbuf is used up), we will leave pred_flags with an out-of-date snd_wnd value. When the out-of-date pred_flags happens to match the next incoming packet we will again hit the fast path and use the current snd_wnd which will be wrong. In the case of the treason messages, it just happens that the snd_wnd cached in pred_flags is zero while tp->snd_wnd is non-zero. Therefore when a zero-window packet comes in we incorrectly conclude that the window is non-zero. In fact if the peer continues to send us zero-window pure ACKs we will continue making the same mistake. It's only when the peer transmits a zero-window packet with data attached that we get a chance to snap out of it. This is what triggers the treason message at the next retransmit timeout. Signed-off-by: Herbert Xu Signed-off-by: Arnaldo Carvalho de Melo --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 677419d0c9ad..3e98b57578dc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2239,6 +2239,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, /* Note, it is the only place, where * fast path is recovered for sending TCP. */ + tp->pred_flags = 0; tcp_fast_path_check(sk, tp); if (nwin > tp->max_window) { From 6fa05b17367f04ada501e89d3b9cb56adec0d930 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 19:08:18 -0400 Subject: [PATCH 078/126] Revert "RPC: stops the release_pipe() funtion from being called twice" This reverts 747c5534c9a6da4aa87e7cdc2209ea98ea27f381 commit. --- net/sunrpc/rpc_pipe.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 649d609e7d94..ded6c63f11ec 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -177,8 +177,6 @@ rpc_pipe_release(struct inode *inode, struct file *filp) __rpc_purge_upcall(inode, -EPIPE); if (rpci->ops->release_pipe) rpci->ops->release_pipe(inode); - if (!rpci->nreaders && !rpci->nwriters) - rpci->ops = NULL; out: up(&inode->i_sem); return 0; From 34123da66e613602de5a886b05c875b6a91b8ed2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 19:10:09 -0400 Subject: [PATCH 079/126] NFS: Fix a bad cast in nfs3_read_done Signed-off-by: Trond Myklebust --- fs/nfs/nfs3proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index df80477c5af4..e4a1cd48195e 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -735,7 +735,7 @@ extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int); static void nfs3_read_done(struct rpc_task *task) { - struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; + struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata; if (nfs3_async_handle_jukebox(task)) return; From 79b95a454bb5c1d9b7287d1016a70885ba3f346c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 27 Oct 2005 16:28:39 -0700 Subject: [PATCH 080/126] Revert "x86-64: Avoid unnecessary double bouncing for swiotlb" Commit id 6142891a0c0209c91aa4a98f725de0d6e2ed4918 Andi Kleen reports that it seems to break things for some people, and since it's purely a small optimization, revert it for now. Signed-off-by: Linus Torvalds --- include/asm-x86_64/pci.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/asm-x86_64/pci.h b/include/asm-x86_64/pci.h index 5a82a6762c21..eeb3088a1c9e 100644 --- a/include/asm-x86_64/pci.h +++ b/include/asm-x86_64/pci.h @@ -50,10 +50,10 @@ extern int iommu_setup(char *opt); * address space. The networking and block device layers use * this boolean for bounce buffer decisions * - * On x86-64 it mostly equals, but we set it to zero to tell some subsystems - * that an hard or soft IOMMU is available. + * On AMD64 it mostly equals, but we set it to zero to tell some subsystems + * that an IOMMU is available. */ -#define PCI_DMA_BUS_IS_PHYS 0 +#define PCI_DMA_BUS_IS_PHYS (no_iommu ? 1 : 0) /* * x86-64 always supports DAC, but sometimes it is useful to force From 927321440976d0781a252eefe686ae6b0f236ae2 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 27 Oct 2005 16:16:25 -0700 Subject: [PATCH 081/126] [PATCH] cpufreq: SMP fix for conservative governor Don't try to access not-present CPUs. Conservative governor will always oops on SMP without this fix. Fixes http://bugzilla.kernel.org/show_bug.cgi?id=4781 Signed-off-by: Venkatesh Pallipadi Signed-off-by: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/cpufreq/cpufreq_conservative.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index e1df376e709e..2ed5c4363b53 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -315,9 +315,9 @@ static void dbs_check_cpu(int cpu) policy = this_dbs_info->cur_policy; if ( init_flag == 0 ) { - for ( /* NULL */; init_flag < NR_CPUS; init_flag++ ) { - dbs_info = &per_cpu(cpu_dbs_info, init_flag); - requested_freq[cpu] = dbs_info->cur_policy->cur; + for_each_online_cpu(j) { + dbs_info = &per_cpu(cpu_dbs_info, j); + requested_freq[j] = dbs_info->cur_policy->cur; } init_flag = 1; } From 741b2252a5e14d6c60a913c77a6099abe73a854a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 27 Oct 2005 17:02:08 -0700 Subject: [PATCH 082/126] Linux v2.6.14 "Better late than never" --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4a7000e353b5..1fa7e5343464 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 14 -EXTRAVERSION =-rc5 +EXTRAVERSION = NAME=Affluent Albatross # *DOCUMENTATION* From 0e574af1be5f569a5d7f2800333b0bfb358a5e34 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:38 -0400 Subject: [PATCH 083/126] NFS: Cleanup initialisation of struct nfs_fattr Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 1 + fs/nfs/nfs3proc.c | 58 +++++++++++++++++++++--------------------- fs/nfs/nfs4proc.c | 43 +++++++++++++++---------------- fs/nfs/proc.c | 26 +++++++++---------- fs/nfs/read.c | 1 + fs/nfs/write.c | 2 ++ include/linux/nfs_fs.h | 5 ++++ 7 files changed, 71 insertions(+), 65 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index eb50c19fc253..b8a73045e9a0 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -532,6 +532,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) my_entry.eof = 0; my_entry.fh = &fh; my_entry.fattr = &fattr; + nfs_fattr_init(&fattr); desc->entry = &my_entry; while(!desc->entry->eof) { diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index e4a1cd48195e..4b1b48b139f6 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -78,7 +78,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, int status; dprintk("%s: call fsinfo\n", __FUNCTION__); - info->fattr->valid = 0; + nfs_fattr_init(info->fattr); status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0); dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status); if (!(info->fattr->valid & NFS_ATTR_FATTR)) { @@ -98,7 +98,7 @@ nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, int status; dprintk("NFS call getattr\n"); - fattr->valid = 0; + nfs_fattr_init(fattr); status = rpc_call(server->client, NFS3PROC_GETATTR, fhandle, fattr, 0); dprintk("NFS reply getattr: %d\n", status); @@ -117,7 +117,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, int status; dprintk("NFS call setattr\n"); - fattr->valid = 0; + nfs_fattr_init(fattr); status = rpc_call(NFS_CLIENT(inode), NFS3PROC_SETATTR, &arg, fattr, 0); if (status == 0) nfs_setattr_update_inode(inode, sattr); @@ -143,8 +143,8 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name, int status; dprintk("NFS call lookup %s\n", name->name); - dir_attr.valid = 0; - fattr->valid = 0; + nfs_fattr_init(&dir_attr); + nfs_fattr_init(fattr); status = rpc_call(NFS_CLIENT(dir), NFS3PROC_LOOKUP, &arg, &res, 0); if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) status = rpc_call(NFS_CLIENT(dir), NFS3PROC_GETATTR, @@ -174,7 +174,6 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) int status; dprintk("NFS call access\n"); - fattr.valid = 0; if (mode & MAY_READ) arg.access |= NFS3_ACCESS_READ; @@ -189,6 +188,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) if (mode & MAY_EXEC) arg.access |= NFS3_ACCESS_EXECUTE; } + nfs_fattr_init(&fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); nfs_refresh_inode(inode, &fattr); if (status == 0) { @@ -217,7 +217,7 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page, int status; dprintk("NFS call readlink\n"); - fattr.valid = 0; + nfs_fattr_init(&fattr); status = rpc_call(NFS_CLIENT(inode), NFS3PROC_READLINK, &args, &fattr, 0); nfs_refresh_inode(inode, &fattr); @@ -240,7 +240,7 @@ static int nfs3_proc_read(struct nfs_read_data *rdata) dprintk("NFS call read %d @ %Ld\n", rdata->args.count, (long long) rdata->args.offset); - fattr->valid = 0; + nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags); if (status >= 0) nfs_refresh_inode(inode, fattr); @@ -263,7 +263,7 @@ static int nfs3_proc_write(struct nfs_write_data *wdata) dprintk("NFS call write %d @ %Ld\n", wdata->args.count, (long long) wdata->args.offset); - fattr->valid = 0; + nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, rpcflags); if (status >= 0) nfs_refresh_inode(inode, fattr); @@ -285,7 +285,7 @@ static int nfs3_proc_commit(struct nfs_write_data *cdata) dprintk("NFS call commit %d @ %Ld\n", cdata->args.count, (long long) cdata->args.offset); - fattr->valid = 0; + nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status >= 0) nfs_refresh_inode(inode, fattr); @@ -329,8 +329,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, sattr->ia_mode &= ~current->fs->umask; again: - dir_attr.valid = 0; - fattr.valid = 0; + nfs_fattr_init(&dir_attr); + nfs_fattr_init(&fattr); status = rpc_call(NFS_CLIENT(dir), NFS3PROC_CREATE, &arg, &res, 0); nfs_refresh_inode(dir, &dir_attr); @@ -401,7 +401,7 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name) int status; dprintk("NFS call remove %s\n", name->name); - dir_attr.valid = 0; + nfs_fattr_init(&dir_attr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_refresh_inode(dir, &dir_attr); dprintk("NFS reply remove: %d\n", status); @@ -422,7 +422,7 @@ nfs3_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr ptr->arg.fh = NFS_FH(dir->d_inode); ptr->arg.name = name->name; ptr->arg.len = name->len; - ptr->res.valid = 0; + nfs_fattr_init(&ptr->res); msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE]; msg->rpc_argp = &ptr->arg; msg->rpc_resp = &ptr->res; @@ -465,8 +465,8 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, int status; dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); - old_dir_attr.valid = 0; - new_dir_attr.valid = 0; + nfs_fattr_init(&old_dir_attr); + nfs_fattr_init(&new_dir_attr); status = rpc_call(NFS_CLIENT(old_dir), NFS3PROC_RENAME, &arg, &res, 0); nfs_refresh_inode(old_dir, &old_dir_attr); nfs_refresh_inode(new_dir, &new_dir_attr); @@ -491,8 +491,8 @@ nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) int status; dprintk("NFS call link %s\n", name->name); - dir_attr.valid = 0; - fattr.valid = 0; + nfs_fattr_init(&dir_attr); + nfs_fattr_init(&fattr); status = rpc_call(NFS_CLIENT(inode), NFS3PROC_LINK, &arg, &res, 0); nfs_refresh_inode(dir, &dir_attr); nfs_refresh_inode(inode, &fattr); @@ -524,8 +524,8 @@ nfs3_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path, if (path->len > NFS3_MAXPATHLEN) return -ENAMETOOLONG; dprintk("NFS call symlink %s -> %s\n", name->name, path->name); - dir_attr.valid = 0; - fattr->valid = 0; + nfs_fattr_init(&dir_attr); + nfs_fattr_init(fattr); status = rpc_call(NFS_CLIENT(dir), NFS3PROC_SYMLINK, &arg, &res, 0); nfs_refresh_inode(dir, &dir_attr); dprintk("NFS reply symlink: %d\n", status); @@ -552,11 +552,11 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) int status; dprintk("NFS call mkdir %s\n", dentry->d_name.name); - dir_attr.valid = 0; - fattr.valid = 0; sattr->ia_mode &= ~current->fs->umask; + nfs_fattr_init(&dir_attr); + nfs_fattr_init(&fattr); status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKDIR, &arg, &res, 0); nfs_refresh_inode(dir, &dir_attr); if (status != 0) @@ -582,7 +582,7 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name) int status; dprintk("NFS call rmdir %s\n", name->name); - dir_attr.valid = 0; + nfs_fattr_init(&dir_attr); status = rpc_call(NFS_CLIENT(dir), NFS3PROC_RMDIR, &arg, &dir_attr, 0); nfs_refresh_inode(dir, &dir_attr); dprintk("NFS reply rmdir: %d\n", status); @@ -634,7 +634,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, dprintk("NFS call readdir%s %d\n", plus? "plus" : "", (unsigned int) cookie); - dir_attr.valid = 0; + nfs_fattr_init(&dir_attr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_refresh_inode(dir, &dir_attr); dprintk("NFS reply readdir: %d\n", status); @@ -676,8 +676,8 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, sattr->ia_mode &= ~current->fs->umask; - dir_attr.valid = 0; - fattr.valid = 0; + nfs_fattr_init(&dir_attr); + nfs_fattr_init(&fattr); status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKNOD, &arg, &res, 0); nfs_refresh_inode(dir, &dir_attr); if (status != 0) @@ -698,7 +698,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, int status; dprintk("NFS call fsstat\n"); - stat->fattr->valid = 0; + nfs_fattr_init(stat->fattr); status = rpc_call(server->client, NFS3PROC_FSSTAT, fhandle, stat, 0); dprintk("NFS reply statfs: %d\n", status); return status; @@ -711,7 +711,7 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, int status; dprintk("NFS call fsinfo\n"); - info->fattr->valid = 0; + nfs_fattr_init(info->fattr); status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0); dprintk("NFS reply fsinfo: %d\n", status); return status; @@ -724,7 +724,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, int status; dprintk("NFS call pathconf\n"); - info->fattr->valid = 0; + nfs_fattr_init(info->fattr); status = rpc_call(server->client, NFS3PROC_PATHCONF, fhandle, info, 0); dprintk("NFS reply pathconf: %d\n", status); return status; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9c1da34036aa..2a759e8e387c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -494,9 +494,7 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st struct inode *inode = state->inode; struct nfs_server *server = NFS_SERVER(dir); struct nfs_delegation *delegation = NFS_I(inode)->delegation; - struct nfs_fattr f_attr = { - .valid = 0, - }; + struct nfs_fattr f_attr; struct nfs_openargs o_arg = { .fh = NFS_FH(dir), .open_flags = state->state, @@ -522,6 +520,7 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st status = -ENOMEM; if (o_arg.seqid == NULL) goto out; + nfs_fattr_init(&f_attr); status = _nfs4_proc_open(dir, sp, &o_arg, &o_res); if (status != 0) goto out_nodeleg; @@ -692,9 +691,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st struct nfs4_client *clp = server->nfs4_state; struct inode *inode = NULL; int status; - struct nfs_fattr f_attr = { - .valid = 0, - }; + struct nfs_fattr f_attr; struct nfs_openargs o_arg = { .fh = NFS_FH(dir), .open_flags = flags, @@ -726,6 +723,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); if (o_arg.seqid == NULL) return -ENOMEM; + nfs_fattr_init(&f_attr); status = _nfs4_proc_open(dir, sp, &o_arg, &o_res); if (status != 0) goto out_err; @@ -824,7 +822,7 @@ static int _nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr, }; int status; - fattr->valid = 0; + nfs_fattr_init(fattr); if (state != NULL) { msg.rpc_cred = state->owner->so_cred; @@ -1107,13 +1105,12 @@ static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fh static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { - struct nfs_fattr * fattr = info->fattr; struct nfs4_lookup_root_arg args = { .bitmask = nfs4_fattr_bitmap, }; struct nfs4_lookup_res res = { .server = server, - .fattr = fattr, + .fattr = info->fattr, .fh = fhandle, }; struct rpc_message msg = { @@ -1121,7 +1118,7 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_argp = &args, .rpc_resp = &res, }; - fattr->valid = 0; + nfs_fattr_init(info->fattr); return rpc_call_sync(server->client, &msg, 0); } @@ -1184,7 +1181,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, q.len = p - q.name; do { - fattr->valid = 0; + nfs_fattr_init(fattr); status = nfs4_handle_exception(server, rpc_call_sync(server->client, &msg, 0), &exception); @@ -1221,7 +1218,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; - fattr->valid = 0; + nfs_fattr_init(fattr); return rpc_call_sync(server->client, &msg, 0); } @@ -1263,7 +1260,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct nfs4_state *state; int status; - fattr->valid = 0; + nfs_fattr_init(fattr); cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0); if (IS_ERR(cred)) @@ -1309,7 +1306,7 @@ static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name, .rpc_resp = &res, }; - fattr->valid = 0; + nfs_fattr_init(fattr); dprintk("NFS call lookup %s\n", name->name); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); @@ -1458,7 +1455,7 @@ static int _nfs4_proc_read(struct nfs_read_data *rdata) dprintk("NFS call read %d @ %Ld\n", rdata->args.count, (long long) rdata->args.offset); - fattr->valid = 0; + nfs_fattr_init(fattr); status = rpc_call_sync(server->client, &msg, flags); if (!status) renew_lease(server, timestamp); @@ -1495,7 +1492,7 @@ static int _nfs4_proc_write(struct nfs_write_data *wdata) dprintk("NFS call write %d @ %Ld\n", wdata->args.count, (long long) wdata->args.offset); - fattr->valid = 0; + nfs_fattr_init(fattr); status = rpc_call_sync(server->client, &msg, rpcflags); dprintk("NFS reply write: %d\n", status); return status; @@ -1529,7 +1526,7 @@ static int _nfs4_proc_commit(struct nfs_write_data *cdata) dprintk("NFS call commit %d @ %Ld\n", cdata->args.count, (long long) cdata->args.offset); - fattr->valid = 0; + nfs_fattr_init(fattr); status = rpc_call_sync(server->client, &msg, 0); dprintk("NFS reply commit: %d\n", status); return status; @@ -1769,7 +1766,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name, if (path->len > NFS4_MAXPATHLEN) return -ENAMETOOLONG; arg.u.symlink = path; - fattr->valid = 0; + nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); if (!status) @@ -1818,7 +1815,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, }; int status; - fattr.valid = 0; + nfs_fattr_init(&fattr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); if (!status) { @@ -1916,7 +1913,7 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, int status; int mode = sattr->ia_mode; - fattr.valid = 0; + nfs_fattr_init(&fattr); BUG_ON(!(sattr->ia_valid & ATTR_MODE)); BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); @@ -1969,7 +1966,7 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = fsstat, }; - fsstat->fattr->valid = 0; + nfs_fattr_init(fsstat->fattr); return rpc_call_sync(server->client, &msg, 0); } @@ -2016,7 +2013,7 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) { - fsinfo->fattr->valid = 0; + nfs_fattr_init(fsinfo->fattr); return nfs4_do_fsinfo(server, fhandle, fsinfo); } @@ -2039,7 +2036,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle return 0; } - pathconf->fattr->valid = 0; + nfs_fattr_init(pathconf->fattr); return rpc_call_sync(server->client, &msg, 0); } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 8fef86523d7f..5ef28f08f424 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -61,7 +61,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, int status; dprintk("%s: call getattr\n", __FUNCTION__); - fattr->valid = 0; + nfs_fattr_init(fattr); status = rpc_call(server->client_sys, NFSPROC_GETATTR, fhandle, fattr, 0); dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); if (status) @@ -93,7 +93,7 @@ nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, int status; dprintk("NFS call getattr\n"); - fattr->valid = 0; + nfs_fattr_init(fattr); status = rpc_call(server->client, NFSPROC_GETATTR, fhandle, fattr, 0); dprintk("NFS reply getattr: %d\n", status); @@ -112,7 +112,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, int status; dprintk("NFS call setattr\n"); - fattr->valid = 0; + nfs_fattr_init(fattr); status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0); if (status == 0) nfs_setattr_update_inode(inode, sattr); @@ -136,7 +136,7 @@ nfs_proc_lookup(struct inode *dir, struct qstr *name, int status; dprintk("NFS call lookup %s\n", name->name); - fattr->valid = 0; + nfs_fattr_init(fattr); status = rpc_call(NFS_CLIENT(dir), NFSPROC_LOOKUP, &arg, &res, 0); dprintk("NFS reply lookup: %d\n", status); return status; @@ -174,7 +174,7 @@ static int nfs_proc_read(struct nfs_read_data *rdata) dprintk("NFS call read %d @ %Ld\n", rdata->args.count, (long long) rdata->args.offset); - fattr->valid = 0; + nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags); if (status >= 0) { nfs_refresh_inode(inode, fattr); @@ -203,7 +203,7 @@ static int nfs_proc_write(struct nfs_write_data *wdata) dprintk("NFS call write %d @ %Ld\n", wdata->args.count, (long long) wdata->args.offset); - fattr->valid = 0; + nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags); if (status >= 0) { nfs_refresh_inode(inode, fattr); @@ -232,7 +232,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, }; int status; - fattr.valid = 0; + nfs_fattr_init(&fattr); dprintk("NFS call create %s\n", dentry->d_name.name); status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0); if (status == 0) @@ -273,12 +273,12 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */ } - fattr.valid = 0; + nfs_fattr_init(&fattr); status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0); if (status == -EINVAL && S_ISFIFO(mode)) { sattr->ia_mode = mode; - fattr.valid = 0; + nfs_fattr_init(&fattr); status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0); } if (status == 0) @@ -391,7 +391,7 @@ nfs_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path, if (path->len > NFS2_MAXPATHLEN) return -ENAMETOOLONG; dprintk("NFS call symlink %s -> %s\n", name->name, path->name); - fattr->valid = 0; + nfs_fattr_init(fattr); fhandle->size = 0; status = rpc_call(NFS_CLIENT(dir), NFSPROC_SYMLINK, &arg, NULL, 0); dprintk("NFS reply symlink: %d\n", status); @@ -416,7 +416,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) int status; dprintk("NFS call mkdir %s\n", dentry->d_name.name); - fattr.valid = 0; + nfs_fattr_init(&fattr); status = rpc_call(NFS_CLIENT(dir), NFSPROC_MKDIR, &arg, &res, 0); if (status == 0) status = nfs_instantiate(dentry, &fhandle, &fattr); @@ -484,7 +484,7 @@ nfs_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, int status; dprintk("NFS call statfs\n"); - stat->fattr->valid = 0; + nfs_fattr_init(stat->fattr); status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0); dprintk("NFS reply statfs: %d\n", status); if (status) @@ -507,7 +507,7 @@ nfs_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, int status; dprintk("NFS call fsinfo\n"); - info->fattr->valid = 0; + nfs_fattr_init(info->fattr); status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0); dprintk("NFS reply fsinfo: %d\n", status); if (status) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 9758ebd49905..43b03b19731b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -215,6 +215,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, data->res.fattr = &data->fattr; data->res.count = count; data->res.eof = 0; + nfs_fattr_init(&data->fattr); NFS_PROTO(inode)->read_setup(data); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5130eda231d7..819a65f5071f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -870,6 +870,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req, data->res.fattr = &data->fattr; data->res.count = count; data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); NFS_PROTO(inode)->write_setup(data, how); @@ -1237,6 +1238,7 @@ static void nfs_commit_rpcsetup(struct list_head *head, data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); NFS_PROTO(inode)->commit_setup(data, how); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 7bac2785c6e4..8120fd68dee5 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -316,6 +316,11 @@ extern void nfs_file_clear_open_context(struct file *filp); /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */ extern u32 root_nfs_parse_addr(char *name); /*__init*/ +static inline void nfs_fattr_init(struct nfs_fattr *fattr) +{ + fattr->valid = 0; +} + /* * linux/fs/nfs/file.c */ From 913a70fc170530f7e1ff0693595155457cc6d0ca Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:38 -0400 Subject: [PATCH 084/126] NFS: Convert cache_change_attribute into a jiffy-based value Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 8 ++++---- include/linux/nfs_fs.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 65d5ab45ddc5..449df8c8aa31 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1135,7 +1135,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) * We may need to keep the attributes marked as invalid if * we raced with nfs_end_attr_update(). */ - if (verifier == nfsi->cache_change_attribute) + if (time_after_eq(verifier, nfsi->cache_change_attribute)) nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME); spin_unlock(&inode->i_lock); @@ -1202,7 +1202,7 @@ void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) if (S_ISDIR(inode->i_mode)) { memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); /* This ensures we revalidate child dentries */ - nfsi->cache_change_attribute++; + nfsi->cache_change_attribute = jiffies; } spin_unlock(&inode->i_lock); @@ -1242,7 +1242,7 @@ void nfs_end_data_update(struct inode *inode) nfsi->cache_validity |= NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); } - nfsi->cache_change_attribute ++; + nfsi->cache_change_attribute = jiffies; atomic_dec(&nfsi->data_updates); } @@ -1391,7 +1391,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign /* Do we perhaps have any outstanding writes? */ if (nfsi->npages == 0) { /* No, but did we race with nfs_end_data_update()? */ - if (verifier == nfsi->cache_change_attribute) { + if (time_after_eq(verifier, nfsi->cache_change_attribute)) { inode->i_size = new_isize; invalid |= NFS_INO_INVALID_DATA; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 8120fd68dee5..abf890f5fbfb 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -280,7 +280,7 @@ static inline long nfs_save_change_attribute(struct inode *inode) static inline int nfs_verify_change_attribute(struct inode *inode, unsigned long chattr) { return !nfs_caches_unstable(inode) - && chattr == NFS_I(inode)->cache_change_attribute; + && time_after_eq(chattr, NFS_I(inode)->cache_change_attribute); } /* From 33801147a8fda6b04d7e9afe1d42f1c01d3d6837 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:39 -0400 Subject: [PATCH 085/126] NFS: Optimise inode attribute cache updates Allow nfs_refresh_inode() also to update attributes on the inode if the RPC call was sent after the last call to nfs_update_inode(). Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 54 ++++++++++++++++++++++++++++++----------- fs/nfs/nfs2xdr.c | 1 - fs/nfs/nfs3xdr.c | 1 - fs/nfs/nfs4xdr.c | 4 +-- include/linux/nfs_fs.h | 2 ++ include/linux/nfs_xdr.h | 2 +- 6 files changed, 44 insertions(+), 20 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 449df8c8aa31..b7d4f8f13ac2 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -785,7 +785,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) else init_special_inode(inode, inode->i_mode, fattr->rdev); - nfsi->read_cache_jiffies = fattr->timestamp; + nfsi->read_cache_jiffies = fattr->time_start; + nfsi->last_updated = jiffies; inode->i_atime = fattr->atime; inode->i_mtime = fattr->mtime; inode->i_ctime = fattr->ctime; @@ -1120,14 +1121,15 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) goto out; } + spin_lock(&inode->i_lock); status = nfs_update_inode(inode, &fattr, verifier); if (status) { + spin_unlock(&inode->i_lock); dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), status); goto out; } - spin_lock(&inode->i_lock); cache_validity = nfsi->cache_validity; nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; @@ -1247,7 +1249,7 @@ void nfs_end_data_update(struct inode *inode) } /** - * nfs_refresh_inode - verify consistency of the inode attribute cache + * nfs_check_inode_attributes - verify consistency of the inode attribute cache * @inode - pointer to inode * @fattr - updated attributes * @@ -1255,13 +1257,12 @@ void nfs_end_data_update(struct inode *inode) * so that fattr carries weak cache consistency data, then it may * also update the ctime/mtime/change_attribute. */ -int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) +static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_size, new_isize; int data_unstable; - spin_lock(&inode->i_lock); /* Are we in the process of updating data on the server? */ data_unstable = nfs_caches_unstable(inode); @@ -1325,11 +1326,40 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) if (!timespec_equal(&inode->i_atime, &fattr->atime)) nfsi->cache_validity |= NFS_INO_INVALID_ATIME; - nfsi->read_cache_jiffies = fattr->timestamp; - spin_unlock(&inode->i_lock); + nfsi->read_cache_jiffies = fattr->time_start; return 0; } +/** + * nfs_refresh_inode - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * Check that an RPC call that returned attributes has not overlapped with + * other recent updates of the inode metadata, then decide whether it is + * safe to do a full update of the inode attributes, or whether just to + * call nfs_check_inode_attributes. + */ +int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int status; + + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + spin_lock(&inode->i_lock); + nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; + if (nfs_verify_change_attribute(inode, fattr->time_start)) + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME); + if (time_after(fattr->time_start, nfsi->last_updated)) + status = nfs_update_inode(inode, fattr, fattr->time_start); + else + status = nfs_check_inode_attributes(inode, fattr); + + spin_unlock(&inode->i_lock); + return status; +} + /* * Many nfs protocol calls return the new file attributes after * an operation. Here we update the inode to reflect the state @@ -1365,20 +1395,17 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign goto out_err; } - spin_lock(&inode->i_lock); - /* * Make sure the inode's type hasn't changed. */ - if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { - spin_unlock(&inode->i_lock); + if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) goto out_changed; - } /* * Update the read time so we don't revalidate too often. */ - nfsi->read_cache_jiffies = fattr->timestamp; + nfsi->read_cache_jiffies = fattr->time_start; + nfsi->last_updated = jiffies; /* Are we racing with known updates of the metadata on the server? */ data_unstable = ! (nfs_verify_change_attribute(inode, verifier) || @@ -1467,7 +1494,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign if (!nfs_have_delegation(inode, FMODE_READ)) nfsi->cache_validity |= invalid; - spin_unlock(&inode->i_lock); return 0; out_changed: /* diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index d91b69044a4d..59049e864ca7 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -143,7 +143,6 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr) fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; fattr->rdev = 0; } - fattr->timestamp = jiffies; return p; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index db4a904810a4..0498bd36602c 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -174,7 +174,6 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr) /* Update the mode bits */ fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3); - fattr->timestamp = jiffies; return p; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index cd762648fa9a..8b21de8a06fa 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2799,10 +2799,8 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons goto xdr_error; if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0) goto xdr_error; - if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) { + if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; - fattr->timestamp = jiffies; - } xdr_error: if (status != 0) printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index abf890f5fbfb..faeaad666ca8 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -141,6 +141,7 @@ struct nfs_inode { unsigned long attrtimeo_timestamp; __u64 change_attr; /* v4 only */ + unsigned long last_updated; /* "Generation counter" for the attribute cache. This is * bumped whenever we update the metadata on the * server. @@ -319,6 +320,7 @@ extern u32 root_nfs_parse_addr(char *name); /*__init*/ static inline void nfs_fattr_init(struct nfs_fattr *fattr) { fattr->valid = 0; + fattr->time_start = jiffies; } /* diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 60086dac11d5..aeaee7e7c51d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -41,7 +41,7 @@ struct nfs_fattr { __u32 bitmap[2]; /* NFSv4 returned attribute bitmap */ __u64 change_attr; /* NFSv4 change attribute */ __u64 pre_change_attr;/* pre-op NFSv4 change attribute */ - unsigned long timestamp; + unsigned long time_start; }; #define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */ From decf491f3076190262d4c649bed877650623903a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:39 -0400 Subject: [PATCH 086/126] NFS: Don't let nfs_end_data_update() clobber attribute update information Since we almost always call nfs_end_data_update() after we called nfs_refresh_inode(), we now end up marking the inode metadata as needing revalidation immediately after having updated it. This patch rearranges things so that we mark the inode as needing revalidation _before_ we call nfs_refresh_inode() on those operations that need it. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 38 ++++++++++++++++++++++++++++++++------ fs/nfs/nfs3proc.c | 30 +++++++++++++++--------------- fs/nfs/nfs4proc.c | 3 +++ fs/nfs/proc.c | 16 +++++++++++++--- include/linux/nfs_fs.h | 15 ++++++++++----- 5 files changed, 73 insertions(+), 29 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b7d4f8f13ac2..6b3156e15350 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1236,13 +1236,12 @@ void nfs_end_data_update(struct inode *inode) struct nfs_inode *nfsi = NFS_I(inode); if (!nfs_have_delegation(inode, FMODE_READ)) { - /* Mark the attribute cache for revalidation */ - spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; - /* Directories and symlinks: invalidate page cache too */ - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + /* Directories and symlinks: invalidate page cache */ + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + spin_lock(&inode->i_lock); nfsi->cache_validity |= NFS_INO_INVALID_DATA; - spin_unlock(&inode->i_lock); + spin_unlock(&inode->i_lock); + } } nfsi->cache_change_attribute = jiffies; atomic_dec(&nfsi->data_updates); @@ -1360,6 +1359,33 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) return status; } +/** + * nfs_post_op_update_inode - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * After an operation that has changed the inode metadata, mark the + * attribute cache as being invalid, then try to update it. + */ +int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int status = 0; + + spin_lock(&inode->i_lock); + if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) { + nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS; + goto out; + } + status = nfs_update_inode(inode, fattr, fattr->time_start); + if (time_after_eq(fattr->time_start, nfsi->cache_change_attribute)) + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE); + nfsi->cache_change_attribute = jiffies; +out: + spin_unlock(&inode->i_lock); + return status; +} + /* * Many nfs protocol calls return the new file attributes after * an operation. Here we update the inode to reflect the state diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 4b1b48b139f6..92c870d19ccd 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -266,7 +266,7 @@ static int nfs3_proc_write(struct nfs_write_data *wdata) nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, rpcflags); if (status >= 0) - nfs_refresh_inode(inode, fattr); + nfs_post_op_update_inode(inode, fattr); dprintk("NFS reply write: %d\n", status); return status < 0? status : wdata->res.count; } @@ -288,7 +288,7 @@ static int nfs3_proc_commit(struct nfs_write_data *cdata) nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status >= 0) - nfs_refresh_inode(inode, fattr); + nfs_post_op_update_inode(inode, fattr); dprintk("NFS reply commit: %d\n", status); return status; } @@ -332,7 +332,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_fattr_init(&dir_attr); nfs_fattr_init(&fattr); status = rpc_call(NFS_CLIENT(dir), NFS3PROC_CREATE, &arg, &res, 0); - nfs_refresh_inode(dir, &dir_attr); + nfs_post_op_update_inode(dir, &dir_attr); /* If the server doesn't support the exclusive creation semantics, * try again with simple 'guarded' mode. */ @@ -403,7 +403,7 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name) dprintk("NFS call remove %s\n", name->name); nfs_fattr_init(&dir_attr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_refresh_inode(dir, &dir_attr); + nfs_post_op_update_inode(dir, &dir_attr); dprintk("NFS reply remove: %d\n", status); return status; } @@ -439,7 +439,7 @@ nfs3_proc_unlink_done(struct dentry *dir, struct rpc_task *task) return 1; if (msg->rpc_argp) { dir_attr = (struct nfs_fattr*)msg->rpc_resp; - nfs_refresh_inode(dir->d_inode, dir_attr); + nfs_post_op_update_inode(dir->d_inode, dir_attr); kfree(msg->rpc_argp); } return 0; @@ -468,8 +468,8 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, nfs_fattr_init(&old_dir_attr); nfs_fattr_init(&new_dir_attr); status = rpc_call(NFS_CLIENT(old_dir), NFS3PROC_RENAME, &arg, &res, 0); - nfs_refresh_inode(old_dir, &old_dir_attr); - nfs_refresh_inode(new_dir, &new_dir_attr); + nfs_post_op_update_inode(old_dir, &old_dir_attr); + nfs_post_op_update_inode(new_dir, &new_dir_attr); dprintk("NFS reply rename: %d\n", status); return status; } @@ -494,8 +494,8 @@ nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) nfs_fattr_init(&dir_attr); nfs_fattr_init(&fattr); status = rpc_call(NFS_CLIENT(inode), NFS3PROC_LINK, &arg, &res, 0); - nfs_refresh_inode(dir, &dir_attr); - nfs_refresh_inode(inode, &fattr); + nfs_post_op_update_inode(dir, &dir_attr); + nfs_post_op_update_inode(inode, &fattr); dprintk("NFS reply link: %d\n", status); return status; } @@ -527,7 +527,7 @@ nfs3_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path, nfs_fattr_init(&dir_attr); nfs_fattr_init(fattr); status = rpc_call(NFS_CLIENT(dir), NFS3PROC_SYMLINK, &arg, &res, 0); - nfs_refresh_inode(dir, &dir_attr); + nfs_post_op_update_inode(dir, &dir_attr); dprintk("NFS reply symlink: %d\n", status); return status; } @@ -558,7 +558,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) nfs_fattr_init(&dir_attr); nfs_fattr_init(&fattr); status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKDIR, &arg, &res, 0); - nfs_refresh_inode(dir, &dir_attr); + nfs_post_op_update_inode(dir, &dir_attr); if (status != 0) goto out; status = nfs_instantiate(dentry, &fhandle, &fattr); @@ -584,7 +584,7 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name) dprintk("NFS call rmdir %s\n", name->name); nfs_fattr_init(&dir_attr); status = rpc_call(NFS_CLIENT(dir), NFS3PROC_RMDIR, &arg, &dir_attr, 0); - nfs_refresh_inode(dir, &dir_attr); + nfs_post_op_update_inode(dir, &dir_attr); dprintk("NFS reply rmdir: %d\n", status); return status; } @@ -679,7 +679,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_fattr_init(&dir_attr); nfs_fattr_init(&fattr); status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKNOD, &arg, &res, 0); - nfs_refresh_inode(dir, &dir_attr); + nfs_post_op_update_inode(dir, &dir_attr); if (status != 0) goto out; status = nfs_instantiate(dentry, &fh, &fattr); @@ -775,7 +775,7 @@ nfs3_write_done(struct rpc_task *task) return; data = (struct nfs_write_data *)task->tk_calldata; if (task->tk_status >= 0) - nfs_refresh_inode(data->inode, data->res.fattr); + nfs_post_op_update_inode(data->inode, data->res.fattr); nfs_writeback_done(task); } @@ -819,7 +819,7 @@ nfs3_commit_done(struct rpc_task *task) return; data = (struct nfs_write_data *)task->tk_calldata; if (task->tk_status >= 0) - nfs_refresh_inode(data->inode, data->res.fattr); + nfs_post_op_update_inode(data->inode, data->res.fattr); nfs_commit_done(task); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2a759e8e387c..3274f2d354f3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -187,8 +187,11 @@ static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinf { struct nfs_inode *nfsi = NFS_I(inode); + spin_lock(&inode->i_lock); + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (cinfo->before == nfsi->change_attr && cinfo->atomic) nfsi->change_attr = cinfo->after; + spin_unlock(&inode->i_lock); } /* Helper for asynchronous RPC calls */ diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 5ef28f08f424..a48a003242c0 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -206,7 +206,7 @@ static int nfs_proc_write(struct nfs_write_data *wdata) nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags); if (status >= 0) { - nfs_refresh_inode(inode, fattr); + nfs_post_op_update_inode(inode, fattr); wdata->res.count = wdata->args.count; wdata->verf.committed = NFS_FILE_SYNC; } @@ -275,6 +275,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_fattr_init(&fattr); status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0); + nfs_mark_for_revalidate(dir); if (status == -EINVAL && S_ISFIFO(mode)) { sattr->ia_mode = mode; @@ -305,6 +306,7 @@ nfs_proc_remove(struct inode *dir, struct qstr *name) dprintk("NFS call remove %s\n", name->name); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); dprintk("NFS reply remove: %d\n", status); return status; @@ -331,8 +333,10 @@ nfs_proc_unlink_done(struct dentry *dir, struct rpc_task *task) { struct rpc_message *msg = &task->tk_msg; - if (msg->rpc_argp) + if (msg->rpc_argp) { + nfs_mark_for_revalidate(dir->d_inode); kfree(msg->rpc_argp); + } return 0; } @@ -352,6 +356,8 @@ nfs_proc_rename(struct inode *old_dir, struct qstr *old_name, dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); status = rpc_call(NFS_CLIENT(old_dir), NFSPROC_RENAME, &arg, NULL, 0); + nfs_mark_for_revalidate(old_dir); + nfs_mark_for_revalidate(new_dir); dprintk("NFS reply rename: %d\n", status); return status; } @@ -369,6 +375,7 @@ nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) dprintk("NFS call link %s\n", name->name); status = rpc_call(NFS_CLIENT(inode), NFSPROC_LINK, &arg, NULL, 0); + nfs_mark_for_revalidate(dir); dprintk("NFS reply link: %d\n", status); return status; } @@ -394,6 +401,7 @@ nfs_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path, nfs_fattr_init(fattr); fhandle->size = 0; status = rpc_call(NFS_CLIENT(dir), NFSPROC_SYMLINK, &arg, NULL, 0); + nfs_mark_for_revalidate(dir); dprintk("NFS reply symlink: %d\n", status); return status; } @@ -418,6 +426,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) dprintk("NFS call mkdir %s\n", dentry->d_name.name); nfs_fattr_init(&fattr); status = rpc_call(NFS_CLIENT(dir), NFSPROC_MKDIR, &arg, &res, 0); + nfs_mark_for_revalidate(dir); if (status == 0) status = nfs_instantiate(dentry, &fhandle, &fattr); dprintk("NFS reply mkdir: %d\n", status); @@ -436,6 +445,7 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name) dprintk("NFS call rmdir %s\n", name->name); status = rpc_call(NFS_CLIENT(dir), NFSPROC_RMDIR, &arg, NULL, 0); + nfs_mark_for_revalidate(dir); dprintk("NFS reply rmdir: %d\n", status); return status; } @@ -579,7 +589,7 @@ nfs_write_done(struct rpc_task *task) struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; if (task->tk_status >= 0) - nfs_refresh_inode(data->inode, data->res.fattr); + nfs_post_op_update_inode(data->inode, data->res.fattr); nfs_writeback_done(task); } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index faeaad666ca8..325fe7ae49bb 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -241,13 +241,17 @@ static inline int nfs_caches_unstable(struct inode *inode) return atomic_read(&NFS_I(inode)->data_updates) != 0; } +static inline void nfs_mark_for_revalidate(struct inode *inode) +{ + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS; + spin_unlock(&inode->i_lock); +} + static inline void NFS_CACHEINV(struct inode *inode) { - if (!nfs_caches_unstable(inode)) { - spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS; - spin_unlock(&inode->i_lock); - } + if (!nfs_caches_unstable(inode)) + nfs_mark_for_revalidate(inode); } static inline int nfs_server_capable(struct inode *inode, int cap) @@ -291,6 +295,7 @@ extern void nfs_zap_caches(struct inode *); extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *, struct nfs_fattr *); extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); +extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int nfs_permission(struct inode *, int, struct nameidata *); extern int nfs_access_get_cached(struct inode *, struct rpc_cred *, struct nfs_access_entry *); From 0c70b50150cfb0b43ff500a8a394a52b4d5f1350 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Oct 2005 11:48:36 -0400 Subject: [PATCH 087/126] NFS: nfs_lookup doesn't need to revalidate the parent directory's inode nfs_lookup() used to consult a lookup cache before trying an actual wire lookup operation. The lookup cache would be invalid, of course, if the parent directory's mtime had changed, so nfs_lookup performed an inode revalidation on the parent. Since nfs_lookup() doesn't use a cache anymore, the revalidation is no longer necessary. There are cases where it will generate a lot of unnecessary GETATTR traffic. See http://bugzilla.linux-nfs.org/show_bug.cgi?id=9 Test-plan: Use lndir and "rm -rf" and watch for excess GETATTR traffic or application level errors. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b8a73045e9a0..ce8f77dadff9 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -853,12 +853,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru dentry->d_op = NFS_PROTO(dir)->dentry_ops; lock_kernel(); - /* Revalidate parent directory attribute cache */ - error = nfs_revalidate_inode(NFS_SERVER(dir), dir); - if (error < 0) { - res = ERR_PTR(error); - goto out_unlock; - } /* If we're doing an exclusive create, optimize away the lookup */ if (nfs_is_exclusive_create(dir, nd)) From 56ae19f38f10aad4f27f7e12138a29b295dff07a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:40 -0400 Subject: [PATCH 088/126] NFSv4: Add directory post-op attributes to the CREATE operations. Since the directory attributes change every time we CREATE a file, we might as well pick up the new directory attributes in the same compound. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 28 ++++++++++++++--- fs/nfs/nfs4xdr.c | 69 +++++++++++++++++++++++++++++++++++------ include/linux/nfs_xdr.h | 2 ++ 3 files changed, 84 insertions(+), 15 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3274f2d354f3..f363fd6c7f4d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -443,7 +443,11 @@ static int _nfs4_proc_open(struct inode *dir, struct nfs4_state_owner *sp, stru nfs_increment_open_seqid(status, o_arg->seqid); if (status != 0) goto out; - update_changeattr(dir, &o_res->cinfo); + if (o_arg->open_flags & O_CREAT) { + update_changeattr(dir, &o_res->cinfo); + nfs_post_op_update_inode(dir, o_res->dir_attr); + } else + nfs_refresh_inode(dir, o_res->dir_attr); if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { status = _nfs4_proc_open_confirm(server->client, &o_res->fh, sp, &o_res->stateid, o_arg->seqid); @@ -497,7 +501,7 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st struct inode *inode = state->inode; struct nfs_server *server = NFS_SERVER(dir); struct nfs_delegation *delegation = NFS_I(inode)->delegation; - struct nfs_fattr f_attr; + struct nfs_fattr f_attr, dir_attr; struct nfs_openargs o_arg = { .fh = NFS_FH(dir), .open_flags = state->state, @@ -507,6 +511,7 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st }; struct nfs_openres o_res = { .f_attr = &f_attr, + .dir_attr = &dir_attr, .server = server, }; int status = 0; @@ -524,6 +529,7 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st if (o_arg.seqid == NULL) goto out; nfs_fattr_init(&f_attr); + nfs_fattr_init(&dir_attr); status = _nfs4_proc_open(dir, sp, &o_arg, &o_res); if (status != 0) goto out_nodeleg; @@ -694,7 +700,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st struct nfs4_client *clp = server->nfs4_state; struct inode *inode = NULL; int status; - struct nfs_fattr f_attr; + struct nfs_fattr f_attr, dir_attr; struct nfs_openargs o_arg = { .fh = NFS_FH(dir), .open_flags = flags, @@ -705,6 +711,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st }; struct nfs_openres o_res = { .f_attr = &f_attr, + .dir_attr = &dir_attr, .server = server, }; @@ -727,6 +734,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st if (o_arg.seqid == NULL) return -ENOMEM; nfs_fattr_init(&f_attr); + nfs_fattr_init(&dir_attr); status = _nfs4_proc_open(dir, sp, &o_arg, &o_res); if (status != 0) goto out_err; @@ -1746,6 +1754,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name, struct nfs_fattr *fattr) { struct nfs_server *server = NFS_SERVER(dir); + struct nfs_fattr dir_fattr; struct nfs4_create_arg arg = { .dir_fh = NFS_FH(dir), .server = server, @@ -1758,6 +1767,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name, .server = server, .fh = fhandle, .fattr = fattr, + .dir_fattr = &dir_fattr, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK], @@ -1770,10 +1780,12 @@ static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name, return -ENAMETOOLONG; arg.u.symlink = path; nfs_fattr_init(fattr); + nfs_fattr_init(&dir_fattr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); if (!status) update_changeattr(dir, &res.dir_cinfo); + nfs_post_op_update_inode(dir, res.dir_fattr); return status; } @@ -1797,7 +1809,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, { struct nfs_server *server = NFS_SERVER(dir); struct nfs_fh fhandle; - struct nfs_fattr fattr; + struct nfs_fattr fattr, dir_fattr; struct nfs4_create_arg arg = { .dir_fh = NFS_FH(dir), .server = server, @@ -1810,6 +1822,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, .server = server, .fh = &fhandle, .fattr = &fattr, + .dir_fattr = &dir_fattr, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE], @@ -1819,10 +1832,12 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, int status; nfs_fattr_init(&fattr); + nfs_fattr_init(&dir_fattr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); if (!status) { update_changeattr(dir, &res.dir_cinfo); + nfs_post_op_update_inode(dir, res.dir_fattr); status = nfs_instantiate(dentry, &fhandle, &fattr); } return status; @@ -1895,7 +1910,7 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, { struct nfs_server *server = NFS_SERVER(dir); struct nfs_fh fh; - struct nfs_fattr fattr; + struct nfs_fattr fattr, dir_fattr; struct nfs4_create_arg arg = { .dir_fh = NFS_FH(dir), .server = server, @@ -1907,6 +1922,7 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, .server = server, .fh = &fh, .fattr = &fattr, + .dir_fattr = &dir_fattr, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE], @@ -1917,6 +1933,7 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, int mode = sattr->ia_mode; nfs_fattr_init(&fattr); + nfs_fattr_init(&dir_fattr); BUG_ON(!(sattr->ia_valid & ATTR_MODE)); BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); @@ -1938,6 +1955,7 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); if (status == 0) { update_changeattr(dir, &res.dir_cinfo); + nfs_post_op_update_inode(dir, res.dir_fattr); status = nfs_instantiate(dentry, &fh, &fattr); } return status; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 8b21de8a06fa..7f91d613d31a 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -95,6 +95,8 @@ static int nfs_stat_to_errno(int); #define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) #define encode_savefh_maxsz (op_encode_hdr_maxsz) #define decode_savefh_maxsz (op_decode_hdr_maxsz) +#define encode_restorefh_maxsz (op_encode_hdr_maxsz) +#define decode_restorefh_maxsz (op_decode_hdr_maxsz) #define encode_fsinfo_maxsz (op_encode_hdr_maxsz + 2) #define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) @@ -336,14 +338,20 @@ static int nfs_stat_to_errno(int); decode_getfh_maxsz) #define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ + encode_savefh_maxsz + \ encode_create_maxsz + \ + encode_getfh_maxsz + \ encode_getattr_maxsz + \ - encode_getfh_maxsz) + encode_restorefh_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_create_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ + decode_savefh_maxsz + \ decode_create_maxsz + \ + decode_getfh_maxsz + \ decode_getattr_maxsz + \ - decode_getfh_maxsz) + decode_restorefh_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_pathconf_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_getattr_maxsz) @@ -1112,6 +1120,17 @@ static int encode_renew(struct xdr_stream *xdr, const struct nfs4_client *client return 0; } +static int +encode_restorefh(struct xdr_stream *xdr) +{ + uint32_t *p; + + RESERVE_SPACE(4); + WRITE32(OP_RESTOREFH); + + return 0; +} + static int encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg) { @@ -1358,7 +1377,7 @@ static int nfs4_xdr_enc_create(struct rpc_rqst *req, uint32_t *p, const struct n { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 4, + .nops = 7, }; int status; @@ -1366,10 +1385,16 @@ static int nfs4_xdr_enc_create(struct rpc_rqst *req, uint32_t *p, const struct n encode_compound_hdr(&xdr, &hdr); if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) goto out; + if ((status = encode_savefh(&xdr)) != 0) + goto out; if ((status = encode_create(&xdr, args)) != 0) goto out; if ((status = encode_getfh(&xdr)) != 0) goto out; + if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) + goto out; + if ((status = encode_restorefh(&xdr)) != 0) + goto out; status = encode_getfattr(&xdr, args->bitmask); out: return status; @@ -1429,7 +1454,7 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, uint32_t *p, struct nfs_opena { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 4, + .nops = 7, }; int status; @@ -1439,6 +1464,9 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, uint32_t *p, struct nfs_opena xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); + if (status) + goto out; + status = encode_savefh(&xdr); if (status) goto out; status = encode_open(&xdr, args); @@ -1448,6 +1476,12 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, uint32_t *p, struct nfs_opena if (status) goto out; status = encode_getfattr(&xdr, args->bitmask); + if (status) + goto out; + status = encode_restorefh(&xdr); + if (status) + goto out; + status = encode_getfattr(&xdr, args->bitmask); out: return status; } @@ -3218,6 +3252,12 @@ static int decode_renew(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_RENEW); } +static int +decode_restorefh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RESTOREFH); +} + static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, size_t *acl_len) { @@ -3510,13 +3550,17 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_ goto out; if ((status = decode_putfh(&xdr)) != 0) goto out; + if ((status = decode_savefh(&xdr)) != 0) + goto out; if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0) goto out; if ((status = decode_getfh(&xdr, res->fh)) != 0) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server); - if (status == NFS4ERR_DELAY) - status = 0; + if (decode_getfattr(&xdr, res->fattr, res->server) != 0) + goto out; + if ((status = decode_restorefh(&xdr)) != 0) + goto out; + decode_getfattr(&xdr, res->dir_fattr, res->server); out: return status; } @@ -3654,15 +3698,20 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_ope status = decode_putfh(&xdr); if (status) goto out; + status = decode_savefh(&xdr); + if (status) + goto out; status = decode_open(&xdr, res); if (status) goto out; status = decode_getfh(&xdr, &res->fh); if (status) goto out; - status = decode_getfattr(&xdr, res->f_attr, res->server); - if (status == NFS4ERR_DELAY) - status = 0; + if (decode_getfattr(&xdr, res->f_attr, res->server) != 0) + goto out; + if ((status = decode_restorefh(&xdr)) != 0) + goto out; + decode_getfattr(&xdr, res->dir_attr, res->server); out: return status; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index aeaee7e7c51d..6485b8b41b83 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -124,6 +124,7 @@ struct nfs_openres { struct nfs4_change_info cinfo; __u32 rflags; struct nfs_fattr * f_attr; + struct nfs_fattr * dir_attr; const struct nfs_server *server; int delegation_type; nfs4_stateid delegation; @@ -540,6 +541,7 @@ struct nfs4_create_res { struct nfs_fh * fh; struct nfs_fattr * fattr; struct nfs4_change_info dir_cinfo; + struct nfs_fattr * dir_fattr; }; struct nfs4_fsinfo_arg { From 3338c143b4fde2d256016b63043ec8e2c93eba19 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:41 -0400 Subject: [PATCH 089/126] NFS: Optimise attribute revalidation on close(). Only force a getattr in nfs_file_flush() if the attribute cache is stale. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 572d8593486f..57d3e77d97ee 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -205,8 +205,8 @@ nfs_file_flush(struct file *file) if (!status) { status = ctx->error; ctx->error = 0; - if (!status && !nfs_have_delegation(inode, FMODE_READ)) - __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (!status) + nfs_revalidate_inode(NFS_SERVER(inode), inode); } unlock_kernel(); return status; From 516a6af641bb50c608329a5bd751acd0d65cc4ab Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:41 -0400 Subject: [PATCH 090/126] NFS: Add optional post-op getattr instruction to the NFSv4 file close. "Optional" means that the close call will not fail if the getattr at the end of the compound fails. If it does succeed, try to refresh inode attributes. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 9 ++++++++- fs/nfs/nfs4xdr.c | 34 ++++++++++++++++++++++++++++------ include/linux/nfs_xdr.h | 3 +++ 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f363fd6c7f4d..7be3d2d15d6f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -865,6 +865,7 @@ struct nfs4_closedata { struct nfs4_state *state; struct nfs_closeargs arg; struct nfs_closeres res; + struct nfs_fattr fattr; }; static void nfs4_free_closedata(struct nfs4_closedata *calldata) @@ -904,6 +905,7 @@ static void nfs4_close_done(struct rpc_task *task) return; } } + nfs_refresh_inode(calldata->inode, calldata->res.fattr); state->state = calldata->arg.open_flags; nfs4_free_closedata(calldata); } @@ -941,6 +943,7 @@ static void nfs4_close_begin(struct rpc_task *task) rpc_exit(task, 0); return; } + nfs_fattr_init(calldata->res.fattr); if (mode != 0) msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; calldata->arg.open_flags = mode; @@ -960,6 +963,7 @@ static void nfs4_close_begin(struct rpc_task *task) */ int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode) { + struct nfs_server *server = NFS_SERVER(inode); struct nfs4_closedata *calldata; int status = -ENOMEM; @@ -974,8 +978,11 @@ int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode) calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); if (calldata->arg.seqid == NULL) goto out_free_calldata; + calldata->arg.bitmask = server->attr_bitmask; + calldata->res.fattr = &calldata->fattr; + calldata->res.server = server; - status = nfs4_call_async(NFS_SERVER(inode)->client, nfs4_close_begin, + status = nfs4_call_async(server->client, nfs4_close_begin, nfs4_close_done, calldata); if (status == 0) goto out; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7f91d613d31a..cd9e26cfa868 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -198,17 +198,21 @@ static int nfs_stat_to_errno(int); #define NFS4_enc_open_downgrade_sz \ (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - op_encode_hdr_maxsz + 7) + op_encode_hdr_maxsz + 7 + \ + encode_getattr_maxsz) #define NFS4_dec_open_downgrade_sz \ (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - op_decode_hdr_maxsz + 4) + op_decode_hdr_maxsz + 4 + \ + decode_getattr_maxsz) #define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - op_encode_hdr_maxsz + 5) + op_encode_hdr_maxsz + 5 + \ + encode_getattr_maxsz) #define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - op_decode_hdr_maxsz + 4) + op_decode_hdr_maxsz + 4 + \ + decode_getattr_maxsz) #define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ op_encode_hdr_maxsz + 4 + \ @@ -1433,7 +1437,7 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, uint32_t *p, struct nfs_clos { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 3, }; int status; @@ -1443,6 +1447,9 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, uint32_t *p, struct nfs_clos if(status) goto out; status = encode_close(&xdr, args); + if (status != 0) + goto out; + status = encode_getfattr(&xdr, args->bitmask); out: return status; } @@ -1541,7 +1548,7 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, uint32_t *p, struct { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 3, }; int status; @@ -1551,6 +1558,9 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, uint32_t *p, struct if (status) goto out; status = encode_open_downgrade(&xdr, args); + if (status != 0) + goto out; + status = encode_getfattr(&xdr, args->bitmask); out: return status; } @@ -3403,6 +3413,9 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, uint32_t *p, stru if (status) goto out; status = decode_open_downgrade(&xdr, res); + if (status != 0) + goto out; + decode_getfattr(&xdr, res->fattr, res->server); out: return status; } @@ -3678,6 +3691,15 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_cl if (status) goto out; status = decode_close(&xdr, res); + if (status != 0) + goto out; + /* + * Note: Server may do delete on close for this file + * in which case the getattr call will fail with + * an ESTALE error. Shouldn't be a problem, + * though, since fattr->valid will remain unset. + */ + decode_getfattr(&xdr, res->fattr, res->server); out: return status; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 6485b8b41b83..4f03dc21cf4a 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -153,10 +153,13 @@ struct nfs_closeargs { nfs4_stateid * stateid; struct nfs_seqid * seqid; int open_flags; + const u32 * bitmask; }; struct nfs_closeres { nfs4_stateid stateid; + struct nfs_fattr * fattr; + const struct nfs_server *server; }; /* * * Arguments to the lock,lockt, and locku call. From cf809556149f076b7a020c10e066b2b96e79b6a1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:42 -0400 Subject: [PATCH 091/126] NFS: Ensure that nfs_link() instantiates the dentry correctly Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ce8f77dadff9..8272ed3fc707 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1432,17 +1432,14 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) old_dentry->d_parent->d_name.name, old_dentry->d_name.name, dentry->d_parent->d_name.name, dentry->d_name.name); - /* - * Drop the dentry in advance to force a new lookup. - * Since nfs_proc_link doesn't return a file handle, - * we can't use the existing dentry. - */ lock_kernel(); - d_drop(dentry); - nfs_begin_data_update(dir); nfs_begin_data_update(inode); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); + if (error == 0) { + atomic_inc(&inode->i_count); + d_instantiate(dentry, inode); + } nfs_end_data_update(inode); nfs_end_data_update(dir); unlock_kernel(); From 91ba2eeec5e8e86e054937eb3bf5aec5b22b1830 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:42 -0400 Subject: [PATCH 092/126] NFSv4: Add post-op attributes to nfs4_proc_link() Optimise attribute revalidation when hardlinking. Add post-op attributes for the directory and the original inode. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 22 +++++++++++++++++----- fs/nfs/nfs4xdr.c | 34 ++++++++++++++++++++++++++++------ include/linux/nfs_xdr.h | 9 +++++++++ 3 files changed, 54 insertions(+), 11 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7be3d2d15d6f..04995e39e867 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1724,22 +1724,34 @@ static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) { + struct nfs_server *server = NFS_SERVER(inode); struct nfs4_link_arg arg = { .fh = NFS_FH(inode), .dir_fh = NFS_FH(dir), .name = name, + .bitmask = server->attr_bitmask, + }; + struct nfs_fattr fattr, dir_attr; + struct nfs4_link_res res = { + .server = server, + .fattr = &fattr, + .dir_attr = &dir_attr, }; - struct nfs4_change_info cinfo = { }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], .rpc_argp = &arg, - .rpc_resp = &cinfo, + .rpc_resp = &res, }; int status; - status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); - if (!status) - update_changeattr(dir, &cinfo); + nfs_fattr_init(res.fattr); + nfs_fattr_init(res.dir_attr); + status = rpc_call_sync(server->client, &msg, 0); + if (!status) { + update_changeattr(dir, &res.cinfo); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_refresh_inode(inode, res.fattr); + } return status; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index cd9e26cfa868..f624b693ce21 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -324,12 +324,18 @@ static int nfs_stat_to_errno(int); encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ - encode_link_maxsz) + encode_link_maxsz + \ + decode_getattr_maxsz + \ + encode_restorefh_maxsz + \ + decode_getattr_maxsz) #define NFS4_dec_link_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ - decode_link_maxsz) + decode_link_maxsz + \ + decode_getattr_maxsz + \ + decode_restorefh_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_symlink_maxsz + \ @@ -1357,7 +1363,7 @@ static int nfs4_xdr_enc_link(struct rpc_rqst *req, uint32_t *p, const struct nfs { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 4, + .nops = 7, }; int status; @@ -1369,7 +1375,13 @@ static int nfs4_xdr_enc_link(struct rpc_rqst *req, uint32_t *p, const struct nfs goto out; if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) goto out; - status = encode_link(&xdr, args->name); + if ((status = encode_link(&xdr, args->name)) != 0) + goto out; + if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) + goto out; + if ((status = encode_restorefh(&xdr)) != 0) + goto out; + status = encode_getfattr(&xdr, args->bitmask); out: return status; } @@ -3529,7 +3541,7 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_ /* * Decode LINK response */ -static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_change_info *cinfo) +static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_link_res *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -3544,7 +3556,17 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_ch goto out; if ((status = decode_putfh(&xdr)) != 0) goto out; - status = decode_link(&xdr, cinfo); + if ((status = decode_link(&xdr, &res->cinfo)) != 0) + goto out; + /* + * Note order: OP_LINK leaves the directory as the current + * filehandle. + */ + if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0) + goto out; + if ((status = decode_restorefh(&xdr)) != 0) + goto out; + decode_getfattr(&xdr, res->fattr, res->server); out: return status; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 4f03dc21cf4a..89238b799cfd 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -566,8 +566,17 @@ struct nfs4_link_arg { const struct nfs_fh * fh; const struct nfs_fh * dir_fh; const struct qstr * name; + const u32 * bitmask; }; +struct nfs4_link_res { + const struct nfs_server * server; + struct nfs_fattr * fattr; + struct nfs4_change_info cinfo; + struct nfs_fattr * dir_attr; +}; + + struct nfs4_lookup_arg { const struct nfs_fh * dir_fh; const struct qstr * name; From 6caf2c8276d371679a798058e8fdf49f5ff831a3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:43 -0400 Subject: [PATCH 093/126] NFSv4: Add post-op attributes to nfs4_proc_rename() Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 15 +++++++++++++-- fs/nfs/nfs4xdr.c | 29 ++++++++++++++++++++++++----- include/linux/nfs_xdr.h | 4 ++++ 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 04995e39e867..f96bc12c0fa0 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1685,13 +1685,20 @@ static int nfs4_proc_unlink_done(struct dentry *dir, struct rpc_task *task) static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, struct inode *new_dir, struct qstr *new_name) { + struct nfs_server *server = NFS_SERVER(old_dir); struct nfs4_rename_arg arg = { .old_dir = NFS_FH(old_dir), .new_dir = NFS_FH(new_dir), .old_name = old_name, .new_name = new_name, + .bitmask = server->attr_bitmask, + }; + struct nfs_fattr old_fattr, new_fattr; + struct nfs4_rename_res res = { + .server = server, + .old_fattr = &old_fattr, + .new_fattr = &new_fattr, }; - struct nfs4_rename_res res = { }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], .rpc_argp = &arg, @@ -1699,11 +1706,15 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, }; int status; - status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); + nfs_fattr_init(res.old_fattr); + nfs_fattr_init(res.new_fattr); + status = rpc_call_sync(server->client, &msg, 0); if (!status) { update_changeattr(old_dir, &res.old_cinfo); + nfs_post_op_update_inode(old_dir, res.old_fattr); update_changeattr(new_dir, &res.new_cinfo); + nfs_post_op_update_inode(new_dir, res.new_fattr); } return status; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index f624b693ce21..2a07755bd347 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -314,12 +314,18 @@ static int nfs_stat_to_errno(int); encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ - encode_rename_maxsz) + encode_rename_maxsz + \ + encode_getattr_maxsz + \ + encode_restorefh_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_rename_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ - decode_rename_maxsz) + decode_rename_maxsz + \ + decode_getattr_maxsz + \ + decode_restorefh_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_link_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_savefh_maxsz + \ @@ -1339,7 +1345,7 @@ static int nfs4_xdr_enc_rename(struct rpc_rqst *req, uint32_t *p, const struct n { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 4, + .nops = 7, }; int status; @@ -1351,7 +1357,13 @@ static int nfs4_xdr_enc_rename(struct rpc_rqst *req, uint32_t *p, const struct n goto out; if ((status = encode_putfh(&xdr, args->new_dir)) != 0) goto out; - status = encode_rename(&xdr, args->old_name, args->new_name); + if ((status = encode_rename(&xdr, args->old_name, args->new_name)) != 0) + goto out; + if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) + goto out; + if ((status = encode_restorefh(&xdr)) != 0) + goto out; + status = encode_getfattr(&xdr, args->bitmask); out: return status; } @@ -3533,7 +3545,14 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_ goto out; if ((status = decode_putfh(&xdr)) != 0) goto out; - status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo); + if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0) + goto out; + /* Current FH is target directory */ + if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0) + goto out; + if ((status = decode_restorefh(&xdr)) != 0) + goto out; + decode_getfattr(&xdr, res->old_fattr, res->server); out: return status; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 89238b799cfd..6f0804280824 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -630,11 +630,15 @@ struct nfs4_rename_arg { const struct nfs_fh * new_dir; const struct qstr * old_name; const struct qstr * new_name; + const u32 * bitmask; }; struct nfs4_rename_res { + const struct nfs_server * server; struct nfs4_change_info old_cinfo; + struct nfs_fattr * old_fattr; struct nfs4_change_info new_cinfo; + struct nfs_fattr * new_fattr; }; struct nfs4_setclientid { From 16e429596dec4d28e16812b3a9be27f18412c567 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:44 -0400 Subject: [PATCH 094/126] NFSv4: Add post-op attributes to nfs4_proc_remove() Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 27 +++++++++++++++++++++------ fs/nfs/nfs4xdr.c | 25 +++++++++++++++++-------- include/linux/nfs_xdr.h | 7 +++++++ 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f96bc12c0fa0..bab47c4cb41c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1614,11 +1614,17 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) { + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_remove_arg args = { .fh = NFS_FH(dir), .name = name, + .bitmask = server->attr_bitmask, + }; + struct nfs_fattr dir_attr; + struct nfs4_remove_res res = { + .server = server, + .dir_attr = &dir_attr, }; - struct nfs4_change_info res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE], .rpc_argp = &args, @@ -1626,9 +1632,12 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) }; int status; - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - if (status == 0) - update_changeattr(dir, &res); + nfs_fattr_init(res.dir_attr); + status = rpc_call_sync(server->client, &msg, 0); + if (status == 0) { + update_changeattr(dir, &res.cinfo); + nfs_post_op_update_inode(dir, res.dir_attr); + } return status; } @@ -1646,12 +1655,14 @@ static int nfs4_proc_remove(struct inode *dir, struct qstr *name) struct unlink_desc { struct nfs4_remove_arg args; - struct nfs4_change_info res; + struct nfs4_remove_res res; + struct nfs_fattr dir_attr; }; static int nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr *name) { + struct nfs_server *server = NFS_SERVER(dir->d_inode); struct unlink_desc *up; up = (struct unlink_desc *) kmalloc(sizeof(*up), GFP_KERNEL); @@ -1660,6 +1671,9 @@ static int nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, up->args.fh = NFS_FH(dir->d_inode); up->args.name = name; + up->args.bitmask = server->attr_bitmask; + up->res.server = server; + up->res.dir_attr = &up->dir_attr; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; msg->rpc_argp = &up->args; @@ -1674,7 +1688,8 @@ static int nfs4_proc_unlink_done(struct dentry *dir, struct rpc_task *task) if (msg->rpc_resp != NULL) { up = container_of(msg->rpc_resp, struct unlink_desc, res); - update_changeattr(dir->d_inode, &up->res); + update_changeattr(dir->d_inode, &up->res.cinfo); + nfs_post_op_update_inode(dir->d_inode, up->res.dir_attr); kfree(up); msg->rpc_resp = NULL; msg->rpc_argp = NULL; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 2a07755bd347..3ee3a1669d28 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -306,10 +306,12 @@ static int nfs_stat_to_errno(int); decode_getfh_maxsz) #define NFS4_enc_remove_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - encode_remove_maxsz) + encode_remove_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_remove_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - op_decode_hdr_maxsz + 5) + op_decode_hdr_maxsz + 5 + \ + decode_getattr_maxsz) #define NFS4_enc_rename_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_savefh_maxsz + \ @@ -1327,14 +1329,18 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, uint32_t *p, const struct n { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 3, }; int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->fh)) == 0) - status = encode_remove(&xdr, args->name); + if ((status = encode_putfh(&xdr, args->fh)) != 0) + goto out; + if ((status = encode_remove(&xdr, args->name)) != 0) + goto out; + status = encode_getfattr(&xdr, args->bitmask); +out: return status; } @@ -3512,7 +3518,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, uint32_t *p, struct /* * Decode REMOVE response */ -static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_change_info *cinfo) +static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_remove_res *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -3521,8 +3527,11 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) goto out; - if ((status = decode_putfh(&xdr)) == 0) - status = decode_remove(&xdr, cinfo); + if ((status = decode_putfh(&xdr)) != 0) + goto out; + if ((status = decode_remove(&xdr, &res->cinfo)) != 0) + goto out; + decode_getfattr(&xdr, res->dir_attr, res->server); out: return status; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 6f0804280824..deeba7e2c518 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -623,6 +623,13 @@ struct nfs4_readlink { struct nfs4_remove_arg { const struct nfs_fh * fh; const struct qstr * name; + const u32 * bitmask; +}; + +struct nfs4_remove_res { + const struct nfs_server * server; + struct nfs4_change_info cinfo; + struct nfs_fattr * dir_attr; }; struct nfs4_rename_arg { From 4f9838c7ecd14f31f701f64fa65ded132fc0db8a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:44 -0400 Subject: [PATCH 095/126] NFSv4: Add post-op attributes to NFSv4 write and commit callbacks. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 13 ++++++++++++- fs/nfs/nfs4xdr.c | 28 ++++++++++++++++++++++------ include/linux/nfs_xdr.h | 2 ++ 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index bab47c4cb41c..933e13b383f8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2169,8 +2169,10 @@ nfs4_write_done(struct rpc_task *task) rpc_restart_call(task); return; } - if (task->tk_status >= 0) + if (task->tk_status >= 0) { renew_lease(NFS_SERVER(inode), data->timestamp); + nfs_post_op_update_inode(inode, data->res.fattr); + } /* Call back common NFS writeback processing */ nfs_writeback_done(task); } @@ -2186,6 +2188,7 @@ nfs4_proc_write_setup(struct nfs_write_data *data, int how) .rpc_cred = data->cred, }; struct inode *inode = data->inode; + struct nfs_server *server = NFS_SERVER(inode); int stable; int flags; @@ -2197,6 +2200,8 @@ nfs4_proc_write_setup(struct nfs_write_data *data, int how) } else stable = NFS_UNSTABLE; data->args.stable = stable; + data->args.bitmask = server->attr_bitmask; + data->res.server = server; data->timestamp = jiffies; @@ -2218,6 +2223,8 @@ nfs4_commit_done(struct rpc_task *task) rpc_restart_call(task); return; } + if (task->tk_status >= 0) + nfs_post_op_update_inode(inode, data->res.fattr); /* Call back common NFS writeback processing */ nfs_commit_done(task); } @@ -2233,8 +2240,12 @@ nfs4_proc_commit_setup(struct nfs_write_data *data, int how) .rpc_cred = data->cred, }; struct inode *inode = data->inode; + struct nfs_server *server = NFS_SERVER(inode); int flags; + data->args.bitmask = server->attr_bitmask; + data->res.server = server; + /* Set the initial flags for the task. */ flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 3ee3a1669d28..6f1bf182e0e0 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -159,16 +159,20 @@ static int nfs_stat_to_errno(int); op_decode_hdr_maxsz + 2) #define NFS4_enc_write_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - op_encode_hdr_maxsz + 8) + op_encode_hdr_maxsz + 8 + \ + encode_getattr_maxsz) #define NFS4_dec_write_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - op_decode_hdr_maxsz + 4) + op_decode_hdr_maxsz + 4 + \ + decode_getattr_maxsz) #define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - op_encode_hdr_maxsz + 3) + op_encode_hdr_maxsz + 3 + \ + encode_getattr_maxsz) #define NFS4_dec_commit_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - op_decode_hdr_maxsz + 2) + op_decode_hdr_maxsz + 2 + \ + decode_getattr_maxsz) #define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ op_encode_hdr_maxsz + \ @@ -1799,7 +1803,7 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writ { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 3, }; int status; @@ -1809,6 +1813,9 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writ if (status) goto out; status = encode_write(&xdr, args); + if (status) + goto out; + status = encode_getfattr(&xdr, args->bitmask); out: return status; } @@ -1820,7 +1827,7 @@ static int nfs4_xdr_enc_commit(struct rpc_rqst *req, uint32_t *p, struct nfs_wri { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 3, }; int status; @@ -1830,6 +1837,9 @@ static int nfs4_xdr_enc_commit(struct rpc_rqst *req, uint32_t *p, struct nfs_wri if (status) goto out; status = encode_commit(&xdr, args); + if (status) + goto out; + status = encode_getfattr(&xdr, args->bitmask); out: return status; } @@ -4001,6 +4011,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_wr if (status) goto out; status = decode_write(&xdr, res); + if (status) + goto out; + decode_getfattr(&xdr, res->fattr, res->server); if (!status) status = res->count; out: @@ -4024,6 +4037,9 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_w if (status) goto out; status = decode_commit(&xdr, res); + if (status) + goto out; + decode_getfattr(&xdr, res->fattr, res->server); out: return status; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index deeba7e2c518..40718669b9c8 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -256,6 +256,7 @@ struct nfs_writeargs { enum nfs3_stable_how stable; unsigned int pgbase; struct page ** pages; + const u32 * bitmask; }; struct nfs_writeverf { @@ -267,6 +268,7 @@ struct nfs_writeres { struct nfs_fattr * fattr; struct nfs_writeverf * verf; __u32 count; + const struct nfs_server *server; }; /* From 16c32b71bc53d6f7143702ebb264b4ef20d8f1e5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:45 -0400 Subject: [PATCH 096/126] NFSv4: Convert unnecessary XDR warning messages into dprintk() Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 6f1bf182e0e0..fbbace8a30c4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2766,8 +2766,7 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re goto xdr_error; status = verify_attr_len(xdr, savep, attrlen); xdr_error: - if (status != 0) - printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status); + dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status); return status; } @@ -2800,8 +2799,7 @@ static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) status = verify_attr_len(xdr, savep, attrlen); xdr_error: - if (status != 0) - printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status); + dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status); return status; } @@ -2826,8 +2824,7 @@ static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf status = verify_attr_len(xdr, savep, attrlen); xdr_error: - if (status != 0) - printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status); + dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status); return status; } @@ -2886,8 +2883,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; xdr_error: - if (status != 0) - printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status); + dprintk("%s: xdr returned %d\n", __FUNCTION__, -status); return status; } @@ -2920,8 +2916,7 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) status = verify_attr_len(xdr, savep, attrlen); xdr_error: - if (status != 0) - printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status); + dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status); return status; } @@ -3088,7 +3083,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) p += bmlen; return decode_delegation(xdr, res); xdr_error: - printk(KERN_NOTICE "%s: xdr error!\n", __FUNCTION__); + dprintk("%s: Bitmap too large! Length = %u\n", __FUNCTION__, bmlen); return -EIO; } From bec273b491bd16351a9cdb8e9a51d30afa7fe9f4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:45 -0400 Subject: [PATCH 097/126] NFS: Allow files that are open for write to invalidate caches Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6b3156e15350..f2781ca42761 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1057,15 +1057,11 @@ int nfs_open(struct inode *inode, struct file *filp) ctx->mode = filp->f_mode; nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); - if ((filp->f_mode & FMODE_WRITE) != 0) - nfs_begin_data_update(inode); return 0; } int nfs_release(struct inode *inode, struct file *filp) { - if ((filp->f_mode & FMODE_WRITE) != 0) - nfs_end_data_update(inode); nfs_file_clear_open_context(filp); return 0; } From 6070fe6f82c67021367092855c0812a98becf0fa Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:46 -0400 Subject: [PATCH 098/126] RPC: Ensure that nobody can queue up new upcalls after rpc_close_pipes() Signed-off-by: Trond Myklebust --- net/sunrpc/rpc_pipe.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index ded6c63f11ec..4f188d0a5d11 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -76,25 +76,35 @@ int rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg) { struct rpc_inode *rpci = RPC_I(inode); - int res = 0; + int res = -EPIPE; down(&inode->i_sem); + if (rpci->ops == NULL) + goto out; if (rpci->nreaders) { list_add_tail(&msg->list, &rpci->pipe); rpci->pipelen += msg->len; + res = 0; } else if (rpci->flags & RPC_PIPE_WAIT_FOR_OPEN) { if (list_empty(&rpci->pipe)) schedule_delayed_work(&rpci->queue_timeout, RPC_UPCALL_TIMEOUT); list_add_tail(&msg->list, &rpci->pipe); rpci->pipelen += msg->len; - } else - res = -EPIPE; + res = 0; + } +out: up(&inode->i_sem); wake_up(&rpci->waitq); return res; } +static inline void +rpc_inode_setowner(struct inode *inode, void *private) +{ + RPC_I(inode)->private = private; +} + static void rpc_close_pipes(struct inode *inode) { @@ -111,15 +121,10 @@ rpc_close_pipes(struct inode *inode) rpci->ops->release_pipe(inode); rpci->ops = NULL; } + rpc_inode_setowner(inode, NULL); up(&inode->i_sem); } -static inline void -rpc_inode_setowner(struct inode *inode, void *private) -{ - RPC_I(inode)->private = private; -} - static struct inode * rpc_alloc_inode(struct super_block *sb) { @@ -501,7 +506,6 @@ rpc_depopulate(struct dentry *parent) dentry = dvec[--n]; if (dentry->d_inode) { rpc_close_pipes(dentry->d_inode); - rpc_inode_setowner(dentry->d_inode, NULL); simple_unlink(dir, dentry); } dput(dentry); @@ -576,10 +580,8 @@ __rpc_rmdir(struct inode *dir, struct dentry *dentry) int error; shrink_dcache_parent(dentry); - if (dentry->d_inode) { + if (dentry->d_inode) rpc_close_pipes(dentry->d_inode); - rpc_inode_setowner(dentry->d_inode, NULL); - } if ((error = simple_rmdir(dir, dentry)) != 0) return error; if (!error) { @@ -732,7 +734,6 @@ rpc_unlink(char *path) d_drop(dentry); if (dentry->d_inode) { rpc_close_pipes(dentry->d_inode); - rpc_inode_setowner(dentry->d_inode, NULL); error = simple_unlink(dir, dentry); } dput(dentry); From af4ca457eaf2d6682059c18463eb106e2ce58198 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 02:55:38 -0400 Subject: [PATCH 099/126] [PATCH] gfp_t: infrastructure Beginning of gfp_t annotations: - -Wbitwise added to CHECKFLAGS - old __bitwise renamed to __bitwise__ - __bitwise defined to either __bitwise__ or nothing, depending on __CHECK_ENDIAN__ being defined - gfp_t switched from __nocast to __bitwise__ - force cast to gfp_t added to __GFP_... constants - new helper - gfp_zone(); extracts zone bits out of gfp_t value and casts the result to int Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- Makefile | 2 +- fs/buffer.c | 2 +- include/linux/gfp.h | 39 ++++++++++++++++++++------------------- include/linux/types.h | 9 +++++++-- mm/mempolicy.c | 6 +++--- mm/page_alloc.c | 4 ++-- 6 files changed, 34 insertions(+), 28 deletions(-) diff --git a/Makefile b/Makefile index 1fa7e5343464..f1d121f23025 100644 --- a/Makefile +++ b/Makefile @@ -334,7 +334,7 @@ KALLSYMS = scripts/kallsyms PERL = perl CHECK = sparse -CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ $(CF) +CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise $(CF) MODFLAGS = -DMODULE CFLAGS_MODULE = $(MODFLAGS) AFLAGS_MODULE = $(MODFLAGS) diff --git a/fs/buffer.c b/fs/buffer.c index 1216c0d3c8ce..9657696fd6d7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -502,7 +502,7 @@ static void free_more_memory(void) yield(); for_each_pgdat(pgdat) { - zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones; + zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; if (*zones) try_to_free_pages(zones, GFP_NOFS); } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3010e172394d..c3779432a723 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -12,8 +12,8 @@ struct vm_area_struct; * GFP bitmasks.. */ /* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */ -#define __GFP_DMA 0x01u -#define __GFP_HIGHMEM 0x02u +#define __GFP_DMA ((__force gfp_t)0x01u) +#define __GFP_HIGHMEM ((__force gfp_t)0x02u) /* * Action modifiers - doesn't change the zoning @@ -26,24 +26,24 @@ struct vm_area_struct; * * __GFP_NORETRY: The VM implementation must not retry indefinitely. */ -#define __GFP_WAIT 0x10u /* Can wait and reschedule? */ -#define __GFP_HIGH 0x20u /* Should access emergency pools? */ -#define __GFP_IO 0x40u /* Can start physical IO? */ -#define __GFP_FS 0x80u /* Can call down to low-level FS? */ -#define __GFP_COLD 0x100u /* Cache-cold page required */ -#define __GFP_NOWARN 0x200u /* Suppress page allocation failure warning */ -#define __GFP_REPEAT 0x400u /* Retry the allocation. Might fail */ -#define __GFP_NOFAIL 0x800u /* Retry for ever. Cannot fail */ -#define __GFP_NORETRY 0x1000u /* Do not retry. Might fail */ -#define __GFP_NO_GROW 0x2000u /* Slab internal usage */ -#define __GFP_COMP 0x4000u /* Add compound page metadata */ -#define __GFP_ZERO 0x8000u /* Return zeroed page on success */ -#define __GFP_NOMEMALLOC 0x10000u /* Don't use emergency reserves */ -#define __GFP_NORECLAIM 0x20000u /* No realy zone reclaim during allocation */ -#define __GFP_HARDWALL 0x40000u /* Enforce hardwall cpuset memory allocs */ +#define __GFP_WAIT ((__force gfp_t)0x10u) /* Can wait and reschedule? */ +#define __GFP_HIGH ((__force gfp_t)0x20u) /* Should access emergency pools? */ +#define __GFP_IO ((__force gfp_t)0x40u) /* Can start physical IO? */ +#define __GFP_FS ((__force gfp_t)0x80u) /* Can call down to low-level FS? */ +#define __GFP_COLD ((__force gfp_t)0x100u) /* Cache-cold page required */ +#define __GFP_NOWARN ((__force gfp_t)0x200u) /* Suppress page allocation failure warning */ +#define __GFP_REPEAT ((__force gfp_t)0x400u) /* Retry the allocation. Might fail */ +#define __GFP_NOFAIL ((__force gfp_t)0x800u) /* Retry for ever. Cannot fail */ +#define __GFP_NORETRY ((__force gfp_t)0x1000u)/* Do not retry. Might fail */ +#define __GFP_NO_GROW ((__force gfp_t)0x2000u)/* Slab internal usage */ +#define __GFP_COMP ((__force gfp_t)0x4000u)/* Add compound page metadata */ +#define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */ +#define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */ +#define __GFP_NORECLAIM ((__force gfp_t)0x20000u) /* No realy zone reclaim during allocation */ +#define __GFP_HARDWALL ((__force gfp_t)0x40000u) /* Enforce hardwall cpuset memory allocs */ #define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ -#define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1) +#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* if you forget to add the bitmask here kernel will crash, period */ #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \ @@ -64,6 +64,7 @@ struct vm_area_struct; #define GFP_DMA __GFP_DMA +#define gfp_zone(mask) ((__force int)((mask) & (__force gfp_t)GFP_ZONEMASK)) /* * There is only one page-allocator function, and two main namespaces to @@ -94,7 +95,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, return NULL; return __alloc_pages(gfp_mask, order, - NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK)); + NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask)); } #ifdef CONFIG_NUMA diff --git a/include/linux/types.h b/include/linux/types.h index 0aee34f9da9f..21b9ce803644 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -151,7 +151,12 @@ typedef unsigned long sector_t; */ #ifdef __CHECKER__ -#define __bitwise __attribute__((bitwise)) +#define __bitwise__ __attribute__((bitwise)) +#else +#define __bitwise__ +#endif +#ifdef __CHECK_ENDIAN__ +#define __bitwise __bitwise__ #else #define __bitwise #endif @@ -166,7 +171,7 @@ typedef __u64 __bitwise __be64; #endif #ifdef __KERNEL__ -typedef unsigned __nocast gfp_t; +typedef unsigned __bitwise__ gfp_t; #endif struct ustat { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 37af443eb094..1d5c64df1653 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -700,7 +700,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) case MPOL_BIND: /* Lower zones don't get a policy applied */ /* Careful: current->mems_allowed might have moved */ - if ((gfp & GFP_ZONEMASK) >= policy_zone) + if (gfp_zone(gfp) >= policy_zone) if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist)) return policy->v.zonelist; /*FALL THROUGH*/ @@ -712,7 +712,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) nd = 0; BUG(); } - return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK); + return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp); } /* Do dynamic interleaving for a process */ @@ -757,7 +757,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned ni struct page *page; BUG_ON(!node_online(nid)); - zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); + zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); page = __alloc_pages(gfp, order, zl); if (page && page_zone(page) == zl->zones[0]) { zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e1d3d77f4aee..aa43ae3ab8c9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1089,7 +1089,7 @@ static unsigned int nr_free_zone_pages(int offset) */ unsigned int nr_free_buffer_pages(void) { - return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK); + return nr_free_zone_pages(gfp_zone(GFP_USER)); } /* @@ -1097,7 +1097,7 @@ unsigned int nr_free_buffer_pages(void) */ unsigned int nr_free_pagecache_pages(void) { - return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK); + return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); } #ifdef CONFIG_HIGHMEM From 6daa0e28627abf362138244a620a821a9027d816 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:18:50 -0400 Subject: [PATCH 100/126] [PATCH] gfp_t: mm/* (easy parts) Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- include/linux/pagemap.h | 6 +++--- include/linux/slab.h | 2 +- include/linux/swap.h | 4 ++-- mm/filemap.c | 8 ++++---- mm/mempool.c | 2 +- mm/shmem.c | 4 ++-- mm/slab.c | 8 ++++---- mm/vmscan.c | 8 ++++---- 9 files changed, 22 insertions(+), 22 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 097b3a3c693d..e1649578fb0c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -747,7 +747,7 @@ extern unsigned long do_mremap(unsigned long addr, * The callback will be passed nr_to_scan == 0 when the VM is querying the * cache size, so a fastpath for that case is appropriate. */ -typedef int (*shrinker_t)(int nr_to_scan, unsigned int gfp_mask); +typedef int (*shrinker_t)(int nr_to_scan, gfp_t gfp_mask); /* * Add an aging callback. The int is the number of 'seeks' it takes diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index acbf31c154f8..efbae53fb078 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -69,7 +69,7 @@ extern struct page * find_lock_page(struct address_space *mapping, extern struct page * find_trylock_page(struct address_space *mapping, unsigned long index); extern struct page * find_or_create_page(struct address_space *mapping, - unsigned long index, unsigned int gfp_mask); + unsigned long index, gfp_t gfp_mask); unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, @@ -92,9 +92,9 @@ extern int read_cache_pages(struct address_space *mapping, struct list_head *pages, filler_t *filler, void *data); int add_to_page_cache(struct page *page, struct address_space *mapping, - unsigned long index, int gfp_mask); + unsigned long index, gfp_t gfp_mask); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, - unsigned long index, int gfp_mask); + unsigned long index, gfp_t gfp_mask); extern void remove_from_page_cache(struct page *page); extern void __remove_from_page_cache(struct page *page); diff --git a/include/linux/slab.h b/include/linux/slab.h index 5fc04a16ecb0..09b9aa60063d 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -121,7 +121,7 @@ extern unsigned int ksize(const void *); extern void *kmem_cache_alloc_node(kmem_cache_t *, gfp_t flags, int node); extern void *kmalloc_node(size_t size, gfp_t flags, int node); #else -static inline void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int node) +static inline void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int node) { return kmem_cache_alloc(cachep, flags); } diff --git a/include/linux/swap.h b/include/linux/swap.h index a7bf1a3b1496..20c975642cab 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -171,8 +171,8 @@ extern int rotate_reclaimable_page(struct page *page); extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern int try_to_free_pages(struct zone **, unsigned int); -extern int zone_reclaim(struct zone *, unsigned int, unsigned int); +extern int try_to_free_pages(struct zone **, gfp_t); +extern int zone_reclaim(struct zone *, gfp_t, unsigned int); extern int shrink_all_memory(int); extern int vm_swappiness; diff --git a/mm/filemap.c b/mm/filemap.c index b5346576e58d..1c31b2fd2ca5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -377,7 +377,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, * This function does not add the page to the LRU. The caller must do that. */ int add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t offset, int gfp_mask) + pgoff_t offset, gfp_t gfp_mask) { int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); @@ -401,7 +401,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, EXPORT_SYMBOL(add_to_page_cache); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, - pgoff_t offset, int gfp_mask) + pgoff_t offset, gfp_t gfp_mask) { int ret = add_to_page_cache(page, mapping, offset, gfp_mask); if (ret == 0) @@ -591,7 +591,7 @@ EXPORT_SYMBOL(find_lock_page); * memory exhaustion. */ struct page *find_or_create_page(struct address_space *mapping, - unsigned long index, unsigned int gfp_mask) + unsigned long index, gfp_t gfp_mask) { struct page *page, *cached_page = NULL; int err; @@ -683,7 +683,7 @@ struct page * grab_cache_page_nowait(struct address_space *mapping, unsigned long index) { struct page *page = find_get_page(mapping, index); - unsigned int gfp_mask; + gfp_t gfp_mask; if (page) { if (!TestSetPageLocked(page)) diff --git a/mm/mempool.c b/mm/mempool.c index 9e377ea700b2..1a99b80480d3 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -205,7 +205,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) void *element; unsigned long flags; wait_queue_t wait; - unsigned int gfp_temp; + gfp_t gfp_temp; might_sleep_if(gfp_mask & __GFP_WAIT); diff --git a/mm/shmem.c b/mm/shmem.c index ea064d89cda9..55e04a0734c1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -85,7 +85,7 @@ enum sgp_type { static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **pagep, enum sgp_type sgp, int *type); -static inline struct page *shmem_dir_alloc(unsigned int gfp_mask) +static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) { /* * The above definition of ENTRIES_PER_PAGE, and the use of @@ -898,7 +898,7 @@ struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry, } static struct page * -shmem_alloc_page(unsigned long gfp, struct shmem_inode_info *info, +shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, unsigned long idx) { struct vm_area_struct pvma; diff --git a/mm/slab.c b/mm/slab.c index d05c678bceb3..d30423f167a2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -386,7 +386,7 @@ struct kmem_cache_s { unsigned int gfporder; /* force GFP flags, e.g. GFP_DMA */ - unsigned int gfpflags; + gfp_t gfpflags; size_t colour; /* cache colouring range */ unsigned int colour_off; /* colour offset */ @@ -2117,7 +2117,7 @@ static void cache_init_objs(kmem_cache_t *cachep, slabp->free = 0; } -static void kmem_flagcheck(kmem_cache_t *cachep, unsigned int flags) +static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags) { if (flags & SLAB_DMA) { if (!(cachep->gfpflags & GFP_DMA)) @@ -2152,7 +2152,7 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) struct slab *slabp; void *objp; size_t offset; - unsigned int local_flags; + gfp_t local_flags; unsigned long ctor_flags; struct kmem_list3 *l3; @@ -2546,7 +2546,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) /* * A interface to enable slab creation on nodeid */ -static void *__cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid) +static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) { struct list_head *entry; struct slab *slabp; diff --git a/mm/vmscan.c b/mm/vmscan.c index 64f9570cff56..843c87d1e61f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -70,7 +70,7 @@ struct scan_control { unsigned int priority; /* This context's GFP mask */ - unsigned int gfp_mask; + gfp_t gfp_mask; int may_writepage; @@ -186,7 +186,7 @@ EXPORT_SYMBOL(remove_shrinker); * * Returns the number of slab objects which we shrunk. */ -static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, +static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) { struct shrinker *shrinker; @@ -926,7 +926,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc) * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. */ -int try_to_free_pages(struct zone **zones, unsigned int gfp_mask) +int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) { int priority; int ret = 0; @@ -1338,7 +1338,7 @@ module_init(kswapd_init) /* * Try to free up some pages from this zone through reclaim. */ -int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order) +int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) { struct scan_control sc; int nr_pages = 1 << order; From fd4f2df24bc23e6b8fc069765b425c7dacf52347 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:18:50 -0400 Subject: [PATCH 101/126] [PATCH] gfp_t: lib/* Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/linux/idr.h | 2 +- include/linux/kobject.h | 2 +- include/linux/radix-tree.h | 2 +- include/linux/textsearch.h | 4 ++-- lib/idr.c | 2 +- lib/kobject.c | 2 +- lib/kobject_uevent.c | 4 ++-- lib/textsearch.c | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/idr.h b/include/linux/idr.h index 3d5de45f961b..7fb3ff9c7b0e 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -71,7 +71,7 @@ struct idr { */ void *idr_find(struct idr *idp, int id); -int idr_pre_get(struct idr *idp, unsigned gfp_mask); +int idr_pre_get(struct idr *idp, gfp_t gfp_mask); int idr_get_new(struct idr *idp, void *ptr, int *id); int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id); void idr_remove(struct idr *idp, int id); diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 3b22304f12fd..7f7403aa4a41 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -65,7 +65,7 @@ extern void kobject_unregister(struct kobject *); extern struct kobject * kobject_get(struct kobject *); extern void kobject_put(struct kobject *); -extern char * kobject_get_path(struct kobject *, int); +extern char * kobject_get_path(struct kobject *, gfp_t); struct kobj_type { void (*release)(struct kobject *); diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 045d4761febc..9f0f9281f42a 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -24,7 +24,7 @@ struct radix_tree_root { unsigned int height; - unsigned int gfp_mask; + gfp_t gfp_mask; struct radix_tree_node *rnode; }; diff --git a/include/linux/textsearch.h b/include/linux/textsearch.h index 515046d1b2f4..fc5bb4e91a58 100644 --- a/include/linux/textsearch.h +++ b/include/linux/textsearch.h @@ -40,7 +40,7 @@ struct ts_state struct ts_ops { const char *name; - struct ts_config * (*init)(const void *, unsigned int, int); + struct ts_config * (*init)(const void *, unsigned int, gfp_t); unsigned int (*find)(struct ts_config *, struct ts_state *); void (*destroy)(struct ts_config *); @@ -148,7 +148,7 @@ static inline unsigned int textsearch_get_pattern_len(struct ts_config *conf) extern int textsearch_register(struct ts_ops *); extern int textsearch_unregister(struct ts_ops *); extern struct ts_config *textsearch_prepare(const char *, const void *, - unsigned int, int, int); + unsigned int, gfp_t, int); extern void textsearch_destroy(struct ts_config *conf); extern unsigned int textsearch_find_continuous(struct ts_config *, struct ts_state *, diff --git a/lib/idr.c b/lib/idr.c index d4df21debc4d..6414b2fb482d 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -72,7 +72,7 @@ static void free_layer(struct idr *idp, struct idr_layer *p) * If the system is REALLY out of memory this function returns 0, * otherwise 1. */ -int idr_pre_get(struct idr *idp, unsigned gfp_mask) +int idr_pre_get(struct idr *idp, gfp_t gfp_mask) { while (idp->id_free_cnt < IDR_FREE_MAX) { struct idr_layer *new; diff --git a/lib/kobject.c b/lib/kobject.c index dd0917dd9fa9..253d3004ace9 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -100,7 +100,7 @@ static void fill_kobj_path(struct kobject *kobj, char *path, int length) * @kobj: kobject in question, with which to build the path * @gfp_mask: the allocation type used to allocate the path */ -char *kobject_get_path(struct kobject *kobj, int gfp_mask) +char *kobject_get_path(struct kobject *kobj, gfp_t gfp_mask) { char *path; int len; diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 04ca4429ddfa..7ef6f6a17aa6 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -62,7 +62,7 @@ static struct sock *uevent_sock; * @gfp_mask: */ static int send_uevent(const char *signal, const char *obj, - char **envp, int gfp_mask) + char **envp, gfp_t gfp_mask) { struct sk_buff *skb; char *pos; @@ -98,7 +98,7 @@ static int send_uevent(const char *signal, const char *obj, } static int do_kobject_uevent(struct kobject *kobj, enum kobject_action action, - struct attribute *attr, int gfp_mask) + struct attribute *attr, gfp_t gfp_mask) { char *path; char *attrpath; diff --git a/lib/textsearch.c b/lib/textsearch.c index 1e934c196f0f..6f3093efbd7b 100644 --- a/lib/textsearch.c +++ b/lib/textsearch.c @@ -254,7 +254,7 @@ unsigned int textsearch_find_continuous(struct ts_config *conf, * parameters or a ERR_PTR(). */ struct ts_config *textsearch_prepare(const char *algo, const void *pattern, - unsigned int len, int gfp_mask, int flags) + unsigned int len, gfp_t gfp_mask, int flags) { int err = -ENOENT; struct ts_config *conf; From 7d877f3bda870ab5f001bd92528654471d5966b3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:20:43 -0400 Subject: [PATCH 102/126] [PATCH] gfp_t: net/* Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/linux/security.h | 2 +- include/net/sock.h | 2 +- net/core/sock.c | 2 +- net/dccp/output.c | 2 +- net/netlink/af_netlink.c | 2 +- security/dummy.c | 2 +- security/selinux/hooks.c | 4 ++-- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/security.h b/include/linux/security.h index 627382e74057..dac956ed98f0 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1210,7 +1210,7 @@ struct security_operations { int (*socket_shutdown) (struct socket * sock, int how); int (*socket_sock_rcv_skb) (struct sock * sk, struct sk_buff * skb); int (*socket_getpeersec) (struct socket *sock, char __user *optval, int __user *optlen, unsigned len); - int (*sk_alloc_security) (struct sock *sk, int family, int priority); + int (*sk_alloc_security) (struct sock *sk, int family, gfp_t priority); void (*sk_free_security) (struct sock *sk); #endif /* CONFIG_SECURITY_NETWORK */ }; diff --git a/include/net/sock.h b/include/net/sock.h index ecb75526cba0..e0498bd36004 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -207,7 +207,7 @@ struct sock { struct sk_buff_head sk_write_queue; int sk_wmem_queued; int sk_forward_alloc; - unsigned int sk_allocation; + gfp_t sk_allocation; int sk_sndbuf; int sk_route_caps; unsigned long sk_flags; diff --git a/net/core/sock.c b/net/core/sock.c index 1c52fe809eda..9602ceb3bac9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -940,7 +940,7 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, int noblock, int *errcode) { struct sk_buff *skb; - unsigned int gfp_mask; + gfp_t gfp_mask; long timeo; int err; diff --git a/net/dccp/output.c b/net/dccp/output.c index 29250749f16f..d59f86f7ceab 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -495,7 +495,7 @@ void dccp_send_close(struct sock *sk, const int active) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; - const unsigned int prio = active ? GFP_KERNEL : GFP_ATOMIC; + const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC; skb = alloc_skb(sk->sk_prot->max_header, prio); if (skb == NULL) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 678c3f2c0d0b..291df2e4c492 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -827,7 +827,7 @@ struct netlink_broadcast_data { int failure; int congested; int delivered; - unsigned int allocation; + gfp_t allocation; struct sk_buff *skb, *skb2; }; diff --git a/security/dummy.c b/security/dummy.c index 9623a61dfc76..3d34f3de7e82 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -768,7 +768,7 @@ static int dummy_socket_getpeersec(struct socket *sock, char __user *optval, return -ENOPROTOOPT; } -static inline int dummy_sk_alloc_security (struct sock *sk, int family, int priority) +static inline int dummy_sk_alloc_security (struct sock *sk, int family, gfp_t priority) { return 0; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index b13be15165f5..447a1e0f48cb 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -262,7 +262,7 @@ static void superblock_free_security(struct super_block *sb) } #ifdef CONFIG_SECURITY_NETWORK -static int sk_alloc_security(struct sock *sk, int family, int priority) +static int sk_alloc_security(struct sock *sk, int family, gfp_t priority) { struct sk_security_struct *ssec; @@ -3380,7 +3380,7 @@ static int selinux_socket_getpeersec(struct socket *sock, char __user *optval, return err; } -static int selinux_sk_alloc_security(struct sock *sk, int family, int priority) +static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority) { return sk_alloc_security(sk, family, priority); } From 27496a8c67bef4d789d8e3c8317ca35813a507ae Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:20:48 -0400 Subject: [PATCH 103/126] [PATCH] gfp_t: fs/* - ->releasepage() annotated (s/int/gfp_t), instances updated - missing gfp_t in fs/* added - fixed misannotation from the original sweep caught by bitwise checks: XFS used __nocast both for gfp_t and for flags used by XFS allocator. The latter left with unsigned int __nocast; we might want to add a different type for those but for now let's leave them alone. That, BTW, is a case when __nocast use had been actively confusing - it had been used in the same code for two different and similar types, with no way to catch misuses. Switch of gfp_t to bitwise had caught that immediately... One tricky bit is left alone to be dealt with later - mapping->flags is a mix of gfp_t and error indications. Left alone for now. Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/afs/file.c | 4 ++-- fs/bio.c | 4 ++-- fs/buffer.c | 2 +- fs/dcache.c | 2 +- fs/dquot.c | 2 +- fs/ext3/inode.c | 2 +- fs/hfs/inode.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/inode.c | 2 +- fs/jbd/journal.c | 2 +- fs/jbd/transaction.c | 2 +- fs/jfs/jfs_metapage.c | 4 ++-- fs/mbcache.c | 6 +++--- fs/reiserfs/fix_node.c | 2 +- fs/reiserfs/inode.c | 2 +- fs/xfs/linux-2.6/kmem.c | 22 +++++++++++----------- fs/xfs/linux-2.6/kmem.h | 18 +++++++++--------- fs/xfs/linux-2.6/xfs_aops.c | 2 +- fs/xfs/linux-2.6/xfs_buf.c | 8 ++++---- include/linux/bio.h | 2 +- include/linux/buffer_head.h | 2 +- include/linux/fs.h | 2 +- include/linux/jbd.h | 4 ++-- include/linux/mbcache.h | 2 +- include/linux/reiserfs_fs.h | 2 +- 25 files changed, 52 insertions(+), 52 deletions(-) diff --git a/fs/afs/file.c b/fs/afs/file.c index 23c125128024..0d576987ec67 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -29,7 +29,7 @@ static int afs_file_release(struct inode *inode, struct file *file); static int afs_file_readpage(struct file *file, struct page *page); static int afs_file_invalidatepage(struct page *page, unsigned long offset); -static int afs_file_releasepage(struct page *page, int gfp_flags); +static int afs_file_releasepage(struct page *page, gfp_t gfp_flags); static ssize_t afs_file_write(struct file *file, const char __user *buf, size_t size, loff_t *off); @@ -279,7 +279,7 @@ static int afs_file_invalidatepage(struct page *page, unsigned long offset) /* * release a page and cleanup its private data */ -static int afs_file_releasepage(struct page *page, int gfp_flags) +static int afs_file_releasepage(struct page *page, gfp_t gfp_flags) { struct cachefs_page *pageio; diff --git a/fs/bio.c b/fs/bio.c index 7d81a93afd48..460554b07ff9 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -778,7 +778,7 @@ static int bio_map_kern_endio(struct bio *bio, unsigned int bytes_done, int err) static struct bio *__bio_map_kern(request_queue_t *q, void *data, - unsigned int len, unsigned int gfp_mask) + unsigned int len, gfp_t gfp_mask) { unsigned long kaddr = (unsigned long)data; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -825,7 +825,7 @@ static struct bio *__bio_map_kern(request_queue_t *q, void *data, * device. Returns an error pointer in case of error. */ struct bio *bio_map_kern(request_queue_t *q, void *data, unsigned int len, - unsigned int gfp_mask) + gfp_t gfp_mask) { struct bio *bio; diff --git a/fs/buffer.c b/fs/buffer.c index 9657696fd6d7..b1667986442f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1571,7 +1571,7 @@ static inline void discard_buffer(struct buffer_head * bh) * * NOTE: @gfp_mask may go away, and this function may become non-blocking. */ -int try_to_release_page(struct page *page, int gfp_mask) +int try_to_release_page(struct page *page, gfp_t gfp_mask) { struct address_space * const mapping = page->mapping; diff --git a/fs/dcache.c b/fs/dcache.c index fb10386c59be..e90512ed35a4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -689,7 +689,7 @@ void shrink_dcache_anon(struct hlist_head *head) * * In this case we return -1 to tell the caller that we baled. */ -static int shrink_dcache_memory(int nr, unsigned int gfp_mask) +static int shrink_dcache_memory(int nr, gfp_t gfp_mask) { if (nr) { if (!(gfp_mask & __GFP_FS)) diff --git a/fs/dquot.c b/fs/dquot.c index b9732335bcdc..05f3327d64a3 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -500,7 +500,7 @@ static void prune_dqcache(int count) * more memory */ -static int shrink_dqcache_memory(int nr, unsigned int gfp_mask) +static int shrink_dqcache_memory(int nr, gfp_t gfp_mask) { if (nr) { spin_lock(&dq_list_lock); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index b5177c90d6f1..8b38f2232796 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1434,7 +1434,7 @@ static int ext3_invalidatepage(struct page *page, unsigned long offset) return journal_invalidatepage(journal, page, offset); } -static int ext3_releasepage(struct page *page, int wait) +static int ext3_releasepage(struct page *page, gfp_t wait) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index f1570b9f9de3..3f680c5675bf 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -46,7 +46,7 @@ static sector_t hfs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, hfs_get_block); } -static int hfs_releasepage(struct page *page, int mask) +static int hfs_releasepage(struct page *page, gfp_t mask) { struct inode *inode = page->mapping->host; struct super_block *sb = inode->i_sb; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index d5642705f633..f205773ddfbe 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -40,7 +40,7 @@ static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, hfsplus_get_block); } -static int hfsplus_releasepage(struct page *page, int mask) +static int hfsplus_releasepage(struct page *page, gfp_t mask) { struct inode *inode = page->mapping->host; struct super_block *sb = inode->i_sb; diff --git a/fs/inode.c b/fs/inode.c index f80a79ff156b..7d3316527767 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -475,7 +475,7 @@ static void prune_icache(int nr_to_scan) * This function is passed the number of inodes to scan, and it returns the * total number of remaining possibly-reclaimable inodes. */ -static int shrink_icache_memory(int nr, unsigned int gfp_mask) +static int shrink_icache_memory(int nr, gfp_t gfp_mask) { if (nr) { /* diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 7ae2c4fe506b..e4b516ac4989 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1606,7 +1606,7 @@ int journal_blocks_per_page(struct inode *inode) * Simple support for retrying memory allocations. Introduced to help to * debug different VM deadlock avoidance strategies. */ -void * __jbd_kmalloc (const char *where, size_t size, int flags, int retry) +void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry) { return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0)); } diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 49bbc2be3d72..13cb05bf6048 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1621,7 +1621,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) * while the data is part of a transaction. Yes? */ int journal_try_to_free_buffers(journal_t *journal, - struct page *page, int unused_gfp_mask) + struct page *page, gfp_t unused_gfp_mask) { struct buffer_head *head; struct buffer_head *bh; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 13d7e3f1feb4..eeb37d70e650 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -198,7 +198,7 @@ static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) } } -static inline struct metapage *alloc_metapage(unsigned int gfp_mask) +static inline struct metapage *alloc_metapage(gfp_t gfp_mask) { return mempool_alloc(metapage_mempool, gfp_mask); } @@ -534,7 +534,7 @@ static int metapage_readpage(struct file *fp, struct page *page) return -EIO; } -static int metapage_releasepage(struct page *page, int gfp_mask) +static int metapage_releasepage(struct page *page, gfp_t gfp_mask) { struct metapage *mp; int busy = 0; diff --git a/fs/mbcache.c b/fs/mbcache.c index b002a088857d..298997f17475 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -116,7 +116,7 @@ mb_cache_indexes(struct mb_cache *cache) * What the mbcache registers as to get shrunk dynamically. */ -static int mb_cache_shrink_fn(int nr_to_scan, unsigned int gfp_mask); +static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask); static inline int @@ -140,7 +140,7 @@ __mb_cache_entry_unhash(struct mb_cache_entry *ce) static inline void -__mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask) +__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) { struct mb_cache *cache = ce->e_cache; @@ -193,7 +193,7 @@ __mb_cache_entry_release_unlock(struct mb_cache_entry *ce) * Returns the number of objects which are present in the cache. */ static int -mb_cache_shrink_fn(int nr_to_scan, unsigned int gfp_mask) +mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(free_list); struct list_head *l, *ltmp; diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 2706e2adffab..45829889dcdc 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -2022,7 +2022,7 @@ static int get_neighbors(struct tree_balance *p_s_tb, int n_h) } #ifdef CONFIG_REISERFS_CHECK -void *reiserfs_kmalloc(size_t size, int flags, struct super_block *s) +void *reiserfs_kmalloc(size_t size, gfp_t flags, struct super_block *s) { void *vp; static size_t malloced; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index d76ee6c4f9b8..5f82352b97e1 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2842,7 +2842,7 @@ static int reiserfs_set_page_dirty(struct page *page) * even in -o notail mode, we can't be sure an old mount without -o notail * didn't create files with tails. */ -static int reiserfs_releasepage(struct page *page, int unused_gfp_flags) +static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags) { struct inode *inode = page->mapping->host; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index d2653b589b1c..3c92162dc728 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -45,11 +45,11 @@ void * -kmem_alloc(size_t size, gfp_t flags) +kmem_alloc(size_t size, unsigned int __nocast flags) { - int retries = 0; - unsigned int lflags = kmem_flags_convert(flags); - void *ptr; + int retries = 0; + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; do { if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) @@ -67,7 +67,7 @@ kmem_alloc(size_t size, gfp_t flags) } void * -kmem_zalloc(size_t size, gfp_t flags) +kmem_zalloc(size_t size, unsigned int __nocast flags) { void *ptr; @@ -90,7 +90,7 @@ kmem_free(void *ptr, size_t size) void * kmem_realloc(void *ptr, size_t newsize, size_t oldsize, - gfp_t flags) + unsigned int __nocast flags) { void *new; @@ -105,11 +105,11 @@ kmem_realloc(void *ptr, size_t newsize, size_t oldsize, } void * -kmem_zone_alloc(kmem_zone_t *zone, gfp_t flags) +kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) { - int retries = 0; - unsigned int lflags = kmem_flags_convert(flags); - void *ptr; + int retries = 0; + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; do { ptr = kmem_cache_alloc(zone, lflags); @@ -124,7 +124,7 @@ kmem_zone_alloc(kmem_zone_t *zone, gfp_t flags) } void * -kmem_zone_zalloc(kmem_zone_t *zone, gfp_t flags) +kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags) { void *ptr; diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index ee7010f085bc..f4bb78c268c0 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h @@ -81,9 +81,9 @@ typedef unsigned long xfs_pflags_t; *(NSTATEP) = *(OSTATEP); \ } while (0) -static __inline unsigned int kmem_flags_convert(gfp_t flags) +static __inline gfp_t kmem_flags_convert(unsigned int __nocast flags) { - unsigned int lflags = __GFP_NOWARN; /* we'll report problems, if need be */ + gfp_t lflags = __GFP_NOWARN; /* we'll report problems, if need be */ #ifdef DEBUG if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL))) { @@ -125,16 +125,16 @@ kmem_zone_destroy(kmem_zone_t *zone) BUG(); } -extern void *kmem_zone_zalloc(kmem_zone_t *, gfp_t); -extern void *kmem_zone_alloc(kmem_zone_t *, gfp_t); +extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); +extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); -extern void *kmem_alloc(size_t, gfp_t); -extern void *kmem_realloc(void *, size_t, size_t, gfp_t); -extern void *kmem_zalloc(size_t, gfp_t); +extern void *kmem_alloc(size_t, unsigned int __nocast); +extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast); +extern void *kmem_zalloc(size_t, unsigned int __nocast); extern void kmem_free(void *, size_t); typedef struct shrinker *kmem_shaker_t; -typedef int (*kmem_shake_func_t)(int, unsigned int); +typedef int (*kmem_shake_func_t)(int, gfp_t); static __inline kmem_shaker_t kmem_shake_register(kmem_shake_func_t sfunc) @@ -149,7 +149,7 @@ kmem_shake_deregister(kmem_shaker_t shrinker) } static __inline int -kmem_shake_allow(unsigned int gfp_mask) +kmem_shake_allow(gfp_t gfp_mask) { return (gfp_mask & __GFP_WAIT); } diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index c6c077978fe3..7aa398724706 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1296,7 +1296,7 @@ linvfs_invalidate_page( STATIC int linvfs_release_page( struct page *page, - int gfp_mask) + gfp_t gfp_mask) { struct inode *inode = page->mapping->host; int dirty, delalloc, unmapped, unwritten; diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index e82cf72ac599..ba4767c04adf 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -64,7 +64,7 @@ STATIC kmem_cache_t *pagebuf_zone; STATIC kmem_shaker_t pagebuf_shake; -STATIC int xfsbufd_wakeup(int, unsigned int); +STATIC int xfsbufd_wakeup(int, gfp_t); STATIC void pagebuf_delwri_queue(xfs_buf_t *, int); STATIC struct workqueue_struct *xfslogd_workqueue; @@ -383,7 +383,7 @@ _pagebuf_lookup_pages( size_t blocksize = bp->pb_target->pbr_bsize; size_t size = bp->pb_count_desired; size_t nbytes, offset; - int gfp_mask = pb_to_gfp(flags); + gfp_t gfp_mask = pb_to_gfp(flags); unsigned short page_count, i; pgoff_t first; loff_t end; @@ -1749,8 +1749,8 @@ STATIC int xfsbufd_force_sleep; STATIC int xfsbufd_wakeup( - int priority, - unsigned int mask) + int priority, + gfp_t mask) { if (xfsbufd_force_sleep) return 0; diff --git a/include/linux/bio.h b/include/linux/bio.h index 3344b4e8e43a..685fd3720df5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -301,7 +301,7 @@ extern struct bio *bio_map_user_iov(struct request_queue *, struct sg_iovec *, int, int); extern void bio_unmap_user(struct bio *); extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, - unsigned int); + gfp_t); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); extern struct bio *bio_copy_user(struct request_queue *, unsigned long, unsigned int, int); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6a1d154c0825..88af42f5e04a 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -188,7 +188,7 @@ extern int buffer_heads_over_limit; * Generic address_space_operations implementations for buffer_head-backed * address_spaces. */ -int try_to_release_page(struct page * page, int gfp_mask); +int try_to_release_page(struct page * page, gfp_t gfp_mask); int block_invalidatepage(struct page *page, unsigned long offset); int block_write_full_page(struct page *page, get_block_t *get_block, struct writeback_control *wbc); diff --git a/include/linux/fs.h b/include/linux/fs.h index e0b77c5af9a0..f83d997c5582 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -320,7 +320,7 @@ struct address_space_operations { /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); int (*invalidatepage) (struct page *, unsigned long); - int (*releasepage) (struct page *, int); + int (*releasepage) (struct page *, gfp_t); ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, loff_t offset, unsigned long nr_segs); struct page* (*get_xip_page)(struct address_space *, sector_t, diff --git a/include/linux/jbd.h b/include/linux/jbd.h index ff853b3173c6..be197eb90077 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -69,7 +69,7 @@ extern int journal_enable_debug; #define jbd_debug(f, a...) /**/ #endif -extern void * __jbd_kmalloc (const char *where, size_t size, int flags, int retry); +extern void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry); #define jbd_kmalloc(size, flags) \ __jbd_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry) #define jbd_rep_kmalloc(size, flags) \ @@ -890,7 +890,7 @@ extern int journal_forget (handle_t *, struct buffer_head *); extern void journal_sync_buffer (struct buffer_head *); extern int journal_invalidatepage(journal_t *, struct page *, unsigned long); -extern int journal_try_to_free_buffers(journal_t *, struct page *, int); +extern int journal_try_to_free_buffers(journal_t *, struct page *, gfp_t); extern int journal_stop(handle_t *); extern int journal_flush (journal_t *); extern void journal_lock_updates (journal_t *); diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h index 9263d2db2d67..99e044b4efc6 100644 --- a/include/linux/mbcache.h +++ b/include/linux/mbcache.h @@ -22,7 +22,7 @@ struct mb_cache_entry { }; struct mb_cache_op { - int (*free)(struct mb_cache_entry *, int); + int (*free)(struct mb_cache_entry *, gfp_t); }; /* Functions on caches */ diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index af00b10294cd..001ab82df051 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1972,7 +1972,7 @@ extern struct address_space_operations reiserfs_address_space_operations; /* fix_nodes.c */ #ifdef CONFIG_REISERFS_CHECK -void *reiserfs_kmalloc(size_t size, int flags, struct super_block *s); +void *reiserfs_kmalloc(size_t size, gfp_t flags, struct super_block *s); void reiserfs_kfree(const void *vp, size_t size, struct super_block *s); #else static inline void *reiserfs_kmalloc(size_t size, int flags, From 8267e268e0914ac9371d07f711fcf20cc572993c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:20:53 -0400 Subject: [PATCH 104/126] [PATCH] gfp_t: block layer core Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- drivers/block/as-iosched.c | 2 +- drivers/block/cfq-iosched.c | 8 ++++---- drivers/block/deadline-iosched.c | 2 +- drivers/block/elevator.c | 2 +- drivers/block/ll_rw_blk.c | 16 ++++++++-------- include/linux/blkdev.h | 14 +++++++------- include/linux/elevator.h | 4 ++-- 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c index 95c0a3690b0f..1f08e14697e9 100644 --- a/drivers/block/as-iosched.c +++ b/drivers/block/as-iosched.c @@ -1807,7 +1807,7 @@ static void as_put_request(request_queue_t *q, struct request *rq) } static int as_set_request(request_queue_t *q, struct request *rq, - struct bio *bio, int gfp_mask) + struct bio *bio, gfp_t gfp_mask) { struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask); diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index cd056e7e64ec..d3bfe8cfb039 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -1422,7 +1422,7 @@ static void cfq_exit_io_context(struct cfq_io_context *cic) } static struct cfq_io_context * -cfq_alloc_io_context(struct cfq_data *cfqd, int gfp_mask) +cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) { struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask); @@ -1517,7 +1517,7 @@ static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio) static struct cfq_queue * cfq_get_queue(struct cfq_data *cfqd, unsigned int key, unsigned short ioprio, - int gfp_mask) + gfp_t gfp_mask) { const int hashval = hash_long(key, CFQ_QHASH_SHIFT); struct cfq_queue *cfqq, *new_cfqq = NULL; @@ -1578,7 +1578,7 @@ cfq_get_queue(struct cfq_data *cfqd, unsigned int key, unsigned short ioprio, * cfqq, so we don't need to worry about it disappearing */ static struct cfq_io_context * -cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, int gfp_mask) +cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask) { struct io_context *ioc = NULL; struct cfq_io_context *cic; @@ -2075,7 +2075,7 @@ static void cfq_put_request(request_queue_t *q, struct request *rq) */ static int cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, - int gfp_mask) + gfp_t gfp_mask) { struct cfq_data *cfqd = q->elevator->elevator_data; struct task_struct *tsk = current; diff --git a/drivers/block/deadline-iosched.c b/drivers/block/deadline-iosched.c index 52a3ae5289a0..753546ba2262 100644 --- a/drivers/block/deadline-iosched.c +++ b/drivers/block/deadline-iosched.c @@ -756,7 +756,7 @@ static void deadline_put_request(request_queue_t *q, struct request *rq) static int deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio, - int gfp_mask) + gfp_t gfp_mask) { struct deadline_data *dd = q->elevator->elevator_data; struct deadline_rq *drq; diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 98f0126a2deb..c744d2a13062 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -487,7 +487,7 @@ struct request *elv_former_request(request_queue_t *q, struct request *rq) } int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio, - int gfp_mask) + gfp_t gfp_mask) { elevator_t *e = q->elevator; diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index baedac522945..0f64ee7d8d27 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1652,13 +1652,13 @@ static int blk_init_free_list(request_queue_t *q) static int __make_request(request_queue_t *, struct bio *); -request_queue_t *blk_alloc_queue(int gfp_mask) +request_queue_t *blk_alloc_queue(gfp_t gfp_mask) { return blk_alloc_queue_node(gfp_mask, -1); } EXPORT_SYMBOL(blk_alloc_queue); -request_queue_t *blk_alloc_queue_node(int gfp_mask, int node_id) +request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { request_queue_t *q; @@ -1787,7 +1787,7 @@ static inline void blk_free_request(request_queue_t *q, struct request *rq) } static inline struct request * -blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask) +blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, gfp_t gfp_mask) { struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); @@ -1885,7 +1885,7 @@ static void freed_request(request_queue_t *q, int rw) * Returns !NULL on success, with queue_lock *not held*. */ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, - int gfp_mask) + gfp_t gfp_mask) { struct request *rq = NULL; struct request_list *rl = &q->rq; @@ -2019,7 +2019,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw, return rq; } -struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask) +struct request *blk_get_request(request_queue_t *q, int rw, gfp_t gfp_mask) { struct request *rq; @@ -2251,7 +2251,7 @@ EXPORT_SYMBOL(blk_rq_unmap_user); * @gfp_mask: memory allocation flags */ int blk_rq_map_kern(request_queue_t *q, struct request *rq, void *kbuf, - unsigned int len, unsigned int gfp_mask) + unsigned int len, gfp_t gfp_mask) { struct bio *bio; @@ -3393,7 +3393,7 @@ void exit_io_context(void) * but since the current task itself holds a reference, the context can be * used in general code, so long as it stays within `current` context. */ -struct io_context *current_io_context(int gfp_flags) +struct io_context *current_io_context(gfp_t gfp_flags) { struct task_struct *tsk = current; struct io_context *ret; @@ -3424,7 +3424,7 @@ EXPORT_SYMBOL(current_io_context); * * This is always called in the context of the task which submitted the I/O. */ -struct io_context *get_io_context(int gfp_flags) +struct io_context *get_io_context(gfp_t gfp_flags) { struct io_context *ret; ret = current_io_context(gfp_flags); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index efdc9b5bc05c..1afbdb2d752c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -96,8 +96,8 @@ struct io_context { void put_io_context(struct io_context *ioc); void exit_io_context(void); -struct io_context *current_io_context(int gfp_flags); -struct io_context *get_io_context(int gfp_flags); +struct io_context *current_io_context(gfp_t gfp_flags); +struct io_context *get_io_context(gfp_t gfp_flags); void copy_io_context(struct io_context **pdst, struct io_context **psrc); void swap_io_context(struct io_context **ioc1, struct io_context **ioc2); @@ -354,7 +354,7 @@ struct request_queue * queue needs bounce pages for pages above this limit */ unsigned long bounce_pfn; - unsigned int bounce_gfp; + gfp_t bounce_gfp; /* * various queue flags, see QUEUE_* below @@ -550,7 +550,7 @@ extern void generic_make_request(struct bio *bio); extern void blk_put_request(struct request *); extern void blk_end_sync_rq(struct request *rq); extern void blk_attempt_remerge(request_queue_t *, struct request *); -extern struct request *blk_get_request(request_queue_t *, int, int); +extern struct request *blk_get_request(request_queue_t *, int, gfp_t); extern void blk_insert_request(request_queue_t *, struct request *, int, void *); extern void blk_requeue_request(request_queue_t *, struct request *); extern void blk_plug_device(request_queue_t *); @@ -565,7 +565,7 @@ extern void blk_run_queue(request_queue_t *); extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *); extern int blk_rq_map_user(request_queue_t *, struct request *, void __user *, unsigned int); extern int blk_rq_unmap_user(struct bio *, unsigned int); -extern int blk_rq_map_kern(request_queue_t *, struct request *, void *, unsigned int, unsigned int); +extern int blk_rq_map_kern(request_queue_t *, struct request *, void *, unsigned int, gfp_t); extern int blk_rq_map_user_iov(request_queue_t *, struct request *, struct sg_iovec *, int); extern int blk_execute_rq(request_queue_t *, struct gendisk *, struct request *, int); @@ -654,8 +654,8 @@ extern void blk_wait_queue_drained(request_queue_t *, int); extern void blk_finish_queue_drain(request_queue_t *); int blk_get_queue(request_queue_t *); -request_queue_t *blk_alloc_queue(int gfp_mask); -request_queue_t *blk_alloc_queue_node(int,int); +request_queue_t *blk_alloc_queue(gfp_t); +request_queue_t *blk_alloc_queue_node(gfp_t, int); #define blk_put_queue(q) blk_cleanup_queue((q)) /* diff --git a/include/linux/elevator.h b/include/linux/elevator.h index ea6bbc2d7407..ed93125c1db5 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -18,7 +18,7 @@ typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct re typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *); typedef int (elevator_may_queue_fn) (request_queue_t *, int, struct bio *); -typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, struct bio *, int); +typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, struct bio *, gfp_t); typedef void (elevator_put_req_fn) (request_queue_t *, struct request *); typedef void (elevator_deactivate_req_fn) (request_queue_t *, struct request *); @@ -98,7 +98,7 @@ extern int elv_register_queue(request_queue_t *q); extern void elv_unregister_queue(request_queue_t *q); extern int elv_may_queue(request_queue_t *, int, struct bio *); extern void elv_completed_request(request_queue_t *, struct request *); -extern int elv_set_request(request_queue_t *, struct request *, struct bio *, int); +extern int elv_set_request(request_queue_t *, struct request *, struct bio *, gfp_t); extern void elv_put_request(request_queue_t *, struct request *); /* From f9e3214a7964f523e12b4f30b6bd6396794818bd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:20:58 -0400 Subject: [PATCH 105/126] [PATCH] gfp_t: dma-mapping (arm) Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/arm/mm/consistent.c | 8 ++++---- include/asm-arm/dma-mapping.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm/mm/consistent.c b/arch/arm/mm/consistent.c index 26356ce4da54..82f4d5e27c54 100644 --- a/arch/arm/mm/consistent.c +++ b/arch/arm/mm/consistent.c @@ -75,7 +75,7 @@ static struct vm_region consistent_head = { }; static struct vm_region * -vm_region_alloc(struct vm_region *head, size_t size, int gfp) +vm_region_alloc(struct vm_region *head, size_t size, gfp_t gfp) { unsigned long addr = head->vm_start, end = head->vm_end - size; unsigned long flags; @@ -133,7 +133,7 @@ static struct vm_region *vm_region_find(struct vm_region *head, unsigned long ad #endif static void * -__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, int gfp, +__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, pgprot_t prot) { struct page *page; @@ -251,7 +251,7 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, int gfp, * virtual and bus address for that space. */ void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, int gfp) +dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp) { return __dma_alloc(dev, size, handle, gfp, pgprot_noncached(pgprot_kernel)); @@ -263,7 +263,7 @@ EXPORT_SYMBOL(dma_alloc_coherent); * dma_alloc_coherent above. */ void * -dma_alloc_writecombine(struct device *dev, size_t size, dma_addr_t *handle, int gfp) +dma_alloc_writecombine(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp) { return __dma_alloc(dev, size, handle, gfp, pgprot_writecombine(pgprot_kernel)); diff --git a/include/asm-arm/dma-mapping.h b/include/asm-arm/dma-mapping.h index d62ade4e4cbb..e3e8541ee63b 100644 --- a/include/asm-arm/dma-mapping.h +++ b/include/asm-arm/dma-mapping.h @@ -70,7 +70,7 @@ static inline int dma_mapping_error(dma_addr_t dma_addr) * device-viewed address. */ extern void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, int gfp); +dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp); /** * dma_free_coherent - free memory allocated by dma_alloc_coherent @@ -117,7 +117,7 @@ int dma_mmap_coherent(struct device *dev, struct vm_area_struct *vma, * device-viewed address. */ extern void * -dma_alloc_writecombine(struct device *dev, size_t size, dma_addr_t *handle, int gfp); +dma_alloc_writecombine(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp); #define dma_free_writecombine(dev,size,cpu_addr,handle) \ dma_free_coherent(dev,size,cpu_addr,handle) From 06a544971fad0992fe8b92c5647538d573089dd4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:21:03 -0400 Subject: [PATCH 106/126] [PATCH] gfp_t: dma-mapping (ia64) ... and related annotations for amd64 - swiotlb code is shared, but prototypes are not. Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/ia64/hp/common/hwsw_iommu.c | 2 +- arch/ia64/hp/common/sba_iommu.c | 2 +- arch/ia64/lib/swiotlb.c | 2 +- arch/ia64/sn/pci/pci_dma.c | 2 +- include/asm-ia64/machvec.h | 2 +- include/asm-x86_64/swiotlb.h | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/ia64/hp/common/hwsw_iommu.c b/arch/ia64/hp/common/hwsw_iommu.c index 80f8ef013939..1ba02baf2f94 100644 --- a/arch/ia64/hp/common/hwsw_iommu.c +++ b/arch/ia64/hp/common/hwsw_iommu.c @@ -71,7 +71,7 @@ hwsw_init (void) } void * -hwsw_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, int flags) +hwsw_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flags) { if (use_swiotlb(dev)) return swiotlb_alloc_coherent(dev, size, dma_handle, flags); diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 11957598a8b9..21bffba78b6d 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1076,7 +1076,7 @@ void sba_unmap_single(struct device *dev, dma_addr_t iova, size_t size, int dir) * See Documentation/DMA-mapping.txt */ void * -sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, int flags) +sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flags) { struct ioc *ioc; void *addr; diff --git a/arch/ia64/lib/swiotlb.c b/arch/ia64/lib/swiotlb.c index a604efc7f6c9..3ebbb3c8ba36 100644 --- a/arch/ia64/lib/swiotlb.c +++ b/arch/ia64/lib/swiotlb.c @@ -314,7 +314,7 @@ sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir) void * swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, int flags) + dma_addr_t *dma_handle, gfp_t flags) { unsigned long dev_addr; void *ret; diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c index 0e4b9ad9ef02..75e6e874bebf 100644 --- a/arch/ia64/sn/pci/pci_dma.c +++ b/arch/ia64/sn/pci/pci_dma.c @@ -75,7 +75,7 @@ EXPORT_SYMBOL(sn_dma_set_mask); * more information. */ void *sn_dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t * dma_handle, int flags) + dma_addr_t * dma_handle, gfp_t flags) { void *cpuaddr; unsigned long phys_addr; diff --git a/include/asm-ia64/machvec.h b/include/asm-ia64/machvec.h index 79e89a7db566..a2f6ac5aef7d 100644 --- a/include/asm-ia64/machvec.h +++ b/include/asm-ia64/machvec.h @@ -37,7 +37,7 @@ typedef int ia64_mv_pci_legacy_write_t (struct pci_bus *, u16 port, u32 val, /* DMA-mapping interface: */ typedef void ia64_mv_dma_init (void); -typedef void *ia64_mv_dma_alloc_coherent (struct device *, size_t, dma_addr_t *, int); +typedef void *ia64_mv_dma_alloc_coherent (struct device *, size_t, dma_addr_t *, gfp_t); typedef void ia64_mv_dma_free_coherent (struct device *, size_t, void *, dma_addr_t); typedef dma_addr_t ia64_mv_dma_map_single (struct device *, void *, size_t, int); typedef void ia64_mv_dma_unmap_single (struct device *, dma_addr_t, size_t, int); diff --git a/include/asm-x86_64/swiotlb.h b/include/asm-x86_64/swiotlb.h index 36293061f4ed..7cbfd10ecc3c 100644 --- a/include/asm-x86_64/swiotlb.h +++ b/include/asm-x86_64/swiotlb.h @@ -27,7 +27,7 @@ extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction); extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr); extern void *swiotlb_alloc_coherent (struct device *hwdev, size_t size, - dma_addr_t *dma_handle, int flags); + dma_addr_t *dma_handle, gfp_t flags); extern void swiotlb_free_coherent (struct device *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle); From 55c5d74b3ac3a6b8bdde4e5fab4015eccd557d52 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:21:08 -0400 Subject: [PATCH 107/126] [PATCH] gfp_t: dma-mapping (alpha) Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/alpha/kernel/pci-noop.c | 2 +- include/asm-alpha/dma-mapping.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c index 582a3519fb28..9903e3a79102 100644 --- a/arch/alpha/kernel/pci-noop.c +++ b/arch/alpha/kernel/pci-noop.c @@ -154,7 +154,7 @@ pci_dma_supported(struct pci_dev *hwdev, dma_addr_t mask) void * dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int gfp) + dma_addr_t *dma_handle, gfp_t gfp) { void *ret; diff --git a/include/asm-alpha/dma-mapping.h b/include/asm-alpha/dma-mapping.h index c675f282d6ad..680f7ecbb28f 100644 --- a/include/asm-alpha/dma-mapping.h +++ b/include/asm-alpha/dma-mapping.h @@ -31,7 +31,7 @@ #else /* no PCI - no IOMMU. */ void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int gfp); + dma_addr_t *dma_handle, gfp_t gfp); int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction); From 43b7eae1f5c5424a49ea4a65e9b0e9d5d2fd9446 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:21:13 -0400 Subject: [PATCH 108/126] [PATCH] gfp_t: dma-mapping (cris) Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/asm-cris/dma-mapping.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-cris/dma-mapping.h b/include/asm-cris/dma-mapping.h index 0b5c3fdaefe1..8eff51349ae7 100644 --- a/include/asm-cris/dma-mapping.h +++ b/include/asm-cris/dma-mapping.h @@ -15,14 +15,14 @@ #ifdef CONFIG_PCI void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int flag); + dma_addr_t *dma_handle, gfp_t flag); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); #else static inline void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - int flag) + gfp_t flag) { BUG(); return NULL; From a5da7d3c6e8fcd7aaab6c4e1e9101ba333248ffb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:21:18 -0400 Subject: [PATCH 109/126] [PATCH] gfp_t: dma-mapping (frv) Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/frv/mb93090-mb00/pci-dma-nommu.c | 2 +- arch/frv/mb93090-mb00/pci-dma.c | 2 +- arch/frv/mm/dma-alloc.c | 2 +- include/asm-frv/dma-mapping.h | 2 +- include/asm-frv/pci.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/frv/mb93090-mb00/pci-dma-nommu.c b/arch/frv/mb93090-mb00/pci-dma-nommu.c index 819895cf0b9e..2082a9647f4f 100644 --- a/arch/frv/mb93090-mb00/pci-dma-nommu.c +++ b/arch/frv/mb93090-mb00/pci-dma-nommu.c @@ -33,7 +33,7 @@ struct dma_alloc_record { static DEFINE_SPINLOCK(dma_alloc_lock); static LIST_HEAD(dma_alloc_list); -void *dma_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, int gfp) +void *dma_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { struct dma_alloc_record *new; struct list_head *this = &dma_alloc_list; diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c index 27eb12066507..86fbdadc51b6 100644 --- a/arch/frv/mb93090-mb00/pci-dma.c +++ b/arch/frv/mb93090-mb00/pci-dma.c @@ -17,7 +17,7 @@ #include #include -void *dma_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, int gfp) +void *dma_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { void *ret; diff --git a/arch/frv/mm/dma-alloc.c b/arch/frv/mm/dma-alloc.c index 4b38d45435f6..cfc4f97490c6 100644 --- a/arch/frv/mm/dma-alloc.c +++ b/arch/frv/mm/dma-alloc.c @@ -81,7 +81,7 @@ static int map_page(unsigned long va, unsigned long pa, pgprot_t prot) * portions of the kernel with single large page TLB entries, and * still get unique uncached pages for consistent DMA. */ -void *consistent_alloc(int gfp, size_t size, dma_addr_t *dma_handle) +void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *dma_handle) { struct vm_struct *area; unsigned long page, va, pa; diff --git a/include/asm-frv/dma-mapping.h b/include/asm-frv/dma-mapping.h index 0206ab35eae0..5003e017fd1e 100644 --- a/include/asm-frv/dma-mapping.h +++ b/include/asm-frv/dma-mapping.h @@ -13,7 +13,7 @@ extern unsigned long __nongprelbss dma_coherent_mem_start; extern unsigned long __nongprelbss dma_coherent_mem_end; -void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, int gfp); +void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); /* diff --git a/include/asm-frv/pci.h b/include/asm-frv/pci.h index b4efe5e3591a..1168451c275f 100644 --- a/include/asm-frv/pci.h +++ b/include/asm-frv/pci.h @@ -32,7 +32,7 @@ extern void pcibios_set_master(struct pci_dev *dev); extern void pcibios_penalize_isa_irq(int irq); #ifdef CONFIG_MMU -extern void *consistent_alloc(int gfp, size_t size, dma_addr_t *dma_handle); +extern void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *dma_handle); extern void consistent_free(void *vaddr); extern void consistent_sync(void *vaddr, size_t size, int direction); extern void consistent_sync_page(struct page *page, unsigned long offset, From 185a8ff52875d8db31b9346ab186f75baa616dee Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:21:23 -0400 Subject: [PATCH 110/126] [PATCH] gfp_t: dma-mapping (mips) Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/mips/mm/dma-coherent.c | 4 ++-- arch/mips/mm/dma-ip27.c | 4 ++-- arch/mips/mm/dma-ip32.c | 4 ++-- arch/mips/mm/dma-noncoherent.c | 4 ++-- include/asm-mips/dma-mapping.h | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/mips/mm/dma-coherent.c b/arch/mips/mm/dma-coherent.c index 97a50d38c98f..a617f8c327e8 100644 --- a/arch/mips/mm/dma-coherent.c +++ b/arch/mips/mm/dma-coherent.c @@ -18,7 +18,7 @@ #include void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t * dma_handle, int gfp) + dma_addr_t * dma_handle, gfp_t gfp) { void *ret; /* ignore region specifiers */ @@ -39,7 +39,7 @@ void *dma_alloc_noncoherent(struct device *dev, size_t size, EXPORT_SYMBOL(dma_alloc_noncoherent); void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t * dma_handle, int gfp) + dma_addr_t * dma_handle, gfp_t gfp) __attribute__((alias("dma_alloc_noncoherent"))); EXPORT_SYMBOL(dma_alloc_coherent); diff --git a/arch/mips/mm/dma-ip27.c b/arch/mips/mm/dma-ip27.c index aa7c94b5d781..8da19fd22ac6 100644 --- a/arch/mips/mm/dma-ip27.c +++ b/arch/mips/mm/dma-ip27.c @@ -22,7 +22,7 @@ pdev_to_baddr(to_pci_dev(dev), (addr)) void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t * dma_handle, int gfp) + dma_addr_t * dma_handle, gfp_t gfp) { void *ret; @@ -44,7 +44,7 @@ void *dma_alloc_noncoherent(struct device *dev, size_t size, EXPORT_SYMBOL(dma_alloc_noncoherent); void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t * dma_handle, int gfp) + dma_addr_t * dma_handle, gfp_t gfp) __attribute__((alias("dma_alloc_noncoherent"))); EXPORT_SYMBOL(dma_alloc_coherent); diff --git a/arch/mips/mm/dma-ip32.c b/arch/mips/mm/dma-ip32.c index 2cbe196c35fb..a7e3072ff78d 100644 --- a/arch/mips/mm/dma-ip32.c +++ b/arch/mips/mm/dma-ip32.c @@ -37,7 +37,7 @@ #define RAM_OFFSET_MASK 0x3fffffff void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t * dma_handle, int gfp) + dma_addr_t * dma_handle, gfp_t gfp) { void *ret; /* ignore region specifiers */ @@ -61,7 +61,7 @@ void *dma_alloc_noncoherent(struct device *dev, size_t size, EXPORT_SYMBOL(dma_alloc_noncoherent); void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t * dma_handle, int gfp) + dma_addr_t * dma_handle, gfp_t gfp) { void *ret; diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c index 59e54f12212e..4ce02028a292 100644 --- a/arch/mips/mm/dma-noncoherent.c +++ b/arch/mips/mm/dma-noncoherent.c @@ -24,7 +24,7 @@ */ void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t * dma_handle, int gfp) + dma_addr_t * dma_handle, gfp_t gfp) { void *ret; /* ignore region specifiers */ @@ -45,7 +45,7 @@ void *dma_alloc_noncoherent(struct device *dev, size_t size, EXPORT_SYMBOL(dma_alloc_noncoherent); void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t * dma_handle, int gfp) + dma_addr_t * dma_handle, gfp_t gfp) { void *ret; diff --git a/include/asm-mips/dma-mapping.h b/include/asm-mips/dma-mapping.h index af28dc88930b..43288634c38a 100644 --- a/include/asm-mips/dma-mapping.h +++ b/include/asm-mips/dma-mapping.h @@ -5,13 +5,13 @@ #include void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int flag); + dma_addr_t *dma_handle, gfp_t flag); void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int flag); + dma_addr_t *dma_handle, gfp_t flag); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); From 5c1fb41f40b7b6d819a617f52dbd66b6938ef362 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:21:28 -0400 Subject: [PATCH 111/126] [PATCH] gfp_t: dma-mapping (parisc) Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/parisc/kernel/pci-dma.c | 6 +++--- drivers/parisc/ccio-dma.c | 2 +- drivers/parisc/sba_iommu.c | 2 +- include/asm-parisc/dma-mapping.h | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index 368cc095c99f..844c2877a2e3 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -349,7 +349,7 @@ pcxl_dma_init(void) __initcall(pcxl_dma_init); -static void * pa11_dma_alloc_consistent (struct device *dev, size_t size, dma_addr_t *dma_handle, int flag) +static void * pa11_dma_alloc_consistent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { unsigned long vaddr; unsigned long paddr; @@ -502,13 +502,13 @@ struct hppa_dma_ops pcxl_dma_ops = { }; static void *fail_alloc_consistent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int flag) + dma_addr_t *dma_handle, gfp_t flag) { return NULL; } static void *pa11_dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int flag) + dma_addr_t *dma_handle, gfp_t flag) { void *addr = NULL; diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index 0e98a9d9834c..a3bd91a61827 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -836,7 +836,7 @@ ccio_unmap_single(struct device *dev, dma_addr_t iova, size_t size, * This function implements the pci_alloc_consistent function. */ static void * -ccio_alloc_consistent(struct device *dev, size_t size, dma_addr_t *dma_handle, int flag) +ccio_alloc_consistent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { void *ret; #if 0 diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index 82ea68b55df4..bd8b3e5a5cd7 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -986,7 +986,7 @@ sba_unmap_single(struct device *dev, dma_addr_t iova, size_t size, * See Documentation/DMA-mapping.txt */ static void *sba_alloc_consistent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, int gfp) + dma_addr_t *dma_handle, gfp_t gfp) { void *ret; diff --git a/include/asm-parisc/dma-mapping.h b/include/asm-parisc/dma-mapping.h index 4db84f969e9e..74d4ac6f2151 100644 --- a/include/asm-parisc/dma-mapping.h +++ b/include/asm-parisc/dma-mapping.h @@ -9,8 +9,8 @@ /* See Documentation/DMA-mapping.txt */ struct hppa_dma_ops { int (*dma_supported)(struct device *dev, u64 mask); - void *(*alloc_consistent)(struct device *dev, size_t size, dma_addr_t *iova, int flag); - void *(*alloc_noncoherent)(struct device *dev, size_t size, dma_addr_t *iova, int flag); + void *(*alloc_consistent)(struct device *dev, size_t size, dma_addr_t *iova, gfp_t flag); + void *(*alloc_noncoherent)(struct device *dev, size_t size, dma_addr_t *iova, gfp_t flag); void (*free_consistent)(struct device *dev, size_t size, void *vaddr, dma_addr_t iova); dma_addr_t (*map_single)(struct device *dev, void *addr, size_t size, enum dma_data_direction direction); void (*unmap_single)(struct device *dev, dma_addr_t iova, size_t size, enum dma_data_direction direction); @@ -49,14 +49,14 @@ extern struct hppa_dma_ops *hppa_dma_ops; static inline void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - int flag) + gfp_t flag) { return hppa_dma_ops->alloc_consistent(dev, size, dma_handle, flag); } static inline void * dma_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - int flag) + gfp_t flag) { return hppa_dma_ops->alloc_noncoherent(dev, size, dma_handle, flag); } From e82dd4d6472304495afa271b2f63b572868b23d9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:21:33 -0400 Subject: [PATCH 112/126] [PATCH] gfp_t: dma-mapping (ppc) Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/ppc/kernel/dma-mapping.c | 4 ++-- include/asm-ppc/dma-mapping.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/ppc/kernel/dma-mapping.c b/arch/ppc/kernel/dma-mapping.c index 8edee806dae7..0f710d2baec6 100644 --- a/arch/ppc/kernel/dma-mapping.c +++ b/arch/ppc/kernel/dma-mapping.c @@ -115,7 +115,7 @@ static struct vm_region consistent_head = { }; static struct vm_region * -vm_region_alloc(struct vm_region *head, size_t size, int gfp) +vm_region_alloc(struct vm_region *head, size_t size, gfp_t gfp) { unsigned long addr = head->vm_start, end = head->vm_end - size; unsigned long flags; @@ -173,7 +173,7 @@ static struct vm_region *vm_region_find(struct vm_region *head, unsigned long ad * virtual and bus address for that space. */ void * -__dma_alloc_coherent(size_t size, dma_addr_t *handle, int gfp) +__dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) { struct page *page; struct vm_region *c; diff --git a/include/asm-ppc/dma-mapping.h b/include/asm-ppc/dma-mapping.h index 061bfcac1bf1..6e9635114433 100644 --- a/include/asm-ppc/dma-mapping.h +++ b/include/asm-ppc/dma-mapping.h @@ -19,7 +19,7 @@ * allocate the space "normally" and use the cache management functions * to ensure it is consistent. */ -extern void *__dma_alloc_coherent(size_t size, dma_addr_t *handle, int gfp); +extern void *__dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp); extern void __dma_free_coherent(size_t size, void *vaddr); extern void __dma_sync(void *vaddr, size_t size, int direction); extern void __dma_sync_page(struct page *page, unsigned long offset, From 6dae2c2306684d9e76a04c22dc090380a9009f12 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:21:38 -0400 Subject: [PATCH 113/126] [PATCH] gfp_t: dma-mapping (sh) Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/sh/boards/renesas/rts7751r2d/mach.c | 2 +- arch/sh/cchips/voyagergx/consistent.c | 2 +- arch/sh/drivers/pci/dma-dreamcast.c | 2 +- arch/sh/mm/consistent.c | 2 +- include/asm-sh/dma-mapping.h | 4 ++-- include/asm-sh/machvec.h | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/sh/boards/renesas/rts7751r2d/mach.c b/arch/sh/boards/renesas/rts7751r2d/mach.c index 1efc18e786d5..610740512d56 100644 --- a/arch/sh/boards/renesas/rts7751r2d/mach.c +++ b/arch/sh/boards/renesas/rts7751r2d/mach.c @@ -23,7 +23,7 @@ extern void init_rts7751r2d_IRQ(void); extern void *rts7751r2d_ioremap(unsigned long, unsigned long); extern int rts7751r2d_irq_demux(int irq); -extern void *voyagergx_consistent_alloc(struct device *, size_t, dma_addr_t *, int); +extern void *voyagergx_consistent_alloc(struct device *, size_t, dma_addr_t *, gfp_t); extern int voyagergx_consistent_free(struct device *, size_t, void *, dma_addr_t); /* diff --git a/arch/sh/cchips/voyagergx/consistent.c b/arch/sh/cchips/voyagergx/consistent.c index 5b92585a38d2..3d9a02c093a3 100644 --- a/arch/sh/cchips/voyagergx/consistent.c +++ b/arch/sh/cchips/voyagergx/consistent.c @@ -31,7 +31,7 @@ static LIST_HEAD(voya_alloc_list); #define OHCI_SRAM_SIZE 0x10000 void *voyagergx_consistent_alloc(struct device *dev, size_t size, - dma_addr_t *handle, int flag) + dma_addr_t *handle, gfp_t flag) { struct list_head *list = &voya_alloc_list; struct voya_alloc_entry *entry; diff --git a/arch/sh/drivers/pci/dma-dreamcast.c b/arch/sh/drivers/pci/dma-dreamcast.c index 83de7ef4e7df..e12418bb1fa5 100644 --- a/arch/sh/drivers/pci/dma-dreamcast.c +++ b/arch/sh/drivers/pci/dma-dreamcast.c @@ -33,7 +33,7 @@ static int gapspci_dma_used = 0; void *dreamcast_consistent_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, int flag) + dma_addr_t *dma_handle, gfp_t flag) { unsigned long buf; diff --git a/arch/sh/mm/consistent.c b/arch/sh/mm/consistent.c index 1f7af0c73cf4..df3a9e452cc5 100644 --- a/arch/sh/mm/consistent.c +++ b/arch/sh/mm/consistent.c @@ -11,7 +11,7 @@ #include #include -void *consistent_alloc(int gfp, size_t size, dma_addr_t *handle) +void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle) { struct page *page, *end, *free; void *ret; diff --git a/include/asm-sh/dma-mapping.h b/include/asm-sh/dma-mapping.h index 80d164c1529e..d3fa5c2b889d 100644 --- a/include/asm-sh/dma-mapping.h +++ b/include/asm-sh/dma-mapping.h @@ -9,7 +9,7 @@ extern struct bus_type pci_bus_type; /* arch/sh/mm/consistent.c */ -extern void *consistent_alloc(int gfp, size_t size, dma_addr_t *handle); +extern void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle); extern void consistent_free(void *vaddr, size_t size); extern void consistent_sync(void *vaddr, size_t size, int direction); @@ -26,7 +26,7 @@ static inline int dma_set_mask(struct device *dev, u64 mask) } static inline void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int flag) + dma_addr_t *dma_handle, gfp_t flag) { if (sh_mv.mv_consistent_alloc) { void *ret; diff --git a/include/asm-sh/machvec.h b/include/asm-sh/machvec.h index 5771f4baa478..3f18aa180516 100644 --- a/include/asm-sh/machvec.h +++ b/include/asm-sh/machvec.h @@ -64,7 +64,7 @@ struct sh_machine_vector void (*mv_heartbeat)(void); - void *(*mv_consistent_alloc)(struct device *, size_t, dma_addr_t *, int); + void *(*mv_consistent_alloc)(struct device *, size_t, dma_addr_t *, gfp_t); int (*mv_consistent_free)(struct device *, size_t, void *, dma_addr_t); }; From f80aabb03a33702d934fbc3c02fd96471816d82e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:21:43 -0400 Subject: [PATCH 114/126] [PATCH] gfp_t: dma-mapping (amd64) Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-gart.c | 4 ++-- arch/x86_64/kernel/pci-nommu.c | 2 +- include/asm-x86_64/dma-mapping.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index cf0a0315d586..88be97c96987 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -187,7 +187,7 @@ static void flush_gart(struct device *dev) /* Allocate DMA memory on node near device */ noinline -static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order) +static void *dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) { struct page *page; int node; @@ -204,7 +204,7 @@ static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order) */ void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - unsigned gfp) + gfp_t gfp) { void *memory; unsigned long dma_mask = 0; diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index 67d90b89af0b..5a981dca87ff 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -24,7 +24,7 @@ EXPORT_SYMBOL(iommu_sac_force); */ void *dma_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, unsigned gfp) + dma_addr_t *dma_handle, gfp_t gfp) { void *ret; u64 mask; diff --git a/include/asm-x86_64/dma-mapping.h b/include/asm-x86_64/dma-mapping.h index e784fdc524f1..54a380efed41 100644 --- a/include/asm-x86_64/dma-mapping.h +++ b/include/asm-x86_64/dma-mapping.h @@ -17,7 +17,7 @@ extern dma_addr_t bad_dma_address; (swiotlb ? swiotlb_dma_mapping_error(x) : ((x) == bad_dma_address)) void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - unsigned gfp); + gfp_t gfp); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); From 5fb5cbed6e5ba4cbaf7284a23d42eb878bb7da24 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:21:48 -0400 Subject: [PATCH 115/126] [PATCH] gfp_t: dma-mapping (xtensa) Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/xtensa/kernel/pci-dma.c | 2 +- include/asm-xtensa/dma-mapping.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c index 84fde258cf85..1ff82268e8ea 100644 --- a/arch/xtensa/kernel/pci-dma.c +++ b/arch/xtensa/kernel/pci-dma.c @@ -29,7 +29,7 @@ */ void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, int gfp) +dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp) { void *ret; diff --git a/include/asm-xtensa/dma-mapping.h b/include/asm-xtensa/dma-mapping.h index e86a206f1209..c425f10d086a 100644 --- a/include/asm-xtensa/dma-mapping.h +++ b/include/asm-xtensa/dma-mapping.h @@ -28,7 +28,7 @@ extern void consistent_sync(void*, size_t, int); #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int flag); + dma_addr_t *dma_handle, gfp_t flag); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); From 970a9e73f9036ef89d46b8240f99463f6d244c1d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:21:53 -0400 Subject: [PATCH 116/126] [PATCH] gfp_t: dma-mapping (simple cases) Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/asm-generic/dma-mapping-broken.h | 2 +- include/asm-m32r/dma-mapping.h | 2 +- include/asm-sh64/dma-mapping.h | 2 +- include/asm-sparc/dma-mapping.h | 2 +- include/asm-sparc64/dma-mapping.h | 2 +- include/asm-um/dma-mapping.h | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/asm-generic/dma-mapping-broken.h b/include/asm-generic/dma-mapping-broken.h index fd9de9502dff..a7f1a55ce6b0 100644 --- a/include/asm-generic/dma-mapping-broken.h +++ b/include/asm-generic/dma-mapping-broken.h @@ -6,7 +6,7 @@ static inline void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - int flag) + gfp_t flag) { BUG(); return NULL; diff --git a/include/asm-m32r/dma-mapping.h b/include/asm-m32r/dma-mapping.h index 3a2db28834b6..a7fa0302bda7 100644 --- a/include/asm-m32r/dma-mapping.h +++ b/include/asm-m32r/dma-mapping.h @@ -8,7 +8,7 @@ static inline void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - int flag) + gfp_t flag) { return (void *)NULL; } diff --git a/include/asm-sh64/dma-mapping.h b/include/asm-sh64/dma-mapping.h index b8d26fe677f4..cc9a2e86f5b4 100644 --- a/include/asm-sh64/dma-mapping.h +++ b/include/asm-sh64/dma-mapping.h @@ -25,7 +25,7 @@ static inline int dma_set_mask(struct device *dev, u64 mask) } static inline void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int flag) + dma_addr_t *dma_handle, gfp_t flag) { return consistent_alloc(NULL, size, dma_handle); } diff --git a/include/asm-sparc/dma-mapping.h b/include/asm-sparc/dma-mapping.h index 2dc5bb8effa6..d7c3b0f0a901 100644 --- a/include/asm-sparc/dma-mapping.h +++ b/include/asm-sparc/dma-mapping.h @@ -8,7 +8,7 @@ #else static inline void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int flag) + dma_addr_t *dma_handle, gfp_t flag) { BUG(); return NULL; diff --git a/include/asm-sparc64/dma-mapping.h b/include/asm-sparc64/dma-mapping.h index 1c5da41653a4..c7d5804ba76d 100644 --- a/include/asm-sparc64/dma-mapping.h +++ b/include/asm-sparc64/dma-mapping.h @@ -10,7 +10,7 @@ struct device; static inline void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int flag) + dma_addr_t *dma_handle, gfp_t flag) { BUG(); return NULL; diff --git a/include/asm-um/dma-mapping.h b/include/asm-um/dma-mapping.h index 13e6291f7151..babd29895114 100644 --- a/include/asm-um/dma-mapping.h +++ b/include/asm-um/dma-mapping.h @@ -19,7 +19,7 @@ dma_set_mask(struct device *dev, u64 dma_mask) static inline void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - int flag) + gfp_t flag) { BUG(); return((void *) 0); From 55016f10e31bb15b85d8c500f979dfdceb37d548 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:21:58 -0400 Subject: [PATCH 117/126] [PATCH] gfp_t: drivers/usb Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- drivers/usb/core/buffer.c | 2 +- drivers/usb/core/hcd.c | 2 +- drivers/usb/core/hcd.h | 8 ++++---- drivers/usb/core/message.c | 2 +- drivers/usb/core/urb.c | 4 ++-- drivers/usb/core/usb.c | 2 +- drivers/usb/gadget/dummy_hcd.c | 8 ++++---- drivers/usb/gadget/ether.c | 22 +++++++++++----------- drivers/usb/gadget/goku_udc.c | 6 +++--- drivers/usb/gadget/lh7a40x_udc.c | 12 ++++++------ drivers/usb/gadget/net2280.c | 6 +++--- drivers/usb/gadget/omap_udc.c | 6 +++--- drivers/usb/gadget/pxa2xx_udc.c | 6 +++--- drivers/usb/gadget/serial.c | 16 ++++++++-------- drivers/usb/gadget/zero.c | 8 ++++---- drivers/usb/host/ehci-hcd.c | 2 +- drivers/usb/host/ehci-mem.c | 6 +++--- drivers/usb/host/ehci-q.c | 6 +++--- drivers/usb/host/ehci-sched.c | 14 +++++++------- drivers/usb/host/isp116x-hcd.c | 2 +- drivers/usb/host/ohci-hcd.c | 2 +- drivers/usb/host/ohci-mem.c | 4 ++-- drivers/usb/host/sl811-hcd.c | 2 +- drivers/usb/host/uhci-q.c | 2 +- drivers/usb/misc/uss720.c | 6 +++--- drivers/usb/net/asix.c | 2 +- drivers/usb/net/gl620a.c | 2 +- drivers/usb/net/kaweth.c | 6 +++--- drivers/usb/net/net1080.c | 2 +- drivers/usb/net/rndis_host.c | 2 +- drivers/usb/net/usbnet.c | 2 +- drivers/usb/net/usbnet.h | 2 +- drivers/usb/net/zaurus.c | 2 +- drivers/usb/net/zd1201.c | 2 +- include/linux/usb.h | 8 ++++---- include/linux/usb_gadget.h | 12 ++++++------ sound/usb/usbmidi.c | 2 +- 37 files changed, 100 insertions(+), 100 deletions(-) diff --git a/drivers/usb/core/buffer.c b/drivers/usb/core/buffer.c index fc15b4acc8af..57e800ac3cee 100644 --- a/drivers/usb/core/buffer.c +++ b/drivers/usb/core/buffer.c @@ -106,7 +106,7 @@ void hcd_buffer_destroy (struct usb_hcd *hcd) void *hcd_buffer_alloc ( struct usb_bus *bus, size_t size, - unsigned mem_flags, + gfp_t mem_flags, dma_addr_t *dma ) { diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 1017a97a418b..ff19d64041b5 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1112,7 +1112,7 @@ static void urb_unlink (struct urb *urb) * expects usb_submit_urb() to have sanity checked and conditioned all * inputs in the urb */ -static int hcd_submit_urb (struct urb *urb, unsigned mem_flags) +static int hcd_submit_urb (struct urb *urb, gfp_t mem_flags) { int status; struct usb_hcd *hcd = urb->dev->bus->hcpriv; diff --git a/drivers/usb/core/hcd.h b/drivers/usb/core/hcd.h index ac451fa7e4d2..1f1ed6211af8 100644 --- a/drivers/usb/core/hcd.h +++ b/drivers/usb/core/hcd.h @@ -142,12 +142,12 @@ struct hcd_timeout { /* timeouts we allocate */ struct usb_operations { int (*get_frame_number) (struct usb_device *usb_dev); - int (*submit_urb) (struct urb *urb, unsigned mem_flags); + int (*submit_urb) (struct urb *urb, gfp_t mem_flags); int (*unlink_urb) (struct urb *urb, int status); /* allocate dma-consistent buffer for URB_DMA_NOMAPPING */ void *(*buffer_alloc)(struct usb_bus *bus, size_t size, - unsigned mem_flags, + gfp_t mem_flags, dma_addr_t *dma); void (*buffer_free)(struct usb_bus *bus, size_t size, void *addr, dma_addr_t dma); @@ -200,7 +200,7 @@ struct hc_driver { int (*urb_enqueue) (struct usb_hcd *hcd, struct usb_host_endpoint *ep, struct urb *urb, - unsigned mem_flags); + gfp_t mem_flags); int (*urb_dequeue) (struct usb_hcd *hcd, struct urb *urb); /* hw synch, freeing endpoint resources that urb_dequeue can't */ @@ -247,7 +247,7 @@ int hcd_buffer_create (struct usb_hcd *hcd); void hcd_buffer_destroy (struct usb_hcd *hcd); void *hcd_buffer_alloc (struct usb_bus *bus, size_t size, - unsigned mem_flags, dma_addr_t *dma); + gfp_t mem_flags, dma_addr_t *dma); void hcd_buffer_free (struct usb_bus *bus, size_t size, void *addr, dma_addr_t dma); diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index f1fb67fe22a8..f9a81e84dbdf 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -321,7 +321,7 @@ int usb_sg_init ( struct scatterlist *sg, int nents, size_t length, - unsigned mem_flags + gfp_t mem_flags ) { int i; diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c index c846fefb7386..b32898e0a27d 100644 --- a/drivers/usb/core/urb.c +++ b/drivers/usb/core/urb.c @@ -60,7 +60,7 @@ void usb_init_urb(struct urb *urb) * * The driver must call usb_free_urb() when it is finished with the urb. */ -struct urb *usb_alloc_urb(int iso_packets, unsigned mem_flags) +struct urb *usb_alloc_urb(int iso_packets, gfp_t mem_flags) { struct urb *urb; @@ -224,7 +224,7 @@ struct urb * usb_get_urb(struct urb *urb) * GFP_NOIO, unless b) or c) apply * */ -int usb_submit_urb(struct urb *urb, unsigned mem_flags) +int usb_submit_urb(struct urb *urb, gfp_t mem_flags) { int pipe, temp, max; struct usb_device *dev; diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 7d131509e419..4c57f3f649ed 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -1147,7 +1147,7 @@ int __usb_get_extra_descriptor(char *buffer, unsigned size, void *usb_buffer_alloc ( struct usb_device *dev, size_t size, - unsigned mem_flags, + gfp_t mem_flags, dma_addr_t *dma ) { diff --git a/drivers/usb/gadget/dummy_hcd.c b/drivers/usb/gadget/dummy_hcd.c index 583db7c38cf1..8d9d8ee89554 100644 --- a/drivers/usb/gadget/dummy_hcd.c +++ b/drivers/usb/gadget/dummy_hcd.c @@ -470,7 +470,7 @@ static int dummy_disable (struct usb_ep *_ep) } static struct usb_request * -dummy_alloc_request (struct usb_ep *_ep, unsigned mem_flags) +dummy_alloc_request (struct usb_ep *_ep, gfp_t mem_flags) { struct dummy_ep *ep; struct dummy_request *req; @@ -507,7 +507,7 @@ dummy_alloc_buffer ( struct usb_ep *_ep, unsigned bytes, dma_addr_t *dma, - unsigned mem_flags + gfp_t mem_flags ) { char *retval; struct dummy_ep *ep; @@ -541,7 +541,7 @@ fifo_complete (struct usb_ep *ep, struct usb_request *req) static int dummy_queue (struct usb_ep *_ep, struct usb_request *_req, - unsigned mem_flags) + gfp_t mem_flags) { struct dummy_ep *ep; struct dummy_request *req; @@ -999,7 +999,7 @@ static int dummy_urb_enqueue ( struct usb_hcd *hcd, struct usb_host_endpoint *ep, struct urb *urb, - unsigned mem_flags + gfp_t mem_flags ) { struct dummy *dum; struct urbp *urbp; diff --git a/drivers/usb/gadget/ether.c b/drivers/usb/gadget/ether.c index 49459e33e952..f1024e804d5c 100644 --- a/drivers/usb/gadget/ether.c +++ b/drivers/usb/gadget/ether.c @@ -945,11 +945,11 @@ config_buf (enum usb_device_speed speed, /*-------------------------------------------------------------------------*/ -static void eth_start (struct eth_dev *dev, unsigned gfp_flags); -static int alloc_requests (struct eth_dev *dev, unsigned n, unsigned gfp_flags); +static void eth_start (struct eth_dev *dev, gfp_t gfp_flags); +static int alloc_requests (struct eth_dev *dev, unsigned n, gfp_t gfp_flags); static int -set_ether_config (struct eth_dev *dev, unsigned gfp_flags) +set_ether_config (struct eth_dev *dev, gfp_t gfp_flags) { int result = 0; struct usb_gadget *gadget = dev->gadget; @@ -1081,7 +1081,7 @@ static void eth_reset_config (struct eth_dev *dev) * that returns config descriptors, and altsetting code. */ static int -eth_set_config (struct eth_dev *dev, unsigned number, unsigned gfp_flags) +eth_set_config (struct eth_dev *dev, unsigned number, gfp_t gfp_flags) { int result = 0; struct usb_gadget *gadget = dev->gadget; @@ -1598,7 +1598,7 @@ static void defer_kevent (struct eth_dev *dev, int flag) static void rx_complete (struct usb_ep *ep, struct usb_request *req); static int -rx_submit (struct eth_dev *dev, struct usb_request *req, unsigned gfp_flags) +rx_submit (struct eth_dev *dev, struct usb_request *req, gfp_t gfp_flags) { struct sk_buff *skb; int retval = -ENOMEM; @@ -1724,7 +1724,7 @@ static void rx_complete (struct usb_ep *ep, struct usb_request *req) } static int prealloc (struct list_head *list, struct usb_ep *ep, - unsigned n, unsigned gfp_flags) + unsigned n, gfp_t gfp_flags) { unsigned i; struct usb_request *req; @@ -1763,7 +1763,7 @@ static int prealloc (struct list_head *list, struct usb_ep *ep, return 0; } -static int alloc_requests (struct eth_dev *dev, unsigned n, unsigned gfp_flags) +static int alloc_requests (struct eth_dev *dev, unsigned n, gfp_t gfp_flags) { int status; @@ -1779,7 +1779,7 @@ static int alloc_requests (struct eth_dev *dev, unsigned n, unsigned gfp_flags) return status; } -static void rx_fill (struct eth_dev *dev, unsigned gfp_flags) +static void rx_fill (struct eth_dev *dev, gfp_t gfp_flags) { struct usb_request *req; unsigned long flags; @@ -1962,7 +1962,7 @@ static int eth_start_xmit (struct sk_buff *skb, struct net_device *net) * normally just one notification will be queued. */ -static struct usb_request *eth_req_alloc (struct usb_ep *, unsigned, unsigned); +static struct usb_request *eth_req_alloc (struct usb_ep *, unsigned, gfp_t); static void eth_req_free (struct usb_ep *ep, struct usb_request *req); static void @@ -2024,7 +2024,7 @@ static int rndis_control_ack (struct net_device *net) #endif /* RNDIS */ -static void eth_start (struct eth_dev *dev, unsigned gfp_flags) +static void eth_start (struct eth_dev *dev, gfp_t gfp_flags) { DEBUG (dev, "%s\n", __FUNCTION__); @@ -2092,7 +2092,7 @@ static int eth_stop (struct net_device *net) /*-------------------------------------------------------------------------*/ static struct usb_request * -eth_req_alloc (struct usb_ep *ep, unsigned size, unsigned gfp_flags) +eth_req_alloc (struct usb_ep *ep, unsigned size, gfp_t gfp_flags) { struct usb_request *req; diff --git a/drivers/usb/gadget/goku_udc.c b/drivers/usb/gadget/goku_udc.c index eaab26f4ed37..b0f3cd63e3b9 100644 --- a/drivers/usb/gadget/goku_udc.c +++ b/drivers/usb/gadget/goku_udc.c @@ -269,7 +269,7 @@ static int goku_ep_disable(struct usb_ep *_ep) /*-------------------------------------------------------------------------*/ static struct usb_request * -goku_alloc_request(struct usb_ep *_ep, unsigned gfp_flags) +goku_alloc_request(struct usb_ep *_ep, gfp_t gfp_flags) { struct goku_request *req; @@ -327,7 +327,7 @@ goku_free_request(struct usb_ep *_ep, struct usb_request *_req) */ static void * goku_alloc_buffer(struct usb_ep *_ep, unsigned bytes, - dma_addr_t *dma, unsigned gfp_flags) + dma_addr_t *dma, gfp_t gfp_flags) { void *retval; struct goku_ep *ep; @@ -789,7 +789,7 @@ static void abort_dma(struct goku_ep *ep, int status) /*-------------------------------------------------------------------------*/ static int -goku_queue(struct usb_ep *_ep, struct usb_request *_req, unsigned gfp_flags) +goku_queue(struct usb_ep *_ep, struct usb_request *_req, gfp_t gfp_flags) { struct goku_request *req; struct goku_ep *ep; diff --git a/drivers/usb/gadget/lh7a40x_udc.c b/drivers/usb/gadget/lh7a40x_udc.c index 4842577789c9..012d1e5f1524 100644 --- a/drivers/usb/gadget/lh7a40x_udc.c +++ b/drivers/usb/gadget/lh7a40x_udc.c @@ -71,13 +71,13 @@ static char *state_names[] = { static int lh7a40x_ep_enable(struct usb_ep *ep, const struct usb_endpoint_descriptor *); static int lh7a40x_ep_disable(struct usb_ep *ep); -static struct usb_request *lh7a40x_alloc_request(struct usb_ep *ep, int); +static struct usb_request *lh7a40x_alloc_request(struct usb_ep *ep, gfp_t); static void lh7a40x_free_request(struct usb_ep *ep, struct usb_request *); static void *lh7a40x_alloc_buffer(struct usb_ep *ep, unsigned, dma_addr_t *, - int); + gfp_t); static void lh7a40x_free_buffer(struct usb_ep *ep, void *, dma_addr_t, unsigned); -static int lh7a40x_queue(struct usb_ep *ep, struct usb_request *, int); +static int lh7a40x_queue(struct usb_ep *ep, struct usb_request *, gfp_t); static int lh7a40x_dequeue(struct usb_ep *ep, struct usb_request *); static int lh7a40x_set_halt(struct usb_ep *ep, int); static int lh7a40x_fifo_status(struct usb_ep *ep); @@ -1106,7 +1106,7 @@ static int lh7a40x_ep_disable(struct usb_ep *_ep) } static struct usb_request *lh7a40x_alloc_request(struct usb_ep *ep, - unsigned gfp_flags) + gfp_t gfp_flags) { struct lh7a40x_request *req; @@ -1134,7 +1134,7 @@ static void lh7a40x_free_request(struct usb_ep *ep, struct usb_request *_req) } static void *lh7a40x_alloc_buffer(struct usb_ep *ep, unsigned bytes, - dma_addr_t * dma, unsigned gfp_flags) + dma_addr_t * dma, gfp_t gfp_flags) { char *retval; @@ -1158,7 +1158,7 @@ static void lh7a40x_free_buffer(struct usb_ep *ep, void *buf, dma_addr_t dma, * NOTE: Sets INDEX register */ static int lh7a40x_queue(struct usb_ep *_ep, struct usb_request *_req, - unsigned gfp_flags) + gfp_t gfp_flags) { struct lh7a40x_request *req; struct lh7a40x_ep *ep; diff --git a/drivers/usb/gadget/net2280.c b/drivers/usb/gadget/net2280.c index 477fab2e74d1..c32e1f7476da 100644 --- a/drivers/usb/gadget/net2280.c +++ b/drivers/usb/gadget/net2280.c @@ -376,7 +376,7 @@ static int net2280_disable (struct usb_ep *_ep) /*-------------------------------------------------------------------------*/ static struct usb_request * -net2280_alloc_request (struct usb_ep *_ep, unsigned gfp_flags) +net2280_alloc_request (struct usb_ep *_ep, gfp_t gfp_flags) { struct net2280_ep *ep; struct net2280_request *req; @@ -463,7 +463,7 @@ net2280_alloc_buffer ( struct usb_ep *_ep, unsigned bytes, dma_addr_t *dma, - unsigned gfp_flags + gfp_t gfp_flags ) { void *retval; @@ -897,7 +897,7 @@ done (struct net2280_ep *ep, struct net2280_request *req, int status) /*-------------------------------------------------------------------------*/ static int -net2280_queue (struct usb_ep *_ep, struct usb_request *_req, unsigned gfp_flags) +net2280_queue (struct usb_ep *_ep, struct usb_request *_req, gfp_t gfp_flags) { struct net2280_request *req; struct net2280_ep *ep; diff --git a/drivers/usb/gadget/omap_udc.c b/drivers/usb/gadget/omap_udc.c index ff5533e69560..287c5900fb13 100644 --- a/drivers/usb/gadget/omap_udc.c +++ b/drivers/usb/gadget/omap_udc.c @@ -269,7 +269,7 @@ static int omap_ep_disable(struct usb_ep *_ep) /*-------------------------------------------------------------------------*/ static struct usb_request * -omap_alloc_request(struct usb_ep *ep, unsigned gfp_flags) +omap_alloc_request(struct usb_ep *ep, gfp_t gfp_flags) { struct omap_req *req; @@ -298,7 +298,7 @@ omap_alloc_buffer( struct usb_ep *_ep, unsigned bytes, dma_addr_t *dma, - unsigned gfp_flags + gfp_t gfp_flags ) { void *retval; @@ -937,7 +937,7 @@ static void dma_channel_release(struct omap_ep *ep) /*-------------------------------------------------------------------------*/ static int -omap_ep_queue(struct usb_ep *_ep, struct usb_request *_req, unsigned gfp_flags) +omap_ep_queue(struct usb_ep *_ep, struct usb_request *_req, gfp_t gfp_flags) { struct omap_ep *ep = container_of(_ep, struct omap_ep, ep); struct omap_req *req = container_of(_req, struct omap_req, req); diff --git a/drivers/usb/gadget/pxa2xx_udc.c b/drivers/usb/gadget/pxa2xx_udc.c index 73f8c9404156..6e545393cfff 100644 --- a/drivers/usb/gadget/pxa2xx_udc.c +++ b/drivers/usb/gadget/pxa2xx_udc.c @@ -332,7 +332,7 @@ static int pxa2xx_ep_disable (struct usb_ep *_ep) * pxa2xx_ep_alloc_request - allocate a request data structure */ static struct usb_request * -pxa2xx_ep_alloc_request (struct usb_ep *_ep, unsigned gfp_flags) +pxa2xx_ep_alloc_request (struct usb_ep *_ep, gfp_t gfp_flags) { struct pxa2xx_request *req; @@ -367,7 +367,7 @@ pxa2xx_ep_free_request (struct usb_ep *_ep, struct usb_request *_req) */ static void * pxa2xx_ep_alloc_buffer(struct usb_ep *_ep, unsigned bytes, - dma_addr_t *dma, unsigned gfp_flags) + dma_addr_t *dma, gfp_t gfp_flags) { char *retval; @@ -874,7 +874,7 @@ static void dma_nodesc_handler(int dmach, void *_ep, struct pt_regs *r) /*-------------------------------------------------------------------------*/ static int -pxa2xx_ep_queue(struct usb_ep *_ep, struct usb_request *_req, unsigned gfp_flags) +pxa2xx_ep_queue(struct usb_ep *_ep, struct usb_request *_req, gfp_t gfp_flags) { struct pxa2xx_request *req; struct pxa2xx_ep *ep; diff --git a/drivers/usb/gadget/serial.c b/drivers/usb/gadget/serial.c index c925d9222f53..b35ac6d334f8 100644 --- a/drivers/usb/gadget/serial.c +++ b/drivers/usb/gadget/serial.c @@ -300,18 +300,18 @@ static int gs_build_config_buf(u8 *buf, enum usb_device_speed speed, u8 type, unsigned int index, int is_otg); static struct usb_request *gs_alloc_req(struct usb_ep *ep, unsigned int len, - unsigned kmalloc_flags); + gfp_t kmalloc_flags); static void gs_free_req(struct usb_ep *ep, struct usb_request *req); static struct gs_req_entry *gs_alloc_req_entry(struct usb_ep *ep, unsigned len, - unsigned kmalloc_flags); + gfp_t kmalloc_flags); static void gs_free_req_entry(struct usb_ep *ep, struct gs_req_entry *req); -static int gs_alloc_ports(struct gs_dev *dev, unsigned kmalloc_flags); +static int gs_alloc_ports(struct gs_dev *dev, gfp_t kmalloc_flags); static void gs_free_ports(struct gs_dev *dev); /* circular buffer */ -static struct gs_buf *gs_buf_alloc(unsigned int size, unsigned kmalloc_flags); +static struct gs_buf *gs_buf_alloc(unsigned int size, gfp_t kmalloc_flags); static void gs_buf_free(struct gs_buf *gb); static void gs_buf_clear(struct gs_buf *gb); static unsigned int gs_buf_data_avail(struct gs_buf *gb); @@ -2091,7 +2091,7 @@ static int gs_build_config_buf(u8 *buf, enum usb_device_speed speed, * usb_request or NULL if there is an error. */ static struct usb_request * -gs_alloc_req(struct usb_ep *ep, unsigned int len, unsigned kmalloc_flags) +gs_alloc_req(struct usb_ep *ep, unsigned int len, gfp_t kmalloc_flags) { struct usb_request *req; @@ -2132,7 +2132,7 @@ static void gs_free_req(struct usb_ep *ep, struct usb_request *req) * endpoint, buffer len, and kmalloc flags. */ static struct gs_req_entry * -gs_alloc_req_entry(struct usb_ep *ep, unsigned len, unsigned kmalloc_flags) +gs_alloc_req_entry(struct usb_ep *ep, unsigned len, gfp_t kmalloc_flags) { struct gs_req_entry *req; @@ -2173,7 +2173,7 @@ static void gs_free_req_entry(struct usb_ep *ep, struct gs_req_entry *req) * * The device lock is normally held when calling this function. */ -static int gs_alloc_ports(struct gs_dev *dev, unsigned kmalloc_flags) +static int gs_alloc_ports(struct gs_dev *dev, gfp_t kmalloc_flags) { int i; struct gs_port *port; @@ -2255,7 +2255,7 @@ static void gs_free_ports(struct gs_dev *dev) * * Allocate a circular buffer and all associated memory. */ -static struct gs_buf *gs_buf_alloc(unsigned int size, unsigned kmalloc_flags) +static struct gs_buf *gs_buf_alloc(unsigned int size, gfp_t kmalloc_flags) { struct gs_buf *gb; diff --git a/drivers/usb/gadget/zero.c b/drivers/usb/gadget/zero.c index 6890e773b2a2..ec9c424f1d97 100644 --- a/drivers/usb/gadget/zero.c +++ b/drivers/usb/gadget/zero.c @@ -612,7 +612,7 @@ static void source_sink_complete (struct usb_ep *ep, struct usb_request *req) } static struct usb_request * -source_sink_start_ep (struct usb_ep *ep, unsigned gfp_flags) +source_sink_start_ep (struct usb_ep *ep, gfp_t gfp_flags) { struct usb_request *req; int status; @@ -640,7 +640,7 @@ source_sink_start_ep (struct usb_ep *ep, unsigned gfp_flags) } static int -set_source_sink_config (struct zero_dev *dev, unsigned gfp_flags) +set_source_sink_config (struct zero_dev *dev, gfp_t gfp_flags) { int result = 0; struct usb_ep *ep; @@ -744,7 +744,7 @@ static void loopback_complete (struct usb_ep *ep, struct usb_request *req) } static int -set_loopback_config (struct zero_dev *dev, unsigned gfp_flags) +set_loopback_config (struct zero_dev *dev, gfp_t gfp_flags) { int result = 0; struct usb_ep *ep; @@ -845,7 +845,7 @@ static void zero_reset_config (struct zero_dev *dev) * by limiting configuration choices (like the pxa2xx). */ static int -zero_set_config (struct zero_dev *dev, unsigned number, unsigned gfp_flags) +zero_set_config (struct zero_dev *dev, unsigned number, gfp_t gfp_flags) { int result = 0; struct usb_gadget *gadget = dev->gadget; diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c index b948ffd94f45..f5eb9e7b5b18 100644 --- a/drivers/usb/host/ehci-hcd.c +++ b/drivers/usb/host/ehci-hcd.c @@ -983,7 +983,7 @@ static int ehci_urb_enqueue ( struct usb_hcd *hcd, struct usb_host_endpoint *ep, struct urb *urb, - unsigned mem_flags + gfp_t mem_flags ) { struct ehci_hcd *ehci = hcd_to_ehci (hcd); struct list_head qtd_list; diff --git a/drivers/usb/host/ehci-mem.c b/drivers/usb/host/ehci-mem.c index 5c38ad869485..91c2ab43cbcc 100644 --- a/drivers/usb/host/ehci-mem.c +++ b/drivers/usb/host/ehci-mem.c @@ -45,7 +45,7 @@ static inline void ehci_qtd_init (struct ehci_qtd *qtd, dma_addr_t dma) INIT_LIST_HEAD (&qtd->qtd_list); } -static struct ehci_qtd *ehci_qtd_alloc (struct ehci_hcd *ehci, int flags) +static struct ehci_qtd *ehci_qtd_alloc (struct ehci_hcd *ehci, gfp_t flags) { struct ehci_qtd *qtd; dma_addr_t dma; @@ -79,7 +79,7 @@ static void qh_destroy (struct kref *kref) dma_pool_free (ehci->qh_pool, qh, qh->qh_dma); } -static struct ehci_qh *ehci_qh_alloc (struct ehci_hcd *ehci, int flags) +static struct ehci_qh *ehci_qh_alloc (struct ehci_hcd *ehci, gfp_t flags) { struct ehci_qh *qh; dma_addr_t dma; @@ -161,7 +161,7 @@ static void ehci_mem_cleanup (struct ehci_hcd *ehci) } /* remember to add cleanup code (above) if you add anything here */ -static int ehci_mem_init (struct ehci_hcd *ehci, int flags) +static int ehci_mem_init (struct ehci_hcd *ehci, gfp_t flags) { int i; diff --git a/drivers/usb/host/ehci-q.c b/drivers/usb/host/ehci-q.c index 940d38ca7d91..5bb872c3496d 100644 --- a/drivers/usb/host/ehci-q.c +++ b/drivers/usb/host/ehci-q.c @@ -477,7 +477,7 @@ qh_urb_transaction ( struct ehci_hcd *ehci, struct urb *urb, struct list_head *head, - int flags + gfp_t flags ) { struct ehci_qtd *qtd, *qtd_prev; dma_addr_t buf; @@ -629,7 +629,7 @@ static struct ehci_qh * qh_make ( struct ehci_hcd *ehci, struct urb *urb, - int flags + gfp_t flags ) { struct ehci_qh *qh = ehci_qh_alloc (ehci, flags); u32 info1 = 0, info2 = 0; @@ -906,7 +906,7 @@ submit_async ( struct usb_host_endpoint *ep, struct urb *urb, struct list_head *qtd_list, - unsigned mem_flags + gfp_t mem_flags ) { struct ehci_qtd *qtd; int epnum; diff --git a/drivers/usb/host/ehci-sched.c b/drivers/usb/host/ehci-sched.c index ccc7300baa6d..f0c8aa1ccd5d 100644 --- a/drivers/usb/host/ehci-sched.c +++ b/drivers/usb/host/ehci-sched.c @@ -589,7 +589,7 @@ static int intr_submit ( struct usb_host_endpoint *ep, struct urb *urb, struct list_head *qtd_list, - unsigned mem_flags + gfp_t mem_flags ) { unsigned epnum; unsigned long flags; @@ -634,7 +634,7 @@ static int intr_submit ( /* ehci_iso_stream ops work with both ITD and SITD */ static struct ehci_iso_stream * -iso_stream_alloc (unsigned mem_flags) +iso_stream_alloc (gfp_t mem_flags) { struct ehci_iso_stream *stream; @@ -851,7 +851,7 @@ iso_stream_find (struct ehci_hcd *ehci, struct urb *urb) /* ehci_iso_sched ops can be ITD-only or SITD-only */ static struct ehci_iso_sched * -iso_sched_alloc (unsigned packets, unsigned mem_flags) +iso_sched_alloc (unsigned packets, gfp_t mem_flags) { struct ehci_iso_sched *iso_sched; int size = sizeof *iso_sched; @@ -924,7 +924,7 @@ itd_urb_transaction ( struct ehci_iso_stream *stream, struct ehci_hcd *ehci, struct urb *urb, - unsigned mem_flags + gfp_t mem_flags ) { struct ehci_itd *itd; @@ -1418,7 +1418,7 @@ itd_complete ( /*-------------------------------------------------------------------------*/ static int itd_submit (struct ehci_hcd *ehci, struct urb *urb, - unsigned mem_flags) + gfp_t mem_flags) { int status = -EINVAL; unsigned long flags; @@ -1529,7 +1529,7 @@ sitd_urb_transaction ( struct ehci_iso_stream *stream, struct ehci_hcd *ehci, struct urb *urb, - unsigned mem_flags + gfp_t mem_flags ) { struct ehci_sitd *sitd; @@ -1779,7 +1779,7 @@ sitd_complete ( static int sitd_submit (struct ehci_hcd *ehci, struct urb *urb, - unsigned mem_flags) + gfp_t mem_flags) { int status = -EINVAL; unsigned long flags; diff --git a/drivers/usb/host/isp116x-hcd.c b/drivers/usb/host/isp116x-hcd.c index e142056b0d2c..2548d94fcd72 100644 --- a/drivers/usb/host/isp116x-hcd.c +++ b/drivers/usb/host/isp116x-hcd.c @@ -694,7 +694,7 @@ static int balance(struct isp116x *isp116x, u16 period, u16 load) static int isp116x_urb_enqueue(struct usb_hcd *hcd, struct usb_host_endpoint *hep, struct urb *urb, - unsigned mem_flags) + gfp_t mem_flags) { struct isp116x *isp116x = hcd_to_isp116x(hcd); struct usb_device *udev = urb->dev; diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c index 67c1aa5eb1c1..f8da8c7af7c6 100644 --- a/drivers/usb/host/ohci-hcd.c +++ b/drivers/usb/host/ohci-hcd.c @@ -180,7 +180,7 @@ static int ohci_urb_enqueue ( struct usb_hcd *hcd, struct usb_host_endpoint *ep, struct urb *urb, - unsigned mem_flags + gfp_t mem_flags ) { struct ohci_hcd *ohci = hcd_to_ohci (hcd); struct ed *ed; diff --git a/drivers/usb/host/ohci-mem.c b/drivers/usb/host/ohci-mem.c index fd3c4d3714bd..9fb83dfb1eb4 100644 --- a/drivers/usb/host/ohci-mem.c +++ b/drivers/usb/host/ohci-mem.c @@ -84,7 +84,7 @@ dma_to_td (struct ohci_hcd *hc, dma_addr_t td_dma) /* TDs ... */ static struct td * -td_alloc (struct ohci_hcd *hc, unsigned mem_flags) +td_alloc (struct ohci_hcd *hc, gfp_t mem_flags) { dma_addr_t dma; struct td *td; @@ -118,7 +118,7 @@ td_free (struct ohci_hcd *hc, struct td *td) /* EDs ... */ static struct ed * -ed_alloc (struct ohci_hcd *hc, unsigned mem_flags) +ed_alloc (struct ohci_hcd *hc, gfp_t mem_flags) { dma_addr_t dma; struct ed *ed; diff --git a/drivers/usb/host/sl811-hcd.c b/drivers/usb/host/sl811-hcd.c index d42a15d10a46..cad858575cea 100644 --- a/drivers/usb/host/sl811-hcd.c +++ b/drivers/usb/host/sl811-hcd.c @@ -818,7 +818,7 @@ static int sl811h_urb_enqueue( struct usb_hcd *hcd, struct usb_host_endpoint *hep, struct urb *urb, - unsigned mem_flags + gfp_t mem_flags ) { struct sl811 *sl811 = hcd_to_sl811(hcd); struct usb_device *udev = urb->dev; diff --git a/drivers/usb/host/uhci-q.c b/drivers/usb/host/uhci-q.c index ea0d168a8c67..4e0fbe2c1a9a 100644 --- a/drivers/usb/host/uhci-q.c +++ b/drivers/usb/host/uhci-q.c @@ -1164,7 +1164,7 @@ static struct urb *uhci_find_urb_ep(struct uhci_hcd *uhci, struct urb *urb) static int uhci_urb_enqueue(struct usb_hcd *hcd, struct usb_host_endpoint *ep, - struct urb *urb, unsigned mem_flags) + struct urb *urb, gfp_t mem_flags) { int ret; struct uhci_hcd *uhci = hcd_to_uhci(hcd); diff --git a/drivers/usb/misc/uss720.c b/drivers/usb/misc/uss720.c index 03fb70ef2eb3..0592cb5e6c4d 100644 --- a/drivers/usb/misc/uss720.c +++ b/drivers/usb/misc/uss720.c @@ -137,7 +137,7 @@ static void async_complete(struct urb *urb, struct pt_regs *ptregs) static struct uss720_async_request *submit_async_request(struct parport_uss720_private *priv, __u8 request, __u8 requesttype, __u16 value, __u16 index, - unsigned int mem_flags) + gfp_t mem_flags) { struct usb_device *usbdev; struct uss720_async_request *rq; @@ -204,7 +204,7 @@ static unsigned int kill_all_async_requests_priv(struct parport_uss720_private * /* --------------------------------------------------------------------- */ -static int get_1284_register(struct parport *pp, unsigned char reg, unsigned char *val, unsigned int mem_flags) +static int get_1284_register(struct parport *pp, unsigned char reg, unsigned char *val, gfp_t mem_flags) { struct parport_uss720_private *priv; struct uss720_async_request *rq; @@ -238,7 +238,7 @@ static int get_1284_register(struct parport *pp, unsigned char reg, unsigned cha return -EIO; } -static int set_1284_register(struct parport *pp, unsigned char reg, unsigned char val, unsigned int mem_flags) +static int set_1284_register(struct parport *pp, unsigned char reg, unsigned char val, gfp_t mem_flags) { struct parport_uss720_private *priv; struct uss720_async_request *rq; diff --git a/drivers/usb/net/asix.c b/drivers/usb/net/asix.c index 861f00a43750..252a34fbb42c 100644 --- a/drivers/usb/net/asix.c +++ b/drivers/usb/net/asix.c @@ -753,7 +753,7 @@ static int ax88772_rx_fixup(struct usbnet *dev, struct sk_buff *skb) } static struct sk_buff *ax88772_tx_fixup(struct usbnet *dev, struct sk_buff *skb, - unsigned flags) + gfp_t flags) { int padlen; int headroom = skb_headroom(skb); diff --git a/drivers/usb/net/gl620a.c b/drivers/usb/net/gl620a.c index c8763ae33c73..c0f263b202a6 100644 --- a/drivers/usb/net/gl620a.c +++ b/drivers/usb/net/gl620a.c @@ -301,7 +301,7 @@ static int genelink_rx_fixup(struct usbnet *dev, struct sk_buff *skb) } static struct sk_buff * -genelink_tx_fixup(struct usbnet *dev, struct sk_buff *skb, unsigned flags) +genelink_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { int padlen; int length = skb->len; diff --git a/drivers/usb/net/kaweth.c b/drivers/usb/net/kaweth.c index e04b0ce3611a..c82655d3d448 100644 --- a/drivers/usb/net/kaweth.c +++ b/drivers/usb/net/kaweth.c @@ -477,13 +477,13 @@ static int kaweth_reset(struct kaweth_device *kaweth) } static void kaweth_usb_receive(struct urb *, struct pt_regs *regs); -static int kaweth_resubmit_rx_urb(struct kaweth_device *, unsigned); +static int kaweth_resubmit_rx_urb(struct kaweth_device *, gfp_t); /**************************************************************** int_callback *****************************************************************/ -static void kaweth_resubmit_int_urb(struct kaweth_device *kaweth, int mf) +static void kaweth_resubmit_int_urb(struct kaweth_device *kaweth, gfp_t mf) { int status; @@ -550,7 +550,7 @@ static void kaweth_resubmit_tl(void *d) * kaweth_resubmit_rx_urb ****************************************************************/ static int kaweth_resubmit_rx_urb(struct kaweth_device *kaweth, - unsigned mem_flags) + gfp_t mem_flags) { int result; diff --git a/drivers/usb/net/net1080.c b/drivers/usb/net/net1080.c index a4309c4a491b..cee55f8cf64f 100644 --- a/drivers/usb/net/net1080.c +++ b/drivers/usb/net/net1080.c @@ -500,7 +500,7 @@ static int net1080_rx_fixup(struct usbnet *dev, struct sk_buff *skb) } static struct sk_buff * -net1080_tx_fixup(struct usbnet *dev, struct sk_buff *skb, unsigned flags) +net1080_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { int padlen; struct sk_buff *skb2; diff --git a/drivers/usb/net/rndis_host.c b/drivers/usb/net/rndis_host.c index 2ed2e5fb7778..b5a925dc1beb 100644 --- a/drivers/usb/net/rndis_host.c +++ b/drivers/usb/net/rndis_host.c @@ -517,7 +517,7 @@ static int rndis_rx_fixup(struct usbnet *dev, struct sk_buff *skb) } static struct sk_buff * -rndis_tx_fixup(struct usbnet *dev, struct sk_buff *skb, unsigned flags) +rndis_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { struct rndis_data_hdr *hdr; struct sk_buff *skb2; diff --git a/drivers/usb/net/usbnet.c b/drivers/usb/net/usbnet.c index 6c460918d54f..fce81d738933 100644 --- a/drivers/usb/net/usbnet.c +++ b/drivers/usb/net/usbnet.c @@ -288,7 +288,7 @@ EXPORT_SYMBOL_GPL(usbnet_defer_kevent); static void rx_complete (struct urb *urb, struct pt_regs *regs); -static void rx_submit (struct usbnet *dev, struct urb *urb, unsigned flags) +static void rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags) { struct sk_buff *skb; struct skb_data *entry; diff --git a/drivers/usb/net/usbnet.h b/drivers/usb/net/usbnet.h index 7aa0abd1a9bd..89fc4958eecf 100644 --- a/drivers/usb/net/usbnet.h +++ b/drivers/usb/net/usbnet.h @@ -107,7 +107,7 @@ struct driver_info { /* fixup tx packet (add framing) */ struct sk_buff *(*tx_fixup)(struct usbnet *dev, - struct sk_buff *skb, unsigned flags); + struct sk_buff *skb, gfp_t flags); /* for new devices, use the descriptor-reading code instead */ int in; /* rx endpoint */ diff --git a/drivers/usb/net/zaurus.c b/drivers/usb/net/zaurus.c index ee3b892aeabc..5d4b7d55b097 100644 --- a/drivers/usb/net/zaurus.c +++ b/drivers/usb/net/zaurus.c @@ -62,7 +62,7 @@ */ static struct sk_buff * -zaurus_tx_fixup(struct usbnet *dev, struct sk_buff *skb, unsigned flags) +zaurus_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { int padlen; struct sk_buff *skb2; diff --git a/drivers/usb/net/zd1201.c b/drivers/usb/net/zd1201.c index c4e479ee926a..2f52261c7cc1 100644 --- a/drivers/usb/net/zd1201.c +++ b/drivers/usb/net/zd1201.c @@ -521,7 +521,7 @@ static int zd1201_setconfig(struct zd1201 *zd, int rid, void *buf, int len, int int reqlen; char seq=0; struct urb *urb; - unsigned int gfp_mask = wait ? GFP_NOIO : GFP_ATOMIC; + gfp_t gfp_mask = wait ? GFP_NOIO : GFP_ATOMIC; len += 4; /* first 4 are for header */ diff --git a/include/linux/usb.h b/include/linux/usb.h index 4dbe580f9335..8f731e8f2821 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -933,17 +933,17 @@ static inline void usb_fill_int_urb (struct urb *urb, } extern void usb_init_urb(struct urb *urb); -extern struct urb *usb_alloc_urb(int iso_packets, unsigned mem_flags); +extern struct urb *usb_alloc_urb(int iso_packets, gfp_t mem_flags); extern void usb_free_urb(struct urb *urb); #define usb_put_urb usb_free_urb extern struct urb *usb_get_urb(struct urb *urb); -extern int usb_submit_urb(struct urb *urb, unsigned mem_flags); +extern int usb_submit_urb(struct urb *urb, gfp_t mem_flags); extern int usb_unlink_urb(struct urb *urb); extern void usb_kill_urb(struct urb *urb); #define HAVE_USB_BUFFERS void *usb_buffer_alloc (struct usb_device *dev, size_t size, - unsigned mem_flags, dma_addr_t *dma); + gfp_t mem_flags, dma_addr_t *dma); void usb_buffer_free (struct usb_device *dev, size_t size, void *addr, dma_addr_t dma); @@ -1050,7 +1050,7 @@ int usb_sg_init ( struct scatterlist *sg, int nents, size_t length, - unsigned mem_flags + gfp_t mem_flags ); void usb_sg_cancel (struct usb_sg_request *io); void usb_sg_wait (struct usb_sg_request *io); diff --git a/include/linux/usb_gadget.h b/include/linux/usb_gadget.h index 71e608607324..ff81117eb733 100644 --- a/include/linux/usb_gadget.h +++ b/include/linux/usb_gadget.h @@ -107,18 +107,18 @@ struct usb_ep_ops { int (*disable) (struct usb_ep *ep); struct usb_request *(*alloc_request) (struct usb_ep *ep, - unsigned gfp_flags); + gfp_t gfp_flags); void (*free_request) (struct usb_ep *ep, struct usb_request *req); void *(*alloc_buffer) (struct usb_ep *ep, unsigned bytes, - dma_addr_t *dma, unsigned gfp_flags); + dma_addr_t *dma, gfp_t gfp_flags); void (*free_buffer) (struct usb_ep *ep, void *buf, dma_addr_t dma, unsigned bytes); // NOTE: on 2.6, drivers may also use dma_map() and // dma_sync_single_*() to directly manage dma overhead. int (*queue) (struct usb_ep *ep, struct usb_request *req, - unsigned gfp_flags); + gfp_t gfp_flags); int (*dequeue) (struct usb_ep *ep, struct usb_request *req); int (*set_halt) (struct usb_ep *ep, int value); @@ -214,7 +214,7 @@ usb_ep_disable (struct usb_ep *ep) * Returns the request, or null if one could not be allocated. */ static inline struct usb_request * -usb_ep_alloc_request (struct usb_ep *ep, unsigned gfp_flags) +usb_ep_alloc_request (struct usb_ep *ep, gfp_t gfp_flags) { return ep->ops->alloc_request (ep, gfp_flags); } @@ -254,7 +254,7 @@ usb_ep_free_request (struct usb_ep *ep, struct usb_request *req) */ static inline void * usb_ep_alloc_buffer (struct usb_ep *ep, unsigned len, dma_addr_t *dma, - unsigned gfp_flags) + gfp_t gfp_flags) { return ep->ops->alloc_buffer (ep, len, dma, gfp_flags); } @@ -330,7 +330,7 @@ usb_ep_free_buffer (struct usb_ep *ep, void *buf, dma_addr_t dma, unsigned len) * reported when the usb peripheral is disconnected. */ static inline int -usb_ep_queue (struct usb_ep *ep, struct usb_request *req, unsigned gfp_flags) +usb_ep_queue (struct usb_ep *ep, struct usb_request *req, gfp_t gfp_flags) { return ep->ops->queue (ep, req, gfp_flags); } diff --git a/sound/usb/usbmidi.c b/sound/usb/usbmidi.c index e0d0365453b3..f1a2e2c2e02f 100644 --- a/sound/usb/usbmidi.c +++ b/sound/usb/usbmidi.c @@ -163,7 +163,7 @@ static const uint8_t snd_usbmidi_cin_length[] = { /* * Submits the URB, with error handling. */ -static int snd_usbmidi_submit_urb(struct urb* urb, int flags) +static int snd_usbmidi_submit_urb(struct urb* urb, gfp_t flags) { int err = usb_submit_urb(urb, flags); if (err < 0 && err != -ENODEV) From 9796fdd829da626374458e8706daedcc0e432ddd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:22:03 -0400 Subject: [PATCH 118/126] [PATCH] gfp_t: kernel/* Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/linux/audit.h | 4 ++-- include/linux/suspend.h | 2 +- kernel/audit.c | 6 +++--- kernel/auditsc.c | 2 +- kernel/kexec.c | 7 +++---- kernel/power/swsusp.c | 2 +- 6 files changed, 11 insertions(+), 12 deletions(-) diff --git a/include/linux/audit.h b/include/linux/audit.h index b2a2509bd7ea..da3c01955f3d 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -260,11 +260,11 @@ extern int audit_filter_user(struct netlink_skb_parms *cb, int type); #ifdef CONFIG_AUDIT /* These are defined in audit.c */ /* Public API */ -extern void audit_log(struct audit_context *ctx, int gfp_mask, +extern void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) __attribute__((format(printf,4,5))); -extern struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask, int type); +extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type); extern void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) __attribute__((format(printf,2,3))); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index ad15a54806d8..ba448c760168 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -71,7 +71,7 @@ void restore_processor_state(void); struct saved_context; void __save_processor_state(struct saved_context *ctxt); void __restore_processor_state(struct saved_context *ctxt); -extern unsigned long get_usable_page(unsigned gfp_mask); +extern unsigned long get_usable_page(gfp_t gfp_mask); extern void free_eaten_memory(void); #endif /* _LINUX_SWSUSP_H */ diff --git a/kernel/audit.c b/kernel/audit.c index aefa73a8a586..0c56320d38dc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -133,7 +133,7 @@ struct audit_buffer { struct list_head list; struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ - int gfp_mask; + gfp_t gfp_mask; }; static void audit_set_pid(struct audit_buffer *ab, pid_t pid) @@ -647,7 +647,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, * will be written at syscall exit. If there is no associated task, tsk * should be NULL. */ -struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask, +struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab = NULL; @@ -879,7 +879,7 @@ void audit_log_end(struct audit_buffer *ab) /* Log an audit record. This is a convenience function that calls * audit_log_start, audit_log_vformat, and audit_log_end. It may be * called in any context. */ -void audit_log(struct audit_context *ctx, int gfp_mask, int type, +void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { struct audit_buffer *ab; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 88696f639aab..d8a68509e729 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -803,7 +803,7 @@ static void audit_log_task_info(struct audit_buffer *ab) up_read(&mm->mmap_sem); } -static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask) +static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) { int i; struct audit_buffer *ab; diff --git a/kernel/kexec.c b/kernel/kexec.c index cdd4dcd8fb63..36c5d9cd4cc1 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -90,7 +90,7 @@ int kexec_should_crash(struct task_struct *p) static int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); static struct page *kimage_alloc_page(struct kimage *image, - unsigned int gfp_mask, + gfp_t gfp_mask, unsigned long dest); static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, @@ -326,8 +326,7 @@ static int kimage_is_destination_range(struct kimage *image, return 0; } -static struct page *kimage_alloc_pages(unsigned int gfp_mask, - unsigned int order) +static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; @@ -654,7 +653,7 @@ static kimage_entry_t *kimage_dst_used(struct kimage *image, } static struct page *kimage_alloc_page(struct kimage *image, - unsigned int gfp_mask, + gfp_t gfp_mask, unsigned long destination) { /* diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 2d5c45676442..10bc5ec496d7 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -1095,7 +1095,7 @@ static inline void eat_page(void *page) *eaten_memory = c; } -unsigned long get_usable_page(unsigned gfp_mask) +unsigned long get_usable_page(gfp_t gfp_mask) { unsigned long m; From c53033f6b0bd7cc133b7f433083f0394cf29ac70 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:22:08 -0400 Subject: [PATCH 119/126] [PATCH] gfp_t: drivers/scsi Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- drivers/scsi/eata.c | 2 +- drivers/scsi/hosts.c | 3 ++- drivers/scsi/lpfc/lpfc_mem.c | 2 +- drivers/scsi/osst.c | 6 ++++-- drivers/scsi/qla2xxx/qla_gbl.h | 4 ++-- drivers/scsi/qla2xxx/qla_init.c | 2 +- drivers/scsi/qla2xxx/qla_rscn.c | 2 +- drivers/scsi/scsi.c | 8 ++++---- drivers/scsi/scsi_ioctl.c | 3 ++- drivers/scsi/scsi_lib.c | 2 +- drivers/scsi/sg.c | 2 +- drivers/scsi/st.c | 6 ++++-- include/scsi/scsi_cmnd.h | 2 +- include/scsi/scsi_request.h | 2 +- 14 files changed, 26 insertions(+), 20 deletions(-) diff --git a/drivers/scsi/eata.c b/drivers/scsi/eata.c index c10e45b94b62..3d13fdee4fc2 100644 --- a/drivers/scsi/eata.c +++ b/drivers/scsi/eata.c @@ -1357,7 +1357,7 @@ static int port_detect(unsigned long port_base, unsigned int j, for (i = 0; i < shost->can_queue; i++) { size_t sz = shost->sg_tablesize *sizeof(struct sg_list); - unsigned int gfp_mask = (shost->unchecked_isa_dma ? GFP_DMA : 0) | GFP_ATOMIC; + gfp_t gfp_mask = (shost->unchecked_isa_dma ? GFP_DMA : 0) | GFP_ATOMIC; ha->cp[i].sglist = kmalloc(sz, gfp_mask); if (!ha->cp[i].sglist) { printk diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index 02fe371b0ab8..f24d84538fd5 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -287,7 +287,8 @@ static void scsi_host_dev_release(struct device *dev) struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) { struct Scsi_Host *shost; - int gfp_mask = GFP_KERNEL, rval; + gfp_t gfp_mask = GFP_KERNEL; + int rval; if (sht->unchecked_isa_dma && privsize) gfp_mask |= __GFP_DMA; diff --git a/drivers/scsi/lpfc/lpfc_mem.c b/drivers/scsi/lpfc/lpfc_mem.c index 0aba13ceaacf..352df47bcaca 100644 --- a/drivers/scsi/lpfc/lpfc_mem.c +++ b/drivers/scsi/lpfc/lpfc_mem.c @@ -39,7 +39,7 @@ #define LPFC_MEM_POOL_SIZE 64 /* max elem in non-DMA safety pool */ static void * -lpfc_pool_kmalloc(unsigned int gfp_flags, void *data) +lpfc_pool_kmalloc(gfp_t gfp_flags, void *data) { return kmalloc((unsigned long)data, gfp_flags); } diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index 3f2f2464fa63..af1133104b3f 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -5146,7 +5146,8 @@ static long osst_compat_ioctl(struct file * file, unsigned int cmd_in, unsigned /* Try to allocate a new tape buffer skeleton. Caller must not hold os_scsi_tapes_lock */ static struct osst_buffer * new_tape_buffer( int from_initialization, int need_dma, int max_sg ) { - int i, priority; + int i; + gfp_t priority; struct osst_buffer *tb; if (from_initialization) @@ -5178,7 +5179,8 @@ static struct osst_buffer * new_tape_buffer( int from_initialization, int need_d /* Try to allocate a temporary (while a user has the device open) enlarged tape buffer */ static int enlarge_buffer(struct osst_buffer *STbuffer, int need_dma) { - int segs, nbr, max_segs, b_size, priority, order, got; + int segs, nbr, max_segs, b_size, order, got; + gfp_t priority; if (STbuffer->buffer_size >= OS_FRAME_SIZE) return 1; diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h index 1ed32e7b5472..e451941ad81d 100644 --- a/drivers/scsi/qla2xxx/qla_gbl.h +++ b/drivers/scsi/qla2xxx/qla_gbl.h @@ -52,7 +52,7 @@ extern int qla2x00_load_risc(struct scsi_qla_host *, uint32_t *); extern int qla24xx_load_risc_flash(scsi_qla_host_t *, uint32_t *); extern int qla24xx_load_risc_hotplug(scsi_qla_host_t *, uint32_t *); -extern fc_port_t *qla2x00_alloc_fcport(scsi_qla_host_t *, int); +extern fc_port_t *qla2x00_alloc_fcport(scsi_qla_host_t *, gfp_t); extern int qla2x00_loop_resync(scsi_qla_host_t *); @@ -277,7 +277,7 @@ extern int qla2x00_fdmi_register(scsi_qla_host_t *); /* * Global Function Prototypes in qla_rscn.c source file. */ -extern fc_port_t *qla2x00_alloc_rscn_fcport(scsi_qla_host_t *, int); +extern fc_port_t *qla2x00_alloc_rscn_fcport(scsi_qla_host_t *, gfp_t); extern int qla2x00_handle_port_rscn(scsi_qla_host_t *, uint32_t, fc_port_t *, int); extern void qla2x00_process_iodesc(scsi_qla_host_t *, struct mbx_entry *); diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index 23d095d3817b..fbb6feee40cf 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -1685,7 +1685,7 @@ qla2x00_nvram_config(scsi_qla_host_t *ha) * Returns a pointer to the allocated fcport, or NULL, if none available. */ fc_port_t * -qla2x00_alloc_fcport(scsi_qla_host_t *ha, int flags) +qla2x00_alloc_fcport(scsi_qla_host_t *ha, gfp_t flags) { fc_port_t *fcport; diff --git a/drivers/scsi/qla2xxx/qla_rscn.c b/drivers/scsi/qla2xxx/qla_rscn.c index 1eba98828636..7534efcc8918 100644 --- a/drivers/scsi/qla2xxx/qla_rscn.c +++ b/drivers/scsi/qla2xxx/qla_rscn.c @@ -1066,7 +1066,7 @@ qla2x00_send_login_iocb_cb(scsi_qla_host_t *ha, struct io_descriptor *iodesc, * Returns a pointer to the allocated RSCN fcport, or NULL, if none available. */ fc_port_t * -qla2x00_alloc_rscn_fcport(scsi_qla_host_t *ha, int flags) +qla2x00_alloc_rscn_fcport(scsi_qla_host_t *ha, gfp_t flags) { fc_port_t *fcport; diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 1f0ebabf6d47..a5711d545d71 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -130,7 +130,7 @@ EXPORT_SYMBOL(scsi_device_types); * Returns: Pointer to request block. */ struct scsi_request *scsi_allocate_request(struct scsi_device *sdev, - int gfp_mask) + gfp_t gfp_mask) { const int offset = ALIGN(sizeof(struct scsi_request), 4); const int size = offset + sizeof(struct request); @@ -196,7 +196,7 @@ struct scsi_host_cmd_pool { unsigned int users; char *name; unsigned int slab_flags; - unsigned int gfp_mask; + gfp_t gfp_mask; }; static struct scsi_host_cmd_pool scsi_cmd_pool = { @@ -213,7 +213,7 @@ static struct scsi_host_cmd_pool scsi_cmd_dma_pool = { static DECLARE_MUTEX(host_cmd_pool_mutex); static struct scsi_cmnd *__scsi_get_command(struct Scsi_Host *shost, - int gfp_mask) + gfp_t gfp_mask) { struct scsi_cmnd *cmd; @@ -245,7 +245,7 @@ static struct scsi_cmnd *__scsi_get_command(struct Scsi_Host *shost, * * Returns: The allocated scsi command structure. */ -struct scsi_cmnd *scsi_get_command(struct scsi_device *dev, int gfp_mask) +struct scsi_cmnd *scsi_get_command(struct scsi_device *dev, gfp_t gfp_mask) { struct scsi_cmnd *cmd; diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index de7f98cc38fe..6a3f6aae8a97 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -205,7 +205,8 @@ int scsi_ioctl_send_command(struct scsi_device *sdev, unsigned int inlen, outlen, cmdlen; unsigned int needed, buf_needed; int timeout, retries, result; - int data_direction, gfp_mask = GFP_KERNEL; + int data_direction; + gfp_t gfp_mask = GFP_KERNEL; if (!sic) return -EINVAL; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 0074f28c37b2..3ff538809786 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -677,7 +677,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int uptodate, return NULL; } -static struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, int gfp_mask) +static struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask) { struct scsi_host_sg_pool *sgp; struct scatterlist *sgl; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index ad94367df430..fd56b7ec88b6 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -2644,7 +2644,7 @@ static char * sg_page_malloc(int rqSz, int lowDma, int *retSzp) { char *resp = NULL; - int page_mask; + gfp_t page_mask; int order, a_size; int resSz = rqSz; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index d001c046551b..927d700f0073 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -3577,7 +3577,8 @@ static long st_compat_ioctl(struct file *file, unsigned int cmd, unsigned long a static struct st_buffer * new_tape_buffer(int from_initialization, int need_dma, int max_sg) { - int i, priority, got = 0, segs = 0; + int i, got = 0, segs = 0; + gfp_t priority; struct st_buffer *tb; if (from_initialization) @@ -3610,7 +3611,8 @@ static struct st_buffer * /* Try to allocate enough space in the tape buffer */ static int enlarge_buffer(struct st_buffer * STbuffer, int new_size, int need_dma) { - int segs, nbr, max_segs, b_size, priority, order, got; + int segs, nbr, max_segs, b_size, order, got; + gfp_t priority; if (new_size <= STbuffer->buffer_size) return 1; diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index bed4b7c9be99..e6b61fab66dd 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -146,7 +146,7 @@ struct scsi_cmnd { #define SCSI_STATE_MLQUEUE 0x100b -extern struct scsi_cmnd *scsi_get_command(struct scsi_device *, int); +extern struct scsi_cmnd *scsi_get_command(struct scsi_device *, gfp_t); extern void scsi_put_command(struct scsi_cmnd *); extern void scsi_io_completion(struct scsi_cmnd *, unsigned int, unsigned int); extern void scsi_finish_command(struct scsi_cmnd *cmd); diff --git a/include/scsi/scsi_request.h b/include/scsi/scsi_request.h index 6a140020d7cb..2539debb7993 100644 --- a/include/scsi/scsi_request.h +++ b/include/scsi/scsi_request.h @@ -45,7 +45,7 @@ struct scsi_request { level driver) of this request */ }; -extern struct scsi_request *scsi_allocate_request(struct scsi_device *, int); +extern struct scsi_request *scsi_allocate_request(struct scsi_device *, gfp_t); extern void scsi_release_request(struct scsi_request *); extern void scsi_wait_req(struct scsi_request *, const void *cmnd, void *buffer, unsigned bufflen, From 87b750dc4b7109aa744e7d331dc93df3fe5c1c28 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:22:13 -0400 Subject: [PATCH 120/126] [PATCH] gfp_t: drivers/infiniband Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- drivers/infiniband/hw/mthca/mthca_cmd.c | 2 +- drivers/infiniband/hw/mthca/mthca_cmd.h | 2 +- drivers/infiniband/hw/mthca/mthca_memfree.c | 2 +- drivers/infiniband/hw/mthca/mthca_memfree.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index f6a8ac026557..378646b5a1b8 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -524,7 +524,7 @@ void mthca_cmd_use_polling(struct mthca_dev *dev) } struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev, - unsigned int gfp_mask) + gfp_t gfp_mask) { struct mthca_mailbox *mailbox; diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.h b/drivers/infiniband/hw/mthca/mthca_cmd.h index 65f976a13e02..18175bec84c2 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.h +++ b/drivers/infiniband/hw/mthca/mthca_cmd.h @@ -248,7 +248,7 @@ void mthca_cmd_event(struct mthca_dev *dev, u16 token, u8 status, u64 out_param); struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev, - unsigned int gfp_mask); + gfp_t gfp_mask); void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox); int mthca_SYS_EN(struct mthca_dev *dev, u8 *status); diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index 7bd7a4bec7b4..9ad8b3b6cfef 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -82,7 +82,7 @@ void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm) } struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, - unsigned int gfp_mask) + gfp_t gfp_mask) { struct mthca_icm *icm; struct mthca_icm_chunk *chunk = NULL; diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.h b/drivers/infiniband/hw/mthca/mthca_memfree.h index bafa51544aa3..29433f295253 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.h +++ b/drivers/infiniband/hw/mthca/mthca_memfree.h @@ -77,7 +77,7 @@ struct mthca_icm_iter { struct mthca_dev; struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, - unsigned int gfp_mask); + gfp_t gfp_mask); void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm); struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, From 1ef64e670e3bc27e0c3c83810ca36e19924c35c6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:22:18 -0400 Subject: [PATCH 121/126] [PATCH] gfp_t: sound Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/ppc/8xx_io/cs4218.h | 2 +- arch/ppc/8xx_io/cs4218_tdm.c | 4 ++-- include/sound/memalloc.h | 2 +- sound/core/memalloc.c | 4 ++-- sound/core/seq/instr/ainstr_gf1.c | 5 +++-- sound/core/seq/instr/ainstr_iw.c | 4 ++-- sound/core/seq/instr/ainstr_simple.c | 3 ++- sound/oss/dmasound/dmasound.h | 2 +- sound/oss/dmasound/dmasound_atari.c | 4 ++-- sound/oss/dmasound/dmasound_awacs.c | 4 ++-- sound/oss/dmasound/dmasound_paula.c | 4 ++-- sound/oss/dmasound/dmasound_q40.c | 4 ++-- 12 files changed, 22 insertions(+), 20 deletions(-) diff --git a/arch/ppc/8xx_io/cs4218.h b/arch/ppc/8xx_io/cs4218.h index a3c38c5a5db2..f1c7392255f8 100644 --- a/arch/ppc/8xx_io/cs4218.h +++ b/arch/ppc/8xx_io/cs4218.h @@ -78,7 +78,7 @@ typedef struct { const char *name2; void (*open)(void); void (*release)(void); - void *(*dma_alloc)(unsigned int, int); + void *(*dma_alloc)(unsigned int, gfp_t); void (*dma_free)(void *, unsigned int); int (*irqinit)(void); #ifdef MODULE diff --git a/arch/ppc/8xx_io/cs4218_tdm.c b/arch/ppc/8xx_io/cs4218_tdm.c index 2ca9ec7ec3a7..532caa388dc2 100644 --- a/arch/ppc/8xx_io/cs4218_tdm.c +++ b/arch/ppc/8xx_io/cs4218_tdm.c @@ -318,7 +318,7 @@ struct cs_sound_settings { static struct cs_sound_settings sound; -static void *CS_Alloc(unsigned int size, int flags); +static void *CS_Alloc(unsigned int size, gfp_t flags); static void CS_Free(void *ptr, unsigned int size); static int CS_IrqInit(void); #ifdef MODULE @@ -959,7 +959,7 @@ static TRANS transCSNormalRead = { /*** Low level stuff *********************************************************/ -static void *CS_Alloc(unsigned int size, int flags) +static void *CS_Alloc(unsigned int size, gfp_t flags) { int order; diff --git a/include/sound/memalloc.h b/include/sound/memalloc.h index 3a2fd2cc9f19..83489c3abbaf 100644 --- a/include/sound/memalloc.h +++ b/include/sound/memalloc.h @@ -111,7 +111,7 @@ size_t snd_dma_get_reserved_buf(struct snd_dma_buffer *dmab, unsigned int id); int snd_dma_reserve_buf(struct snd_dma_buffer *dmab, unsigned int id); /* basic memory allocation functions */ -void *snd_malloc_pages(size_t size, unsigned int gfp_flags); +void *snd_malloc_pages(size_t size, gfp_t gfp_flags); void snd_free_pages(void *ptr, size_t size); #endif /* __SOUND_MEMALLOC_H */ diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index e72cec77f0db..129abab5ce98 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -190,7 +190,7 @@ static void unmark_pages(struct page *page, int order) * * Returns the pointer of the buffer, or NULL if no enoguh memory. */ -void *snd_malloc_pages(size_t size, unsigned int gfp_flags) +void *snd_malloc_pages(size_t size, gfp_t gfp_flags) { int pg; void *res; @@ -235,7 +235,7 @@ static void *snd_malloc_dev_pages(struct device *dev, size_t size, dma_addr_t *d { int pg; void *res; - unsigned int gfp_flags; + gfp_t gfp_flags; snd_assert(size > 0, return NULL); snd_assert(dma != NULL, return NULL); diff --git a/sound/core/seq/instr/ainstr_gf1.c b/sound/core/seq/instr/ainstr_gf1.c index 207c2c54bf1d..0e4df8826eed 100644 --- a/sound/core/seq/instr/ainstr_gf1.c +++ b/sound/core/seq/instr/ainstr_gf1.c @@ -51,7 +51,7 @@ static int snd_seq_gf1_copy_wave_from_stream(snd_gf1_ops_t *ops, gf1_wave_t *wp, *prev; gf1_xwave_t xp; int err; - unsigned int gfp_mask; + gfp_t gfp_mask; unsigned int real_size; gfp_mask = atomic ? GFP_ATOMIC : GFP_KERNEL; @@ -144,7 +144,8 @@ static int snd_seq_gf1_put(void *private_data, snd_seq_kinstr_t *instr, snd_gf1_ops_t *ops = (snd_gf1_ops_t *)private_data; gf1_instrument_t *ip; gf1_xinstrument_t ix; - int err, gfp_mask; + int err; + gfp_t gfp_mask; if (cmd != SNDRV_SEQ_INSTR_PUT_CMD_CREATE) return -EINVAL; diff --git a/sound/core/seq/instr/ainstr_iw.c b/sound/core/seq/instr/ainstr_iw.c index 67c24c8e8e7b..7c19fbbc5d0f 100644 --- a/sound/core/seq/instr/ainstr_iw.c +++ b/sound/core/seq/instr/ainstr_iw.c @@ -129,7 +129,7 @@ static int snd_seq_iwffff_copy_wave_from_stream(snd_iwffff_ops_t *ops, iwffff_wave_t *wp, *prev; iwffff_xwave_t xp; int err; - unsigned int gfp_mask; + gfp_t gfp_mask; unsigned int real_size; gfp_mask = atomic ? GFP_ATOMIC : GFP_KERNEL; @@ -236,7 +236,7 @@ static int snd_seq_iwffff_put(void *private_data, snd_seq_kinstr_t *instr, iwffff_layer_t *lp, *prev_lp; iwffff_xlayer_t lx; int err; - unsigned int gfp_mask; + gfp_t gfp_mask; if (cmd != SNDRV_SEQ_INSTR_PUT_CMD_CREATE) return -EINVAL; diff --git a/sound/core/seq/instr/ainstr_simple.c b/sound/core/seq/instr/ainstr_simple.c index 6183d2151034..17ab94e76073 100644 --- a/sound/core/seq/instr/ainstr_simple.c +++ b/sound/core/seq/instr/ainstr_simple.c @@ -57,7 +57,8 @@ static int snd_seq_simple_put(void *private_data, snd_seq_kinstr_t *instr, snd_simple_ops_t *ops = (snd_simple_ops_t *)private_data; simple_instrument_t *ip; simple_xinstrument_t ix; - int err, gfp_mask; + int err; + gfp_t gfp_mask; unsigned int real_size; if (cmd != SNDRV_SEQ_INSTR_PUT_CMD_CREATE) diff --git a/sound/oss/dmasound/dmasound.h b/sound/oss/dmasound/dmasound.h index 9a2f50f0b184..222014cafc1a 100644 --- a/sound/oss/dmasound/dmasound.h +++ b/sound/oss/dmasound/dmasound.h @@ -116,7 +116,7 @@ typedef struct { const char *name; const char *name2; struct module *owner; - void *(*dma_alloc)(unsigned int, int); + void *(*dma_alloc)(unsigned int, gfp_t); void (*dma_free)(void *, unsigned int); int (*irqinit)(void); #ifdef MODULE diff --git a/sound/oss/dmasound/dmasound_atari.c b/sound/oss/dmasound/dmasound_atari.c index 8daaf87664ba..59eb53f89318 100644 --- a/sound/oss/dmasound/dmasound_atari.c +++ b/sound/oss/dmasound/dmasound_atari.c @@ -114,7 +114,7 @@ static ssize_t ata_ctx_u16le(const u_char *userPtr, size_t userCount, /*** Low level stuff *********************************************************/ -static void *AtaAlloc(unsigned int size, int flags); +static void *AtaAlloc(unsigned int size, gfp_t flags); static void AtaFree(void *, unsigned int size); static int AtaIrqInit(void); #ifdef MODULE @@ -810,7 +810,7 @@ static TRANS transFalconExpanding = { * Atari (TT/Falcon) */ -static void *AtaAlloc(unsigned int size, int flags) +static void *AtaAlloc(unsigned int size, gfp_t flags) { return atari_stram_alloc(size, "dmasound"); } diff --git a/sound/oss/dmasound/dmasound_awacs.c b/sound/oss/dmasound/dmasound_awacs.c index 2ceb46f1d40f..b2bf8bac842d 100644 --- a/sound/oss/dmasound/dmasound_awacs.c +++ b/sound/oss/dmasound/dmasound_awacs.c @@ -271,7 +271,7 @@ int expand_read_bal; /* Balance factor for expanding reads (not volume!) */ /*** Low level stuff *********************************************************/ -static void *PMacAlloc(unsigned int size, int flags); +static void *PMacAlloc(unsigned int size, gfp_t flags); static void PMacFree(void *ptr, unsigned int size); static int PMacIrqInit(void); #ifdef MODULE @@ -614,7 +614,7 @@ tas_init_frame_rates(unsigned int *prop, unsigned int l) /* * PCI PowerMac, with AWACS, Screamer, Burgundy, DACA or Tumbler and DBDMA. */ -static void *PMacAlloc(unsigned int size, int flags) +static void *PMacAlloc(unsigned int size, gfp_t flags) { return kmalloc(size, flags); } diff --git a/sound/oss/dmasound/dmasound_paula.c b/sound/oss/dmasound/dmasound_paula.c index 558db5311e06..d59f60b26410 100644 --- a/sound/oss/dmasound/dmasound_paula.c +++ b/sound/oss/dmasound/dmasound_paula.c @@ -69,7 +69,7 @@ static int write_sq_block_size_half, write_sq_block_size_quarter; /*** Low level stuff *********************************************************/ -static void *AmiAlloc(unsigned int size, int flags); +static void *AmiAlloc(unsigned int size, gfp_t flags); static void AmiFree(void *obj, unsigned int size); static int AmiIrqInit(void); #ifdef MODULE @@ -317,7 +317,7 @@ static inline void StopDMA(void) enable_heartbeat(); } -static void *AmiAlloc(unsigned int size, int flags) +static void *AmiAlloc(unsigned int size, gfp_t flags) { return amiga_chip_alloc((long)size, "dmasound [Paula]"); } diff --git a/sound/oss/dmasound/dmasound_q40.c b/sound/oss/dmasound/dmasound_q40.c index 92c25a0174db..1ddaa6284b08 100644 --- a/sound/oss/dmasound/dmasound_q40.c +++ b/sound/oss/dmasound/dmasound_q40.c @@ -36,7 +36,7 @@ static int expand_data; /* Data for expanding */ /*** Low level stuff *********************************************************/ -static void *Q40Alloc(unsigned int size, int flags); +static void *Q40Alloc(unsigned int size, gfp_t flags); static void Q40Free(void *, unsigned int); static int Q40IrqInit(void); #ifdef MODULE @@ -358,7 +358,7 @@ static TRANS transQ40Compressing = { /*** Low level stuff *********************************************************/ -static void *Q40Alloc(unsigned int size, int flags) +static void *Q40Alloc(unsigned int size, gfp_t flags) { return kmalloc(size, flags); /* change to vmalloc */ } From 53f9fc93f90a43701d6aaf3919be0614bb088b83 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:22:24 -0400 Subject: [PATCH 122/126] [PATCH] gfp_t: remaining bits of arch/* Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/alpha/kernel/pci_iommu.c | 2 +- arch/ia64/sn/kernel/xpc.h | 2 +- arch/ppc/mm/pgtable.c | 4 ++-- arch/sparc64/solaris/socksys.c | 2 +- arch/sparc64/solaris/timod.c | 2 +- arch/um/kernel/mem.c | 2 +- arch/um/kernel/process_kern.c | 2 +- include/asm-um/page.h | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index 7cb23f12ecbd..c468e312e5f8 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -397,7 +397,7 @@ pci_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_addrp) { void *cpu_addr; long order = get_order(size); - int gfp = GFP_ATOMIC; + gfp_t gfp = GFP_ATOMIC; try_again: cpu_addr = (void *)__get_free_pages(gfp, order); diff --git a/arch/ia64/sn/kernel/xpc.h b/arch/ia64/sn/kernel/xpc.h index d0ee635daf2e..e5f5a4e51f70 100644 --- a/arch/ia64/sn/kernel/xpc.h +++ b/arch/ia64/sn/kernel/xpc.h @@ -939,7 +939,7 @@ xpc_map_bte_errors(bte_result_t error) static inline void * -xpc_kmalloc_cacheline_aligned(size_t size, int flags, void **base) +xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base) { /* see if kmalloc will give us cachline aligned memory by default */ *base = kmalloc(size, flags); diff --git a/arch/ppc/mm/pgtable.c b/arch/ppc/mm/pgtable.c index 81a3d7446d37..43505b1fc5d8 100644 --- a/arch/ppc/mm/pgtable.c +++ b/arch/ppc/mm/pgtable.c @@ -114,9 +114,9 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) struct page *ptepage; #ifdef CONFIG_HIGHPTE - int flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_REPEAT; + gfp_t flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_REPEAT; #else - int flags = GFP_KERNEL | __GFP_REPEAT; + gfp_t flags = GFP_KERNEL | __GFP_REPEAT; #endif ptepage = alloc_pages(flags, 0); diff --git a/arch/sparc64/solaris/socksys.c b/arch/sparc64/solaris/socksys.c index d7c1c76582cc..fc6669e8dde1 100644 --- a/arch/sparc64/solaris/socksys.c +++ b/arch/sparc64/solaris/socksys.c @@ -49,7 +49,7 @@ IPPROTO_EGP, IPPROTO_PUP, IPPROTO_UDP, IPPROTO_IDP, IPPROTO_RAW, #else -extern void * mykmalloc(size_t s, int gfp); +extern void * mykmalloc(size_t s, gfp_t gfp); extern void mykfree(void *); #endif diff --git a/arch/sparc64/solaris/timod.c b/arch/sparc64/solaris/timod.c index aaad29c35c83..b84e5456b025 100644 --- a/arch/sparc64/solaris/timod.c +++ b/arch/sparc64/solaris/timod.c @@ -39,7 +39,7 @@ static char * page = NULL ; #else -void * mykmalloc(size_t s, int gfp) +void * mykmalloc(size_t s, gfp_t gfp) { static char * page; static size_t free; diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index ea008b031a8f..462cc9d65386 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -252,7 +252,7 @@ void paging_init(void) #endif } -struct page *arch_validate(struct page *page, int mask, int order) +struct page *arch_validate(struct page *page, gfp_t mask, int order) { unsigned long addr, zero = 0; int i; diff --git a/arch/um/kernel/process_kern.c b/arch/um/kernel/process_kern.c index ea65db679e9c..0d73ceeece72 100644 --- a/arch/um/kernel/process_kern.c +++ b/arch/um/kernel/process_kern.c @@ -80,7 +80,7 @@ void free_stack(unsigned long stack, int order) unsigned long alloc_stack(int order, int atomic) { unsigned long page; - int flags = GFP_KERNEL; + gfp_t flags = GFP_KERNEL; if (atomic) flags = GFP_ATOMIC; diff --git a/include/asm-um/page.h b/include/asm-um/page.h index 2c192abe9aeb..0229814af31e 100644 --- a/include/asm-um/page.h +++ b/include/asm-um/page.h @@ -115,7 +115,7 @@ extern unsigned long uml_physmem; #define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v))) -extern struct page *arch_validate(struct page *page, int mask, int order); +extern struct page *arch_validate(struct page *page, gfp_t mask, int order); #define HAVE_ARCH_VALIDATE extern void arch_free_page(struct page *page, int order); From 9e24974db6b01ec067c24de09588282b6a1407f0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:22:29 -0400 Subject: [PATCH 123/126] [PATCH] gfp_t: drivers/net Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- drivers/net/cassini.c | 4 ++-- drivers/net/lance.c | 4 ++-- drivers/net/myri_sbus.c | 2 +- drivers/net/myri_sbus.h | 2 +- drivers/net/sunbmac.c | 3 ++- drivers/net/sunbmac.h | 2 +- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c index 2e617424d3fb..50f43dbf31ae 100644 --- a/drivers/net/cassini.c +++ b/drivers/net/cassini.c @@ -489,7 +489,7 @@ static int cas_page_free(struct cas *cp, cas_page_t *page) /* local page allocation routines for the receive buffers. jumbo pages * require at least 8K contiguous and 8K aligned buffers. */ -static cas_page_t *cas_page_alloc(struct cas *cp, const int flags) +static cas_page_t *cas_page_alloc(struct cas *cp, const gfp_t flags) { cas_page_t *page; @@ -561,7 +561,7 @@ static void cas_spare_free(struct cas *cp) } /* replenish spares if needed */ -static void cas_spare_recover(struct cas *cp, const int flags) +static void cas_spare_recover(struct cas *cp, const gfp_t flags) { struct list_head list, *elem, *tmp; int needed, i; diff --git a/drivers/net/lance.c b/drivers/net/lance.c index b4929beb33b2..1d75ca0bb939 100644 --- a/drivers/net/lance.c +++ b/drivers/net/lance.c @@ -298,7 +298,7 @@ enum {OLD_LANCE = 0, PCNET_ISA=1, PCNET_ISAP=2, PCNET_PCI=3, PCNET_VLB=4, PCNET_ static unsigned char lance_need_isa_bounce_buffers = 1; static int lance_open(struct net_device *dev); -static void lance_init_ring(struct net_device *dev, int mode); +static void lance_init_ring(struct net_device *dev, gfp_t mode); static int lance_start_xmit(struct sk_buff *skb, struct net_device *dev); static int lance_rx(struct net_device *dev); static irqreturn_t lance_interrupt(int irq, void *dev_id, struct pt_regs *regs); @@ -846,7 +846,7 @@ lance_purge_ring(struct net_device *dev) /* Initialize the LANCE Rx and Tx rings. */ static void -lance_init_ring(struct net_device *dev, int gfp) +lance_init_ring(struct net_device *dev, gfp_t gfp) { struct lance_private *lp = dev->priv; int i; diff --git a/drivers/net/myri_sbus.c b/drivers/net/myri_sbus.c index f0996ce5c268..6c86dca62e2a 100644 --- a/drivers/net/myri_sbus.c +++ b/drivers/net/myri_sbus.c @@ -277,7 +277,7 @@ static void myri_init_rings(struct myri_eth *mp, int from_irq) struct recvq __iomem *rq = mp->rq; struct myri_rxd __iomem *rxd = &rq->myri_rxd[0]; struct net_device *dev = mp->dev; - int gfp_flags = GFP_KERNEL; + gfp_t gfp_flags = GFP_KERNEL; int i; if (from_irq || in_interrupt()) diff --git a/drivers/net/myri_sbus.h b/drivers/net/myri_sbus.h index 9391e55a5e92..47722f708a41 100644 --- a/drivers/net/myri_sbus.h +++ b/drivers/net/myri_sbus.h @@ -296,7 +296,7 @@ struct myri_eth { /* We use this to acquire receive skb's that we can DMA directly into. */ #define ALIGNED_RX_SKB_ADDR(addr) \ ((((unsigned long)(addr) + (64 - 1)) & ~(64 - 1)) - (unsigned long)(addr)) -static inline struct sk_buff *myri_alloc_skb(unsigned int length, int gfp_flags) +static inline struct sk_buff *myri_alloc_skb(unsigned int length, gfp_t gfp_flags) { struct sk_buff *skb; diff --git a/drivers/net/sunbmac.c b/drivers/net/sunbmac.c index f88f5e32b714..cfaf47c63c58 100644 --- a/drivers/net/sunbmac.c +++ b/drivers/net/sunbmac.c @@ -214,7 +214,8 @@ static void bigmac_init_rings(struct bigmac *bp, int from_irq) { struct bmac_init_block *bb = bp->bmac_block; struct net_device *dev = bp->dev; - int i, gfp_flags = GFP_KERNEL; + int i; + gfp_t gfp_flags = GFP_KERNEL; if (from_irq || in_interrupt()) gfp_flags = GFP_ATOMIC; diff --git a/drivers/net/sunbmac.h b/drivers/net/sunbmac.h index 5674003fc38a..b0dbc5187143 100644 --- a/drivers/net/sunbmac.h +++ b/drivers/net/sunbmac.h @@ -339,7 +339,7 @@ struct bigmac { #define ALIGNED_RX_SKB_ADDR(addr) \ ((((unsigned long)(addr) + (64 - 1)) & ~(64 - 1)) - (unsigned long)(addr)) -static inline struct sk_buff *big_mac_alloc_skb(unsigned int length, int gfp_flags) +static inline struct sk_buff *big_mac_alloc_skb(unsigned int length, gfp_t gfp_flags) { struct sk_buff *skb; From b4e3ca1ab1ae9ae86134126dcdc88da1caaa32ca Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:22:34 -0400 Subject: [PATCH 124/126] [PATCH] gfp_t: remaining bits of drivers/* Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- drivers/block/loop.c | 2 +- drivers/block/rd.c | 2 +- drivers/char/n_tty.c | 2 +- drivers/ieee1394/eth1394.c | 2 +- drivers/md/bitmap.c | 2 +- drivers/md/dm-crypt.c | 2 +- drivers/s390/net/fsm.c | 2 +- drivers/s390/net/fsm.h | 2 +- include/linux/i2o.h | 4 ++-- include/linux/loop.h | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index b35e08876dd4..96c664af8d06 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -881,7 +881,7 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer, static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) { struct file *filp = lo->lo_backing_file; - int gfp = lo->old_gfp_mask; + gfp_t gfp = lo->old_gfp_mask; if (lo->lo_state != Lo_bound) return -ENXIO; diff --git a/drivers/block/rd.c b/drivers/block/rd.c index 145c1fbffe01..68c60a5bcdab 100644 --- a/drivers/block/rd.c +++ b/drivers/block/rd.c @@ -348,7 +348,7 @@ static int rd_open(struct inode *inode, struct file *filp) struct block_device *bdev = inode->i_bdev; struct address_space *mapping; unsigned bsize; - int gfp_mask; + gfp_t gfp_mask; inode = igrab(bdev->bd_inode); rd_bdev[unit] = bdev; diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c index c9bdf544ed2c..c556f4d3ccd7 100644 --- a/drivers/char/n_tty.c +++ b/drivers/char/n_tty.c @@ -62,7 +62,7 @@ static inline unsigned char *alloc_buf(void) { - unsigned int prio = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL; + gfp_t prio = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL; if (PAGE_SIZE != N_TTY_BUF_SIZE) return kmalloc(N_TTY_BUF_SIZE, prio); diff --git a/drivers/ieee1394/eth1394.c b/drivers/ieee1394/eth1394.c index 4802bbbb6dc9..c9e92d85c893 100644 --- a/drivers/ieee1394/eth1394.c +++ b/drivers/ieee1394/eth1394.c @@ -1630,7 +1630,7 @@ static void ether1394_complete_cb(void *__ptask) /* Transmit a packet (called by kernel) */ static int ether1394_tx (struct sk_buff *skb, struct net_device *dev) { - int kmflags = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL; + gfp_t kmflags = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL; struct eth1394hdr *eth; struct eth1394_priv *priv = netdev_priv(dev); int proto; diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 2fba2bbe72d8..01654fcabc52 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -91,7 +91,7 @@ int bitmap_active(struct bitmap *bitmap) #define WRITE_POOL_SIZE 256 /* mempool for queueing pending writes on the bitmap file */ -static void *write_pool_alloc(unsigned int gfp_flags, void *data) +static void *write_pool_alloc(gfp_t gfp_flags, void *data) { return kmalloc(sizeof(struct page_list), gfp_flags); } diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index b6148f6f7836..28c1a628621f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -331,7 +331,7 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, { struct bio *bio; unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - int gfp_mask = GFP_NOIO | __GFP_HIGHMEM; + gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; unsigned int i; /* diff --git a/drivers/s390/net/fsm.c b/drivers/s390/net/fsm.c index fa09440d82e5..38f50b7129a2 100644 --- a/drivers/s390/net/fsm.c +++ b/drivers/s390/net/fsm.c @@ -16,7 +16,7 @@ MODULE_LICENSE("GPL"); fsm_instance * init_fsm(char *name, const char **state_names, const char **event_names, int nr_states, - int nr_events, const fsm_node *tmpl, int tmpl_len, int order) + int nr_events, const fsm_node *tmpl, int tmpl_len, gfp_t order) { int i; fsm_instance *this; diff --git a/drivers/s390/net/fsm.h b/drivers/s390/net/fsm.h index f9a011001eb6..1b8a7e7c34f3 100644 --- a/drivers/s390/net/fsm.h +++ b/drivers/s390/net/fsm.h @@ -110,7 +110,7 @@ extern fsm_instance * init_fsm(char *name, const char **state_names, const char **event_names, int nr_states, int nr_events, const fsm_node *tmpl, - int tmpl_len, int order); + int tmpl_len, gfp_t order); /** * Releases an FSM diff --git a/include/linux/i2o.h b/include/linux/i2o.h index bdc286ec947c..b4af45aad25d 100644 --- a/include/linux/i2o.h +++ b/include/linux/i2o.h @@ -492,7 +492,7 @@ static inline int i2o_dma_map_sg(struct i2o_controller *c, * Returns 0 on success or -ENOMEM on failure. */ static inline int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr, - size_t len, unsigned int gfp_mask) + size_t len, gfp_t gfp_mask) { struct pci_dev *pdev = to_pci_dev(dev); int dma_64 = 0; @@ -551,7 +551,7 @@ static inline void i2o_dma_free(struct device *dev, struct i2o_dma *addr) * Returns the 0 on success or negative error code on failure. */ static inline int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr, - size_t len, unsigned int gfp_mask) + size_t len, gfp_t gfp_mask) { i2o_dma_free(dev, addr); diff --git a/include/linux/loop.h b/include/linux/loop.h index 53fa51595443..40f63c9879d2 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -52,7 +52,7 @@ struct loop_device { unsigned lo_blocksize; void *key_data; - int old_gfp_mask; + gfp_t old_gfp_mask; spinlock_t lo_lock; struct bio *lo_bio; From c4cdd038318863e912e9b992489f61497f98b442 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:22:39 -0400 Subject: [PATCH 125/126] [PATCH] gfp_t: reiserfs mapping_set_gfp_mask() use Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 87ac9dc8b381..72e120798677 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -453,7 +453,7 @@ static struct page *reiserfs_get_page(struct inode *dir, unsigned long n) struct page *page; /* We can deadlock if we try to free dentries, and an unlink/rmdir has just occured - GFP_NOFS avoids this */ - mapping->flags = (mapping->flags & ~__GFP_BITS_MASK) | GFP_NOFS; + mapping_set_gfp_mask(mapping, GFP_NOFS); page = read_cache_page(mapping, n, (filler_t *) mapping->a_ops->readpage, NULL); if (!IS_ERR(page)) { From 260b23674fdb570f3235ce55892246bef1c24c2a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:22:44 -0400 Subject: [PATCH 126/126] [PATCH] gfp_t: the rest zone handling, mapping->flags handling Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- include/linux/pagemap.h | 7 ++++--- mm/highmem.c | 14 +++++++++----- mm/page_alloc.c | 29 +++++++++++++++-------------- 4 files changed, 29 insertions(+), 23 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5ed471b58f4f..7519eb4191e7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -302,7 +302,7 @@ void get_zone_counts(unsigned long *active, unsigned long *inactive, void build_all_zonelists(void); void wakeup_kswapd(struct zone *zone, int order); int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int alloc_type, int can_try_harder, int gfp_high); + int alloc_type, int can_try_harder, gfp_t gfp_high); #ifdef CONFIG_HAVE_MEMORY_PRESENT void memory_present(int nid, unsigned long start, unsigned long end); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index efbae53fb078..ba6c310a055f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -21,16 +21,17 @@ static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { - return mapping->flags & __GFP_BITS_MASK; + return (__force gfp_t)mapping->flags & __GFP_BITS_MASK; } /* * This is non-atomic. Only to be used before the mapping is activated. * Probably needs a barrier... */ -static inline void mapping_set_gfp_mask(struct address_space *m, int mask) +static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) { - m->flags = (m->flags & ~__GFP_BITS_MASK) | mask; + m->flags = (m->flags & ~(__force unsigned long)__GFP_BITS_MASK) | + (__force unsigned long)mask; } /* diff --git a/mm/highmem.c b/mm/highmem.c index 90e1861e2da0..ce2e7e8bbfa7 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -30,11 +30,9 @@ static mempool_t *page_pool, *isa_page_pool; -static void *page_pool_alloc(gfp_t gfp_mask, void *data) +static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data) { - unsigned int gfp = gfp_mask | (unsigned int) (long) data; - - return alloc_page(gfp); + return alloc_page(gfp_mask | GFP_DMA); } static void page_pool_free(void *page, void *data) @@ -51,6 +49,12 @@ static void page_pool_free(void *page, void *data) * n means that there are (n-1) current users of it. */ #ifdef CONFIG_HIGHMEM + +static void *page_pool_alloc(gfp_t gfp_mask, void *data) +{ + return alloc_page(gfp_mask); +} + static int pkmap_count[LAST_PKMAP]; static unsigned int last_pkmap_nr; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); @@ -267,7 +271,7 @@ int init_emergency_isa_pool(void) if (isa_page_pool) return 0; - isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA); + isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa, page_pool_free, NULL); if (!isa_page_pool) BUG(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index aa43ae3ab8c9..94c864eac9c4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -734,7 +734,7 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) * of the allocation. */ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int can_try_harder, int gfp_high) + int classzone_idx, int can_try_harder, gfp_t gfp_high) { /* free_pages my go negative - that's OK */ long min = mark, free_pages = z->free_pages - (1 << order) + 1; @@ -777,7 +777,7 @@ struct page * fastcall __alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist) { - const int wait = gfp_mask & __GFP_WAIT; + const gfp_t wait = gfp_mask & __GFP_WAIT; struct zone **zones, *z; struct page *page; struct reclaim_state reclaim_state; @@ -996,7 +996,7 @@ fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) * get_zeroed_page() returns a 32-bit address, which cannot represent * a highmem page */ - BUG_ON(gfp_mask & __GFP_HIGHMEM); + BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); page = alloc_pages(gfp_mask | __GFP_ZERO, 0); if (page) @@ -1428,6 +1428,16 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli return j; } +static inline int highest_zone(int zone_bits) +{ + int res = ZONE_NORMAL; + if (zone_bits & (__force int)__GFP_HIGHMEM) + res = ZONE_HIGHMEM; + if (zone_bits & (__force int)__GFP_DMA) + res = ZONE_DMA; + return res; +} + #ifdef CONFIG_NUMA #define MAX_NODE_LOAD (num_online_nodes()) static int __initdata node_load[MAX_NUMNODES]; @@ -1524,11 +1534,7 @@ static void __init build_zonelists(pg_data_t *pgdat) zonelist = pgdat->node_zonelists + i; for (j = 0; zonelist->zones[j] != NULL; j++); - k = ZONE_NORMAL; - if (i & __GFP_HIGHMEM) - k = ZONE_HIGHMEM; - if (i & __GFP_DMA) - k = ZONE_DMA; + k = highest_zone(i); j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); zonelist->zones[j] = NULL; @@ -1549,12 +1555,7 @@ static void __init build_zonelists(pg_data_t *pgdat) zonelist = pgdat->node_zonelists + i; j = 0; - k = ZONE_NORMAL; - if (i & __GFP_HIGHMEM) - k = ZONE_HIGHMEM; - if (i & __GFP_DMA) - k = ZONE_DMA; - + k = highest_zone(i); j = build_zonelists_node(pgdat, zonelist, j, k); /* * Now we build the zonelist so that it contains the zones