2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2007-04-27 06:55:03 +08:00
|
|
|
* Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* This software may be freely redistributed under the terms of the
|
|
|
|
* GNU General Public License.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
|
|
*
|
2008-06-06 13:46:18 +08:00
|
|
|
* Authors: David Woodhouse <dwmw2@infradead.org>
|
2005-04-17 06:20:36 +08:00
|
|
|
* David Howells <dhowells@redhat.com>
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/init.h>
|
2007-04-27 06:55:03 +08:00
|
|
|
#include <linux/circ_buf.h>
|
Detach sched.h from mm.h
First thing mm.h does is including sched.h solely for can_do_mlock() inline
function which has "current" dereference inside. By dealing with can_do_mlock()
mm.h can be detached from sched.h which is good. See below, why.
This patch
a) removes unconditional inclusion of sched.h from mm.h
b) makes can_do_mlock() normal function in mm/mlock.c
c) exports can_do_mlock() to not break compilation
d) adds sched.h inclusions back to files that were getting it indirectly.
e) adds less bloated headers to some files (asm/signal.h, jiffies.h) that were
getting them indirectly
Net result is:
a) mm.h users would get less code to open, read, preprocess, parse, ... if
they don't need sched.h
b) sched.h stops being dependency for significant number of files:
on x86_64 allmodconfig touching sched.h results in recompile of 4083 files,
after patch it's only 3744 (-8.3%).
Cross-compile tested on
all arm defconfigs, all mips defconfigs, all powerpc defconfigs,
alpha alpha-up
arm
i386 i386-up i386-defconfig i386-allnoconfig
ia64 ia64-up
m68k
mips
parisc parisc-up
powerpc powerpc-up
s390 s390-up
sparc sparc-up
sparc64 sparc64-up
um-x86_64
x86_64 x86_64-up x86_64-defconfig x86_64-allnoconfig
as well as my two usual configs.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-21 05:22:52 +08:00
|
|
|
#include <linux/sched.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include "internal.h"
|
2007-04-27 06:55:03 +08:00
|
|
|
|
2018-06-15 22:24:50 +08:00
|
|
|
/*
|
|
|
|
* Create volume and callback interests on a server.
|
|
|
|
*/
|
|
|
|
static struct afs_cb_interest *afs_create_interest(struct afs_server *server,
|
|
|
|
struct afs_vnode *vnode)
|
|
|
|
{
|
|
|
|
struct afs_vol_interest *new_vi, *vi;
|
|
|
|
struct afs_cb_interest *new;
|
2020-03-27 23:02:44 +08:00
|
|
|
struct rb_node *parent, **pp;
|
2018-06-15 22:24:50 +08:00
|
|
|
|
|
|
|
new_vi = kzalloc(sizeof(struct afs_vol_interest), GFP_KERNEL);
|
|
|
|
if (!new_vi)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
new = kzalloc(sizeof(struct afs_cb_interest), GFP_KERNEL);
|
|
|
|
if (!new) {
|
|
|
|
kfree(new_vi);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
new_vi->usage = 1;
|
|
|
|
new_vi->vid = vnode->volume->vid;
|
|
|
|
INIT_HLIST_HEAD(&new_vi->cb_interests);
|
|
|
|
|
|
|
|
refcount_set(&new->usage, 1);
|
|
|
|
new->sb = vnode->vfs_inode.i_sb;
|
|
|
|
new->vid = vnode->volume->vid;
|
2019-06-21 01:12:17 +08:00
|
|
|
new->server = afs_get_server(server, afs_server_trace_get_new_cbi);
|
2018-06-15 22:24:50 +08:00
|
|
|
INIT_HLIST_NODE(&new->cb_vlink);
|
|
|
|
|
2020-03-27 23:02:44 +08:00
|
|
|
write_seqlock(&server->cb_break_lock);
|
2018-06-15 22:24:50 +08:00
|
|
|
|
2020-03-27 23:02:44 +08:00
|
|
|
pp = &server->cb_volumes.rb_node;
|
|
|
|
while ((parent = *pp)) {
|
|
|
|
vi = rb_entry(parent, struct afs_vol_interest, srv_node);
|
|
|
|
if (vi->vid < new_vi->vid) {
|
|
|
|
pp = &(*pp)->rb_left;
|
|
|
|
} else if (vi->vid > new_vi->vid) {
|
|
|
|
pp = &(*pp)->rb_right;
|
|
|
|
} else {
|
|
|
|
vi->usage++;
|
|
|
|
goto found_vi;
|
|
|
|
}
|
2018-06-15 22:24:50 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
vi = new_vi;
|
|
|
|
new_vi = NULL;
|
2020-03-27 23:02:44 +08:00
|
|
|
rb_link_node_rcu(&vi->srv_node, parent, pp);
|
|
|
|
rb_insert_color(&vi->srv_node, &server->cb_volumes);
|
2018-06-15 22:24:50 +08:00
|
|
|
|
2020-03-27 23:02:44 +08:00
|
|
|
found_vi:
|
2018-06-15 22:24:50 +08:00
|
|
|
new->vol_interest = vi;
|
|
|
|
hlist_add_head(&new->cb_vlink, &vi->cb_interests);
|
|
|
|
|
2020-03-27 23:02:44 +08:00
|
|
|
write_sequnlock(&server->cb_break_lock);
|
2018-06-15 22:24:50 +08:00
|
|
|
kfree(new_vi);
|
|
|
|
return new;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2017-11-02 23:27:49 +08:00
|
|
|
* Set up an interest-in-callbacks record for a volume on a server and
|
|
|
|
* register it with the server.
|
2018-05-10 15:43:04 +08:00
|
|
|
* - Called with vnode->io_lock held.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2017-11-02 23:27:49 +08:00
|
|
|
int afs_register_server_cb_interest(struct afs_vnode *vnode,
|
2018-05-10 15:43:04 +08:00
|
|
|
struct afs_server_list *slist,
|
|
|
|
unsigned int index)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2018-05-10 15:43:04 +08:00
|
|
|
struct afs_server_entry *entry = &slist->servers[index];
|
|
|
|
struct afs_cb_interest *cbi, *vcbi, *new, *old;
|
afs: Overhaul volume and server record caching and fileserver rotation
The current code assumes that volumes and servers are per-cell and are
never shared, but this is not enforced, and, indeed, public cells do exist
that are aliases of each other. Further, an organisation can, say, set up
a public cell and a private cell with overlapping, but not identical, sets
of servers. The difference is purely in the database attached to the VL
servers.
The current code will malfunction if it sees a server in two cells as it
assumes global address -> server record mappings and that each server is in
just one cell.
Further, each server may have multiple addresses - and may have addresses
of different families (IPv4 and IPv6, say).
To this end, the following structural changes are made:
(1) Server record management is overhauled:
(a) Server records are made independent of cell. The namespace keeps
track of them, volume records have lists of them and each vnode
has a server on which its callback interest currently resides.
(b) The cell record no longer keeps a list of servers known to be in
that cell.
(c) The server records are now kept in a flat list because there's no
single address to sort on.
(d) Server records are now keyed by their UUID within the namespace.
(e) The addresses for a server are obtained with the VL.GetAddrsU
rather than with VL.GetEntryByName, using the server's UUID as a
parameter.
(f) Cached server records are garbage collected after a period of
non-use and are counted out of existence before purging is allowed
to complete. This protects the work functions against rmmod.
(g) The servers list is now in /proc/fs/afs/servers.
(2) Volume record management is overhauled:
(a) An RCU-replaceable server list is introduced. This tracks both
servers and their coresponding callback interests.
(b) The superblock is now keyed on cell record and numeric volume ID.
(c) The volume record is now tied to the superblock which mounts it,
and is activated when mounted and deactivated when unmounted.
This makes it easier to handle the cache cookie without causing a
double-use in fscache.
(d) The volume record is loaded from the VLDB using VL.GetEntryByNameU
to get the server UUID list.
(e) The volume name is updated if it is seen to have changed when the
volume is updated (the update is keyed on the volume ID).
(3) The vlocation record is got rid of and VLDB records are no longer
cached. Sufficient information is stored in the volume record, though
an update to a volume record is now no longer shared between related
volumes (volumes come in bundles of three: R/W, R/O and backup).
and the following procedural changes are made:
(1) The fileserver cursor introduced previously is now fleshed out and
used to iterate over fileservers and their addresses.
(2) Volume status is checked during iteration, and the server list is
replaced if a change is detected.
(3) Server status is checked during iteration, and the address list is
replaced if a change is detected.
(4) The abort code is saved into the address list cursor and -ECONNABORTED
returned in afs_make_call() if a remote abort happened rather than
translating the abort into an error message. This allows actions to
be taken depending on the abort code more easily.
(a) If a VMOVED abort is seen then this is handled by rechecking the
volume and restarting the iteration.
(b) If a VBUSY, VRESTARTING or VSALVAGING abort is seen then this is
handled by sleeping for a short period and retrying and/or trying
other servers that might serve that volume. A message is also
displayed once until the condition has cleared.
(c) If a VOFFLINE abort is seen, then this is handled as VBUSY for the
moment.
(d) If a VNOVOL abort is seen, the volume is rechecked in the VLDB to
see if it has been deleted; if not, the fileserver is probably
indicating that the volume couldn't be attached and needs
salvaging.
(e) If statfs() sees one of these aborts, it does not sleep, but
rather returns an error, so as not to block the umount program.
(5) The fileserver iteration functions in vnode.c are now merged into
their callers and more heavily macroised around the cursor. vnode.c
is removed.
(6) Operations on a particular vnode are serialised on that vnode because
the server will lock that vnode whilst it operates on it, so a second
op sent will just have to wait.
(7) Fileservers are probed with FS.GetCapabilities before being used.
This is where service upgrade will be done.
(8) A callback interest on a fileserver is set up before an FS operation
is performed and passed through to afs_make_call() so that it can be
set on the vnode if the operation returns a callback. The callback
interest is passed through to afs_iget() also so that it can be set
there too.
In general, record updating is done on an as-needed basis when we try to
access servers, volumes or vnodes rather than offloading it to work items
and special threads.
Notes:
(1) Pre AFS-3.4 servers are no longer supported, though this can be added
back if necessary (AFS-3.4 was released in 1998).
(2) VBUSY is retried forever for the moment at intervals of 1s.
(3) /proc/fs/afs/<cell>/servers no longer exists.
Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-02 23:27:50 +08:00
|
|
|
struct afs_server *server = entry->server;
|
2017-11-02 23:27:49 +08:00
|
|
|
|
|
|
|
again:
|
2019-05-13 23:14:32 +08:00
|
|
|
vcbi = rcu_dereference_protected(vnode->cb_interest,
|
|
|
|
lockdep_is_held(&vnode->io_lock));
|
|
|
|
if (vcbi && likely(vcbi == entry->cb_interest))
|
2018-05-10 15:43:04 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
read_lock(&slist->lock);
|
|
|
|
cbi = afs_get_cb_interest(entry->cb_interest);
|
|
|
|
read_unlock(&slist->lock);
|
|
|
|
|
2017-11-02 23:27:49 +08:00
|
|
|
if (vcbi) {
|
2018-05-10 15:43:04 +08:00
|
|
|
if (vcbi == cbi) {
|
|
|
|
afs_put_cb_interest(afs_v2net(vnode), cbi);
|
2017-11-02 23:27:49 +08:00
|
|
|
return 0;
|
2018-05-10 15:43:04 +08:00
|
|
|
}
|
2017-11-02 23:27:49 +08:00
|
|
|
|
2018-05-10 15:43:04 +08:00
|
|
|
/* Use a new interest in the server list for the same server
|
|
|
|
* rather than an old one that's still attached to a vnode.
|
|
|
|
*/
|
2017-11-02 23:27:49 +08:00
|
|
|
if (cbi && vcbi->server == cbi->server) {
|
|
|
|
write_seqlock(&vnode->cb_lock);
|
2019-05-13 23:14:32 +08:00
|
|
|
old = rcu_dereference_protected(vnode->cb_interest,
|
|
|
|
lockdep_is_held(&vnode->cb_lock.lock));
|
|
|
|
rcu_assign_pointer(vnode->cb_interest, cbi);
|
2017-11-02 23:27:49 +08:00
|
|
|
write_sequnlock(&vnode->cb_lock);
|
2018-05-10 15:43:04 +08:00
|
|
|
afs_put_cb_interest(afs_v2net(vnode), old);
|
2017-11-02 23:27:49 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2018-05-10 15:43:04 +08:00
|
|
|
/* Re-use the one attached to the vnode. */
|
2017-11-02 23:27:49 +08:00
|
|
|
if (!cbi && vcbi->server == server) {
|
2018-05-10 15:43:04 +08:00
|
|
|
write_lock(&slist->lock);
|
|
|
|
if (entry->cb_interest) {
|
|
|
|
write_unlock(&slist->lock);
|
|
|
|
afs_put_cb_interest(afs_v2net(vnode), cbi);
|
2017-11-02 23:27:49 +08:00
|
|
|
goto again;
|
|
|
|
}
|
2018-05-10 15:43:04 +08:00
|
|
|
|
|
|
|
entry->cb_interest = cbi;
|
|
|
|
write_unlock(&slist->lock);
|
2017-11-02 23:27:49 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2017-11-02 23:27:49 +08:00
|
|
|
if (!cbi) {
|
2018-06-15 22:24:50 +08:00
|
|
|
new = afs_create_interest(server, vnode);
|
2017-11-02 23:27:49 +08:00
|
|
|
if (!new)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2018-05-10 15:43:04 +08:00
|
|
|
write_lock(&slist->lock);
|
|
|
|
if (!entry->cb_interest) {
|
|
|
|
entry->cb_interest = afs_get_cb_interest(new);
|
2017-11-02 23:27:49 +08:00
|
|
|
cbi = new;
|
2018-05-10 15:43:04 +08:00
|
|
|
new = NULL;
|
2017-11-02 23:27:49 +08:00
|
|
|
} else {
|
2018-05-10 15:43:04 +08:00
|
|
|
cbi = afs_get_cb_interest(entry->cb_interest);
|
2017-11-02 23:27:49 +08:00
|
|
|
}
|
2018-05-10 15:43:04 +08:00
|
|
|
write_unlock(&slist->lock);
|
|
|
|
afs_put_cb_interest(afs_v2net(vnode), new);
|
2007-04-27 06:55:03 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2017-11-02 23:27:49 +08:00
|
|
|
ASSERT(cbi);
|
|
|
|
|
|
|
|
/* Change the server the vnode is using. This entails scrubbing any
|
|
|
|
* interest the vnode had in the previous server it was using.
|
|
|
|
*/
|
|
|
|
write_seqlock(&vnode->cb_lock);
|
|
|
|
|
2019-05-13 23:14:32 +08:00
|
|
|
old = rcu_dereference_protected(vnode->cb_interest,
|
|
|
|
lockdep_is_held(&vnode->cb_lock.lock));
|
|
|
|
rcu_assign_pointer(vnode->cb_interest, cbi);
|
2017-11-02 23:27:49 +08:00
|
|
|
vnode->cb_s_break = cbi->server->cb_s_break;
|
2018-05-13 05:31:33 +08:00
|
|
|
vnode->cb_v_break = vnode->volume->cb_v_break;
|
2017-11-02 23:27:49 +08:00
|
|
|
clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
|
|
|
|
|
|
|
|
write_sequnlock(&vnode->cb_lock);
|
2018-05-10 15:43:04 +08:00
|
|
|
afs_put_cb_interest(afs_v2net(vnode), old);
|
2017-11-02 23:27:49 +08:00
|
|
|
return 0;
|
2007-04-27 06:55:03 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2017-11-02 23:27:49 +08:00
|
|
|
/*
|
|
|
|
* Remove an interest on a server.
|
|
|
|
*/
|
|
|
|
void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi)
|
|
|
|
{
|
2018-06-15 22:24:50 +08:00
|
|
|
struct afs_vol_interest *vi;
|
|
|
|
|
2017-11-02 23:27:49 +08:00
|
|
|
if (cbi && refcount_dec_and_test(&cbi->usage)) {
|
2018-06-15 22:24:50 +08:00
|
|
|
if (!hlist_unhashed(&cbi->cb_vlink)) {
|
2020-03-27 23:02:44 +08:00
|
|
|
write_seqlock(&cbi->server->cb_break_lock);
|
2018-06-15 22:24:50 +08:00
|
|
|
|
|
|
|
hlist_del_init(&cbi->cb_vlink);
|
|
|
|
vi = cbi->vol_interest;
|
|
|
|
cbi->vol_interest = NULL;
|
|
|
|
if (--vi->usage == 0)
|
2020-03-27 23:02:44 +08:00
|
|
|
rb_erase(&vi->srv_node, &cbi->server->cb_volumes);
|
2018-06-15 22:24:50 +08:00
|
|
|
else
|
|
|
|
vi = NULL;
|
|
|
|
|
2020-03-27 23:02:44 +08:00
|
|
|
write_sequnlock(&cbi->server->cb_break_lock);
|
2019-05-13 23:14:32 +08:00
|
|
|
if (vi)
|
|
|
|
kfree_rcu(vi, rcu);
|
2019-06-21 01:12:17 +08:00
|
|
|
afs_put_server(net, cbi->server, afs_server_trace_put_cbi);
|
2017-11-02 23:27:49 +08:00
|
|
|
}
|
2019-05-13 23:14:32 +08:00
|
|
|
kfree_rcu(cbi, rcu);
|
2007-04-27 06:55:03 +08:00
|
|
|
}
|
2017-11-02 23:27:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* allow the fileserver to request callback state (re-)initialisation
|
|
|
|
*/
|
|
|
|
void afs_init_callback_state(struct afs_server *server)
|
|
|
|
{
|
afs: Fix in-progess ops to ignore server-level callback invalidation
The in-kernel afs filesystem client counts the number of server-level
callback invalidation events (CB.InitCallBackState* RPC operations) that it
receives from the server. This is stored in cb_s_break in various
structures, including afs_server and afs_vnode.
If an inode is examined by afs_validate(), say, the afs_server copy is
compared, along with other break counters, to those in afs_vnode, and if
one or more of the counters do not match, it is considered that the
server's callback promise is broken. At points where this happens,
AFS_VNODE_CB_PROMISED is cleared to indicate that the status must be
refetched from the server.
afs_validate() issues an FS.FetchStatus operation to get updated metadata -
and based on the updated data_version may invalidate the pagecache too.
However, the break counters are also used to determine whether to note a
new callback in the vnode (which would set the AFS_VNODE_CB_PROMISED flag)
and whether to cache the permit data included in the YFSFetchStatus record
by the server.
The problem comes when the server sends us a CB.InitCallBackState op. The
first such instance doesn't cause cb_s_break to be incremented, but rather
causes AFS_SERVER_FL_NEW to be cleared - but thereafter, say some hours
after last use and all the volumes have been automatically unmounted and
the server has forgotten about the client[*], this *will* likely cause an
increment.
[*] There are other circumstances too, such as the server restarting or
needing to make space in its callback table.
Note that the server won't send us a CB.InitCallBackState op until we talk
to it again.
So what happens is:
(1) A mount for a new volume is attempted, a inode is created for the root
vnode and vnode->cb_s_break and AFS_VNODE_CB_PROMISED aren't set
immediately, as we don't have a nominated server to talk to yet - and
we may iterate through a few to find one.
(2) Before the operation happens, afs_fetch_status(), say, notes in the
cursor (fc.cb_break) the break counter sum from the vnode, volume and
server counters, but the server->cb_s_break is currently 0.
(3) We send FS.FetchStatus to the server. The server sends us back
CB.InitCallBackState. We increment server->cb_s_break.
(4) Our FS.FetchStatus completes. The reply includes a callback record.
(5) xdr_decode_AFSCallBack()/xdr_decode_YFSCallBack() check to see whether
the callback promise was broken by checking the break counter sum from
step (2) against the current sum.
This fails because of step (3), so we don't set the callback record
and, importantly, don't set AFS_VNODE_CB_PROMISED on the vnode.
This does not preclude the syscall from progressing, and we don't loop here
rechecking the status, but rather assume it's good enough for one round
only and will need to be rechecked next time.
(6) afs_validate() it triggered on the vnode, probably called from
d_revalidate() checking the parent directory.
(7) afs_validate() notes that AFS_VNODE_CB_PROMISED isn't set, so doesn't
update vnode->cb_s_break and assumes the vnode to be invalid.
(8) afs_validate() needs to calls afs_fetch_status(). Go back to step (2)
and repeat, every time the vnode is validated.
This primarily affects volume root dir vnodes. Everything subsequent to
those inherit an already incremented cb_s_break upon mounting.
The issue is that we assume that the callback record and the cached permit
information in a reply from the server can't be trusted after getting a
server break - but this is wrong since the server makes sure things are
done in the right order, holding up our ops if necessary[*].
[*] There is an extremely unlikely scenario where a reply from before the
CB.InitCallBackState could get its delivery deferred till after - at
which point we think we have a promise when we don't. This, however,
requires unlucky mass packet loss to one call.
AFS_SERVER_FL_NEW tries to paper over the cracks for the initial mount from
a server we've never contacted before, but this should be unnecessary.
It's also further insulated from the problem on an initial mount by
querying the server first with FS.GetCapabilities, which triggers the
CB.InitCallBackState.
Fix this by
(1) Remove AFS_SERVER_FL_NEW.
(2) In afs_calc_vnode_cb_break(), don't include cb_s_break in the
calculation.
(3) In afs_cb_is_broken(), don't include cb_s_break in the check.
Signed-off-by: David Howells <dhowells@redhat.com>
2019-04-13 15:37:37 +08:00
|
|
|
server->cb_s_break++;
|
2007-04-27 06:55:03 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* actually break a callback
|
|
|
|
*/
|
2019-06-21 01:12:16 +08:00
|
|
|
void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reason)
|
2007-04-27 06:55:03 +08:00
|
|
|
{
|
|
|
|
_enter("");
|
|
|
|
|
2018-04-06 21:17:26 +08:00
|
|
|
clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
|
2017-11-02 23:27:49 +08:00
|
|
|
if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
|
|
|
|
vnode->cb_break++;
|
|
|
|
afs_clear_permits(vnode);
|
2007-04-27 06:55:03 +08:00
|
|
|
|
2019-05-11 06:03:31 +08:00
|
|
|
if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB)
|
2007-07-16 14:40:12 +08:00
|
|
|
afs_lock_may_be_available(vnode);
|
2019-06-21 01:12:16 +08:00
|
|
|
|
|
|
|
trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true);
|
|
|
|
} else {
|
|
|
|
trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, false);
|
2007-04-27 06:55:03 +08:00
|
|
|
}
|
2018-10-20 07:57:58 +08:00
|
|
|
}
|
2017-11-02 23:27:49 +08:00
|
|
|
|
2019-06-21 01:12:16 +08:00
|
|
|
void afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reason)
|
2018-10-20 07:57:58 +08:00
|
|
|
{
|
|
|
|
write_seqlock(&vnode->cb_lock);
|
2019-06-21 01:12:16 +08:00
|
|
|
__afs_break_callback(vnode, reason);
|
2017-11-02 23:27:49 +08:00
|
|
|
write_sequnlock(&vnode->cb_lock);
|
2007-04-27 06:55:03 +08:00
|
|
|
}
|
|
|
|
|
2020-03-27 23:02:44 +08:00
|
|
|
/*
|
|
|
|
* Look up a volume interest by volume ID under RCU conditions.
|
|
|
|
*/
|
|
|
|
static struct afs_vol_interest *afs_lookup_vol_interest_rcu(struct afs_server *server,
|
|
|
|
afs_volid_t vid)
|
|
|
|
{
|
|
|
|
struct afs_vol_interest *vi = NULL;
|
|
|
|
struct rb_node *p;
|
|
|
|
int seq = 0;
|
|
|
|
|
|
|
|
do {
|
|
|
|
/* Unfortunately, rbtree walking doesn't give reliable results
|
|
|
|
* under just the RCU read lock, so we have to check for
|
|
|
|
* changes.
|
|
|
|
*/
|
|
|
|
read_seqbegin_or_lock(&server->cb_break_lock, &seq);
|
|
|
|
|
|
|
|
p = rcu_dereference_raw(server->cb_volumes.rb_node);
|
|
|
|
while (p) {
|
|
|
|
vi = rb_entry(p, struct afs_vol_interest, srv_node);
|
|
|
|
|
|
|
|
if (vi->vid < vid)
|
|
|
|
p = rcu_dereference_raw(p->rb_left);
|
|
|
|
else if (vi->vid > vid)
|
|
|
|
p = rcu_dereference_raw(p->rb_right);
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
/* We want to repeat the search, this time with the
|
|
|
|
* lock properly locked.
|
|
|
|
*/
|
|
|
|
vi = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
} while (need_seqretry(&server->cb_break_lock, seq));
|
|
|
|
|
|
|
|
done_seqretry(&server->cb_break_lock, seq);
|
|
|
|
return vi;
|
|
|
|
}
|
|
|
|
|
2007-04-27 06:55:03 +08:00
|
|
|
/*
|
|
|
|
* allow the fileserver to explicitly break one callback
|
|
|
|
* - happens when
|
|
|
|
* - the backing file is changed
|
|
|
|
* - a lock is released
|
|
|
|
*/
|
|
|
|
static void afs_break_one_callback(struct afs_server *server,
|
2020-03-27 23:02:44 +08:00
|
|
|
struct afs_fid *fid,
|
|
|
|
struct afs_vol_interest *vi)
|
2007-04-27 06:55:03 +08:00
|
|
|
{
|
2017-11-02 23:27:49 +08:00
|
|
|
struct afs_cb_interest *cbi;
|
|
|
|
struct afs_iget_data data;
|
2007-04-27 06:55:03 +08:00
|
|
|
struct afs_vnode *vnode;
|
2017-11-02 23:27:49 +08:00
|
|
|
struct inode *inode;
|
2007-04-27 06:55:03 +08:00
|
|
|
|
2017-11-02 23:27:49 +08:00
|
|
|
/* Step through all interested superblocks. There may be more than one
|
|
|
|
* because of cell aliasing.
|
|
|
|
*/
|
2020-03-27 23:02:44 +08:00
|
|
|
hlist_for_each_entry_rcu(cbi, &vi->cb_interests, cb_vlink) {
|
2018-05-13 05:31:33 +08:00
|
|
|
if (fid->vnode == 0 && fid->unique == 0) {
|
|
|
|
/* The callback break applies to an entire volume. */
|
|
|
|
struct afs_super_info *as = AFS_FS_S(cbi->sb);
|
|
|
|
struct afs_volume *volume = as->volume;
|
|
|
|
|
2019-06-20 23:49:35 +08:00
|
|
|
write_lock(&volume->cb_v_break_lock);
|
2018-05-13 05:31:33 +08:00
|
|
|
volume->cb_v_break++;
|
2019-06-21 01:12:16 +08:00
|
|
|
trace_afs_cb_break(fid, volume->cb_v_break,
|
|
|
|
afs_cb_break_for_volume_callback, false);
|
2019-06-20 23:49:35 +08:00
|
|
|
write_unlock(&volume->cb_v_break_lock);
|
2018-05-13 05:31:33 +08:00
|
|
|
} else {
|
|
|
|
data.volume = NULL;
|
|
|
|
data.fid = *fid;
|
2017-12-01 19:40:16 +08:00
|
|
|
|
|
|
|
/* See if we can find a matching inode - even an I_NEW
|
|
|
|
* inode needs to be marked as it can have its callback
|
|
|
|
* broken before we finish setting up the local inode.
|
|
|
|
*/
|
|
|
|
inode = find_inode_rcu(cbi->sb, fid->vnode,
|
|
|
|
afs_iget5_test, &data);
|
2018-05-13 05:31:33 +08:00
|
|
|
if (inode) {
|
|
|
|
vnode = AFS_FS_I(inode);
|
2019-06-21 01:12:16 +08:00
|
|
|
afs_break_callback(vnode, afs_cb_break_for_callback);
|
|
|
|
} else {
|
|
|
|
trace_afs_cb_miss(fid, afs_cb_break_for_callback);
|
2018-05-13 05:31:33 +08:00
|
|
|
}
|
2017-11-02 23:27:49 +08:00
|
|
|
}
|
|
|
|
}
|
2020-03-27 23:02:44 +08:00
|
|
|
}
|
2007-04-27 06:55:03 +08:00
|
|
|
|
2020-03-27 23:02:44 +08:00
|
|
|
static void afs_break_some_callbacks(struct afs_server *server,
|
|
|
|
struct afs_callback_break *cbb,
|
|
|
|
size_t *_count)
|
|
|
|
{
|
|
|
|
struct afs_callback_break *residue = cbb;
|
|
|
|
struct afs_vol_interest *vi;
|
|
|
|
afs_volid_t vid = cbb->fid.vid;
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
vi = afs_lookup_vol_interest_rcu(server, vid);
|
|
|
|
|
|
|
|
/* TODO: Find all matching volumes if we couldn't match the server and
|
|
|
|
* break them anyway.
|
|
|
|
*/
|
|
|
|
|
|
|
|
for (i = *_count; i > 0; cbb++, i--) {
|
|
|
|
if (cbb->fid.vid == vid) {
|
|
|
|
_debug("- Fid { vl=%08llx n=%llu u=%u }",
|
|
|
|
cbb->fid.vid,
|
|
|
|
cbb->fid.vnode,
|
|
|
|
cbb->fid.unique);
|
|
|
|
--*_count;
|
|
|
|
if (vi)
|
|
|
|
afs_break_one_callback(server, &cbb->fid, vi);
|
|
|
|
} else {
|
|
|
|
*residue++ = *cbb;
|
|
|
|
}
|
|
|
|
}
|
2007-04-27 06:49:28 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* allow the fileserver to break callback promises
|
|
|
|
*/
|
2007-04-27 06:55:03 +08:00
|
|
|
void afs_break_callbacks(struct afs_server *server, size_t count,
|
2018-04-10 04:12:31 +08:00
|
|
|
struct afs_callback_break *callbacks)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2007-04-27 06:55:03 +08:00
|
|
|
_enter("%p,%zu,", server, count);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-04-27 06:55:03 +08:00
|
|
|
ASSERT(server != NULL);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2020-03-27 23:02:44 +08:00
|
|
|
rcu_read_lock();
|
2018-05-13 05:31:33 +08:00
|
|
|
|
2020-03-27 23:02:44 +08:00
|
|
|
while (count > 0)
|
|
|
|
afs_break_some_callbacks(server, callbacks, &count);
|
2007-04-27 06:55:03 +08:00
|
|
|
|
2020-03-27 23:02:44 +08:00
|
|
|
rcu_read_unlock();
|
2007-04-27 06:55:03 +08:00
|
|
|
return;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-04-27 06:55:03 +08:00
|
|
|
/*
|
2017-11-02 23:27:49 +08:00
|
|
|
* Clear the callback interests in a server list.
|
2007-04-27 06:55:03 +08:00
|
|
|
*/
|
afs: Overhaul volume and server record caching and fileserver rotation
The current code assumes that volumes and servers are per-cell and are
never shared, but this is not enforced, and, indeed, public cells do exist
that are aliases of each other. Further, an organisation can, say, set up
a public cell and a private cell with overlapping, but not identical, sets
of servers. The difference is purely in the database attached to the VL
servers.
The current code will malfunction if it sees a server in two cells as it
assumes global address -> server record mappings and that each server is in
just one cell.
Further, each server may have multiple addresses - and may have addresses
of different families (IPv4 and IPv6, say).
To this end, the following structural changes are made:
(1) Server record management is overhauled:
(a) Server records are made independent of cell. The namespace keeps
track of them, volume records have lists of them and each vnode
has a server on which its callback interest currently resides.
(b) The cell record no longer keeps a list of servers known to be in
that cell.
(c) The server records are now kept in a flat list because there's no
single address to sort on.
(d) Server records are now keyed by their UUID within the namespace.
(e) The addresses for a server are obtained with the VL.GetAddrsU
rather than with VL.GetEntryByName, using the server's UUID as a
parameter.
(f) Cached server records are garbage collected after a period of
non-use and are counted out of existence before purging is allowed
to complete. This protects the work functions against rmmod.
(g) The servers list is now in /proc/fs/afs/servers.
(2) Volume record management is overhauled:
(a) An RCU-replaceable server list is introduced. This tracks both
servers and their corresponding callback interests.
(b) The superblock is now keyed on cell record and numeric volume ID.
(c) The volume record is now tied to the superblock which mounts it,
and is activated when mounted and deactivated when unmounted.
This makes it easier to handle the cache cookie without causing a
double-use in fscache.
(d) The volume record is loaded from the VLDB using VL.GetEntryByNameU
to get the server UUID list.
(e) The volume name is updated if it is seen to have changed when the
volume is updated (the update is keyed on the volume ID).
(3) The vlocation record is got rid of and VLDB records are no longer
cached. Sufficient information is stored in the volume record, though
an update to a volume record is now no longer shared between related
volumes (volumes come in bundles of three: R/W, R/O and backup).
and the following procedural changes are made:
(1) The fileserver cursor introduced previously is now fleshed out and
used to iterate over fileservers and their addresses.
(2) Volume status is checked during iteration, and the server list is
replaced if a change is detected.
(3) Server status is checked during iteration, and the address list is
replaced if a change is detected.
(4) The abort code is saved into the address list cursor and -ECONNABORTED
returned in afs_make_call() if a remote abort happened rather than
translating the abort into an error message. This allows actions to
be taken depending on the abort code more easily.
(a) If a VMOVED abort is seen then this is handled by rechecking the
volume and restarting the iteration.
(b) If a VBUSY, VRESTARTING or VSALVAGING abort is seen then this is
handled by sleeping for a short period and retrying and/or trying
other servers that might serve that volume. A message is also
displayed once until the condition has cleared.
(c) If a VOFFLINE abort is seen, then this is handled as VBUSY for the
moment.
(d) If a VNOVOL abort is seen, the volume is rechecked in the VLDB to
see if it has been deleted; if not, the fileserver is probably
indicating that the volume couldn't be attached and needs
salvaging.
(e) If statfs() sees one of these aborts, it does not sleep, but
rather returns an error, so as not to block the umount program.
(5) The fileserver iteration functions in vnode.c are now merged into
their callers and more heavily macroised around the cursor. vnode.c
is removed.
(6) Operations on a particular vnode are serialised on that vnode because
the server will lock that vnode whilst it operates on it, so a second
op sent will just have to wait.
(7) Fileservers are probed with FS.GetCapabilities before being used.
This is where service upgrade will be done.
(8) A callback interest on a fileserver is set up before an FS operation
is performed and passed through to afs_make_call() so that it can be
set on the vnode if the operation returns a callback. The callback
interest is passed through to afs_iget() also so that it can be set
there too.
In general, record updating is done on an as-needed basis when we try to
access servers, volumes or vnodes rather than offloading it to work items
and special threads.
Notes:
(1) Pre AFS-3.4 servers are no longer supported, though this can be added
back if necessary (AFS-3.4 was released in 1998).
(2) VBUSY is retried forever for the moment at intervals of 1s.
(3) /proc/fs/afs/<cell>/servers no longer exists.
Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-02 23:27:50 +08:00
|
|
|
/*
 * Drop the callback interest recorded for every server in a server list.
 *
 * Walks the first slist->nr_servers entries of slist->servers[], passes the
 * cb_interest pointer of each entry to afs_put_cb_interest() together with
 * net, and then sets that pointer to NULL.
 */
void afs_clear_callback_interests(struct afs_net *net, struct afs_server_list *slist)
|
2007-04-27 06:55:03 +08:00
|
|
|
{
|
2017-11-02 23:27:49 +08:00
|
|
|
int i;
|
2007-04-27 06:55:03 +08:00
|
|
|
|
afs: Overhaul volume and server record caching and fileserver rotation
The current code assumes that volumes and servers are per-cell and are
never shared, but this is not enforced, and, indeed, public cells do exist
that are aliases of each other. Further, an organisation can, say, set up
a public cell and a private cell with overlapping, but not identical, sets
of servers. The difference is purely in the database attached to the VL
servers.
The current code will malfunction if it sees a server in two cells as it
assumes global address -> server record mappings and that each server is in
just one cell.
Further, each server may have multiple addresses - and may have addresses
of different families (IPv4 and IPv6, say).
To this end, the following structural changes are made:
(1) Server record management is overhauled:
(a) Server records are made independent of cell. The namespace keeps
track of them, volume records have lists of them and each vnode
has a server on which its callback interest currently resides.
(b) The cell record no longer keeps a list of servers known to be in
that cell.
(c) The server records are now kept in a flat list because there's no
single address to sort on.
(d) Server records are now keyed by their UUID within the namespace.
(e) The addresses for a server are obtained with the VL.GetAddrsU
rather than with VL.GetEntryByName, using the server's UUID as a
parameter.
(f) Cached server records are garbage collected after a period of
non-use and are counted out of existence before purging is allowed
to complete. This protects the work functions against rmmod.
(g) The servers list is now in /proc/fs/afs/servers.
(2) Volume record management is overhauled:
(a) An RCU-replaceable server list is introduced. This tracks both
servers and their coresponding callback interests.
(b) The superblock is now keyed on cell record and numeric volume ID.
(c) The volume record is now tied to the superblock which mounts it,
and is activated when mounted and deactivated when unmounted.
This makes it easier to handle the cache cookie without causing a
double-use in fscache.
(d) The volume record is loaded from the VLDB using VL.GetEntryByNameU
to get the server UUID list.
(e) The volume name is updated if it is seen to have changed when the
volume is updated (the update is keyed on the volume ID).
(3) The vlocation record is got rid of and VLDB records are no longer
cached. Sufficient information is stored in the volume record, though
an update to a volume record is now no longer shared between related
volumes (volumes come in bundles of three: R/W, R/O and backup).
and the following procedural changes are made:
(1) The fileserver cursor introduced previously is now fleshed out and
used to iterate over fileservers and their addresses.
(2) Volume status is checked during iteration, and the server list is
replaced if a change is detected.
(3) Server status is checked during iteration, and the address list is
replaced if a change is detected.
(4) The abort code is saved into the address list cursor and -ECONNABORTED
returned in afs_make_call() if a remote abort happened rather than
translating the abort into an error message. This allows actions to
be taken depending on the abort code more easily.
(a) If a VMOVED abort is seen then this is handled by rechecking the
volume and restarting the iteration.
(b) If a VBUSY, VRESTARTING or VSALVAGING abort is seen then this is
handled by sleeping for a short period and retrying and/or trying
other servers that might serve that volume. A message is also
displayed once until the condition has cleared.
(c) If a VOFFLINE abort is seen, then this is handled as VBUSY for the
moment.
(d) If a VNOVOL abort is seen, the volume is rechecked in the VLDB to
see if it has been deleted; if not, the fileserver is probably
indicating that the volume couldn't be attached and needs
salvaging.
(e) If statfs() sees one of these aborts, it does not sleep, but
rather returns an error, so as not to block the umount program.
(5) The fileserver iteration functions in vnode.c are now merged into
their callers and more heavily macroised around the cursor. vnode.c
is removed.
(6) Operations on a particular vnode are serialised on that vnode because
the server will lock that vnode whilst it operates on it, so a second
op sent will just have to wait.
(7) Fileservers are probed with FS.GetCapabilities before being used.
This is where service upgrade will be done.
(8) A callback interest on a fileserver is set up before an FS operation
is performed and passed through to afs_make_call() so that it can be
set on the vnode if the operation returns a callback. The callback
interest is passed through to afs_iget() also so that it can be set
there too.
In general, record updating is done on an as-needed basis when we try to
access servers, volumes or vnodes rather than offloading it to work items
and special threads.
Notes:
(1) Pre AFS-3.4 servers are no longer supported, though this can be added
back if necessary (AFS-3.4 was released in 1998).
(2) VBUSY is retried forever for the moment at intervals of 1s.
(3) /proc/fs/afs/<cell>/servers no longer exists.
Signed-off-by: David Howells <dhowells@redhat.com>
2017-11-02 23:27:50 +08:00
|
|
|
for (i = 0; i < slist->nr_servers; i++) {
|
|
|
|
afs_put_cb_interest(net, slist->servers[i].cb_interest);
|
|
|
|
slist->servers[i].cb_interest = NULL;
|
2007-04-27 06:55:03 +08:00
|
|
|
}
|
|
|
|
}
|