tmpfs: fix race between umount and swapoff

The use of igrab() in swapoff's shmem_unuse_inode() is just as vulnerable
to umount as that in shmem_writepage().

Fix this instance by extending the protection of shmem_swaplist_mutex
right across shmem_unuse_inode(): while it's on the list, the inode cannot
be evicted (and the filesystem cannot be unmounted) without
shmem_evict_inode() taking that mutex to remove it from the list.

But since shmem_writepage() might take that mutex, we should avoid making
memory allocations or memcg charges while holding it: prepare them at the
outer level in shmem_unuse().  When mem_cgroup_cache_charge() was
originally placed, we didn't know until that point that the page from swap
was actually a shmem page; but nowadays it's noted in the swap_map, so
we're safe to charge upfront.  For the radix_tree, do as is done in
shmem_getpage(): preload upfront, but don't pin to the cpu; so we make a
habit of refreshing the node pool, but might dip into GFP_NOWAIT reserves
on occasion if subsequently preempted.

With the allocation and charge moved out from shmem_unuse_inode(),
we can also hold index map and info->lock over from finding the entry.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Hugh Dickins 2011-05-11 15:13:37 -07:00 committed by Linus Torvalds
parent b1dea800ac
commit 778dd893ae
1 changed files with 43 additions and 45 deletions

View File

@ -852,7 +852,7 @@ static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_
static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
{ {
struct inode *inode; struct address_space *mapping;
unsigned long idx; unsigned long idx;
unsigned long size; unsigned long size;
unsigned long limit; unsigned long limit;
@ -875,8 +875,10 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
if (size > SHMEM_NR_DIRECT) if (size > SHMEM_NR_DIRECT)
size = SHMEM_NR_DIRECT; size = SHMEM_NR_DIRECT;
offset = shmem_find_swp(entry, ptr, ptr+size); offset = shmem_find_swp(entry, ptr, ptr+size);
if (offset >= 0) if (offset >= 0) {
shmem_swp_balance_unmap();
goto found; goto found;
}
if (!info->i_indirect) if (!info->i_indirect)
goto lost2; goto lost2;
@ -914,11 +916,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
if (size > ENTRIES_PER_PAGE) if (size > ENTRIES_PER_PAGE)
size = ENTRIES_PER_PAGE; size = ENTRIES_PER_PAGE;
offset = shmem_find_swp(entry, ptr, ptr+size); offset = shmem_find_swp(entry, ptr, ptr+size);
shmem_swp_unmap(ptr);
if (offset >= 0) { if (offset >= 0) {
shmem_dir_unmap(dir); shmem_dir_unmap(dir);
goto found; goto found;
} }
shmem_swp_unmap(ptr);
} }
} }
lost1: lost1:
@ -928,8 +930,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
return 0; return 0;
found: found:
idx += offset; idx += offset;
inode = igrab(&info->vfs_inode); ptr += offset;
spin_unlock(&info->lock);
/* /*
* Move _head_ to start search for next from here. * Move _head_ to start search for next from here.
@ -940,37 +941,18 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
*/ */
if (shmem_swaplist.next != &info->swaplist) if (shmem_swaplist.next != &info->swaplist)
list_move_tail(&shmem_swaplist, &info->swaplist); list_move_tail(&shmem_swaplist, &info->swaplist);
mutex_unlock(&shmem_swaplist_mutex);
error = 1;
if (!inode)
goto out;
/* /*
* Charge page using GFP_KERNEL while we can wait. * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
* Charged back to the user(not to caller) when swap account is used. * but also to hold up shmem_evict_inode(): so inode cannot be freed
* add_to_page_cache() will be called with GFP_NOWAIT. * beneath us (pagelock doesn't help until the page is in pagecache).
*/ */
error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); mapping = info->vfs_inode.i_mapping;
if (error) error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
goto out; /* which does mem_cgroup_uncharge_cache_page on error */
error = radix_tree_preload(GFP_KERNEL);
if (error) {
mem_cgroup_uncharge_cache_page(page);
goto out;
}
error = 1;
spin_lock(&info->lock);
ptr = shmem_swp_entry(info, idx, NULL);
if (ptr && ptr->val == entry.val) {
error = add_to_page_cache_locked(page, inode->i_mapping,
idx, GFP_NOWAIT);
/* does mem_cgroup_uncharge_cache_page on error */
} else /* we must compensate for our precharge above */
mem_cgroup_uncharge_cache_page(page);
if (error == -EEXIST) { if (error == -EEXIST) {
struct page *filepage = find_get_page(inode->i_mapping, idx); struct page *filepage = find_get_page(mapping, idx);
error = 1; error = 1;
if (filepage) { if (filepage) {
/* /*
@ -990,14 +972,8 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
swap_free(entry); swap_free(entry);
error = 1; /* not an error, but entry was found */ error = 1; /* not an error, but entry was found */
} }
if (ptr)
shmem_swp_unmap(ptr); shmem_swp_unmap(ptr);
spin_unlock(&info->lock); spin_unlock(&info->lock);
radix_tree_preload_end();
out:
unlock_page(page);
page_cache_release(page);
iput(inode); /* allows for NULL */
return error; return error;
} }
@ -1009,6 +985,26 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
struct list_head *p, *next; struct list_head *p, *next;
struct shmem_inode_info *info; struct shmem_inode_info *info;
int found = 0; int found = 0;
int error;
/*
* Charge page using GFP_KERNEL while we can wait, before taking
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
* add_to_page_cache() will be called with GFP_NOWAIT.
*/
error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
if (error)
goto out;
/*
* Try to preload while we can wait, to not make a habit of
* draining atomic reserves; but don't latch on to this cpu,
* it's okay if sometimes we get rescheduled after this.
*/
error = radix_tree_preload(GFP_KERNEL);
if (error)
goto uncharge;
radix_tree_preload_end();
mutex_lock(&shmem_swaplist_mutex); mutex_lock(&shmem_swaplist_mutex);
list_for_each_safe(p, next, &shmem_swaplist) { list_for_each_safe(p, next, &shmem_swaplist) {
@ -1016,17 +1012,19 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
found = shmem_unuse_inode(info, entry, page); found = shmem_unuse_inode(info, entry, page);
cond_resched(); cond_resched();
if (found) if (found)
goto out; break;
} }
mutex_unlock(&shmem_swaplist_mutex); mutex_unlock(&shmem_swaplist_mutex);
/*
* Can some race bring us here? We've been holding page lock, uncharge:
* so I think not; but would rather try again later than BUG() if (!found)
*/ mem_cgroup_uncharge_cache_page(page);
if (found < 0)
error = found;
out:
unlock_page(page); unlock_page(page);
page_cache_release(page); page_cache_release(page);
out: return error;
return (found < 0) ? found : 0;
} }
/* /*