btrfs: introduce end_bio_subpage_eb_writepage() function

The new function, end_bio_subpage_eb_writepage(), will handle the
metadata writeback endio for the subpage case (sectorsize < PAGE_SIZE).

The major differences involved are:

- How to grab the extent buffer
  Now that page::private is a pointer to btrfs_subpage, we can no longer
  grab the extent buffer directly.
  Thus we need to use the bv_offset to locate the extent buffer manually
  and iterate through the whole bvec range.

- Use the btrfs_subpage_clear_writeback() helper
  This helper will handle the subpage writeback for us.

Since this function is executed under endio context, when grabbing
extent buffers it can't grab eb->refs_lock as that lock is not designed
to be grabbed under hardirq context.

So introduce a helper, find_extent_buffer_nolock(), for such
situations, and convert find_extent_buffer() to use that helper.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
Qu Wenruo 2021-04-06 08:36:00 +08:00 committed by David Sterba
parent fb686c6824
commit 2f3186d8ee
1 changed files with 106 additions and 29 deletions

View File

@ -4080,13 +4080,98 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
} }
} }
/*
 * Look up the extent buffer starting at @start without touching any spinlock
 * that is unsafe in endio context.
 *
 * The lookup in fs_info->buffer_radix is done under rcu_read_lock(), using
 * @start >> fs_info->sectorsize_bits as the index.
 *
 * Return the extent buffer with eb->refs incremented on success.
 * Return NULL if no extent buffer is found or its refs has already dropped
 * to zero.
 */
static struct extent_buffer *find_extent_buffer_nolock(
		struct btrfs_fs_info *fs_info, u64 start)
{
	struct extent_buffer *found;

	rcu_read_lock();
	found = radix_tree_lookup(&fs_info->buffer_radix,
				  start >> fs_info->sectorsize_bits);
	/* Only hand out the eb if we managed to take a reference on it. */
	if (!found || !atomic_inc_not_zero(&found->refs))
		found = NULL;
	rcu_read_unlock();

	return found;
}
/*
 * The endio function for subpage extent buffer writes.
 *
 * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
 * after all extent buffers in the page have finished their writeback.
 *
 * For every bvec of @bio, walk all extent buffers inside the bvec range,
 * finish their writeback and finally drop the bio with bio_put().
 */
static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info,
					 struct bio *bio)
{
	struct bvec_iter_all iter_all;
	struct bio_vec *bvec;

	ASSERT(!bio_flagged(bio, BIO_CLONED));
	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page;
		u64 range_start = page_offset(page) + bvec->bv_offset;
		u64 range_end = range_start + bvec->bv_len - 1;
		u64 cur = range_start;

		ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));

		/* Visit each extent buffer covered by this bvec, in order. */
		while (cur <= range_end) {
			struct extent_buffer *eb;
			bool write_failed;
			int done;

			/*
			 * find_extent_buffer() may take eb->refs_lock, which
			 * is not safe in endio context, so use the lockless
			 * lookup instead.
			 */
			eb = find_extent_buffer_nolock(fs_info, cur);
			ASSERT(eb);

			/*
			 * Advance to the end of this eb now, while we still
			 * hold our reference on it.
			 */
			cur = eb->start + eb->len;

			ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
			done = atomic_dec_and_test(&eb->io_pages);
			ASSERT(done);

			write_failed = bio->bi_status ||
				test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
			if (write_failed) {
				ClearPageUptodate(page);
				set_btree_ioerr(page, eb);
			}

			btrfs_subpage_clear_writeback(fs_info, page, eb->start,
						      eb->len);
			end_extent_buffer_writeback(eb);

			/*
			 * free_extent_buffer() would grab a spinlock, which
			 * is not safe in endio context, so drop the reference
			 * manually.
			 */
			atomic_dec(&eb->refs);
		}
	}
	bio_put(bio);
}
static void end_bio_extent_buffer_writepage(struct bio *bio) static void end_bio_extent_buffer_writepage(struct bio *bio)
{ {
struct btrfs_fs_info *fs_info;
struct bio_vec *bvec; struct bio_vec *bvec;
struct extent_buffer *eb; struct extent_buffer *eb;
int done; int done;
struct bvec_iter_all iter_all; struct bvec_iter_all iter_all;
fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
if (fs_info->sectorsize < PAGE_SIZE)
return end_bio_subpage_eb_writepage(fs_info, bio);
ASSERT(!bio_flagged(bio, BIO_CLONED)); ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) { bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page; struct page *page = bvec->bv_page;
@ -5465,36 +5550,28 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
{ {
struct extent_buffer *eb; struct extent_buffer *eb;
rcu_read_lock(); eb = find_extent_buffer_nolock(fs_info, start);
eb = radix_tree_lookup(&fs_info->buffer_radix, if (!eb)
start >> fs_info->sectorsize_bits); return NULL;
if (eb && atomic_inc_not_zero(&eb->refs)) { /*
rcu_read_unlock(); * Lock our eb's refs_lock to avoid races with free_extent_buffer().
/* * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
* Lock our eb's refs_lock to avoid races with * another task running free_extent_buffer() might have seen that flag
* free_extent_buffer. When we get our eb it might be flagged * set, eb->refs == 2, that the buffer isn't under IO (dirty and
* with EXTENT_BUFFER_STALE and another task running * writeback flags not set) and it's still in the tree (flag
* free_extent_buffer might have seen that flag set, * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
* eb->refs == 2, that the buffer isn't under IO (dirty and * decrementing the extent buffer's reference count twice. So here we
* writeback flags not set) and it's still in the tree (flag * could race and increment the eb's reference count, clear its stale
* EXTENT_BUFFER_TREE_REF set), therefore being in the process * flag, mark it as dirty and drop our reference before the other task
* of decrementing the extent buffer's reference count twice. * finishes executing free_extent_buffer, which would later result in
* So here we could race and increment the eb's reference count, * an attempt to free an extent buffer that is dirty.
* clear its stale flag, mark it as dirty and drop our reference */
* before the other task finishes executing free_extent_buffer, if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
* which would later result in an attempt to free an extent spin_lock(&eb->refs_lock);
* buffer that is dirty. spin_unlock(&eb->refs_lock);
*/
if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
spin_lock(&eb->refs_lock);
spin_unlock(&eb->refs_lock);
}
mark_extent_buffer_accessed(eb, NULL);
return eb;
} }
rcu_read_unlock(); mark_extent_buffer_accessed(eb, NULL);
return eb;
return NULL;
} }
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS