for-5.19-rc3-tag

-----BEGIN PGP SIGNATURE-----

iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmK4dV4ACgkQxWXV+ddt
WDs4uQ/7B0XqPK05NJntJfwnuIoT/yOreKf47wt/6DyFV3CDMFte/qzaZwthwu6P
F0GMpSYAlVszLlML5elvF9VXymlV+e+QROtbD6QCNLNW1IwHA7ZiF5fV/a1Rj930
XSuaDyVFPAK7892RR6yMQ20IeMBuvqiAhXWEzaIJ2tIcAHn+fP+VkY8Nc0aZj3iC
mI+ep4n93karDxmnHVGUxJTxAe0l/uNopx+fYBWQDj7HuoMLo0Cu+rAdv0gRIxi2
RWUBkR4e4PBwV1OFScwNCsljjt6bHdUHrtdB3fo5Hzu9cO5hHdL7NEsKB1K2w7rV
bgNuNqfj6Y4xUBchAfQO5CCJ9ISci5KoJ4RBpk6EprZR3QN40kN8GPlhi2519K7w
F3d8jolDDHlkqxIsqoe47MYOcSepNEadVNsiYKb0rM6doilfxyXiu6dtTFMrC8Vy
K2HDCdTyuIgw+TnwqT1puaUwxiIL8DFJf1CVyjwGuQ4UgaIEkHXKIsCssyyJ76Jh
QkWX1aeRldbfkVArJWHQWqDQopx9pFBz1gjlws0YjAsU5YijOOXva464P9Rxg+Gq
4pRlgnO48joQam9bRirP2Z6yhqa4O6jkzKDOXSYduAUYD7IMfpsYnz09wKS95jj+
QCrR7VmKnpQdsXg5a/mqyacfIH30ph002VywRxPiFM89Syd25yo=
=rUrf
-----END PGP SIGNATURE-----

Merge tag 'for-5.19-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - zoned relocation fixes:
      - fix critical section end for extent writeback, this could lead
        to out of order write
      - prevent writing to previous data relocation block group if
        space gets low

 - reflink fixes:
      - fix race between reflinking and ordered extent completion
      - proper error handling when block reserve migration fails
      - add missing inode iversion/mtime/ctime updates on each
        iteration when replacing extents

 - fix deadlock when running fsync/fiemap/commit at the same time

 - fix false-positive KCSAN report regarding pid tracking for read
   locks and data race

 - minor documentation update and link to new site

* tag 'for-5.19-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  Documentation: update btrfs list of features and link to readthedocs.io
  btrfs: fix deadlock with fsync+fiemap+transaction commit
  btrfs: don't set lock_owner when locking extent buffer for reading
  btrfs: zoned: fix critical section of relocation inode writeback
  btrfs: zoned: prevent allocation from previous data relocation BG
  btrfs: do not BUG_ON() on failure to migrate space when replacing extents
  btrfs: add missing inode updates on each iteration when replacing extents
  btrfs: fix race between reflinking and ordered extent completion

commit 82708bb1eb
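
The fsync deadlock fixed here needs three paths racing: an fsync holding a
transaction handle while waiting on ordered extents, a task such as
fallocate() holding an extent lock while trying to start a transaction, and a
concurrent transaction commit. A hypothetical stress sketch (not the original
reproducer) that hammers those three paths against a file on an assumed btrfs
mount at /mnt/btrfs:

/* Hypothetical stress sketch, not the original reproducer. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/fiemap.h>
#include <linux/fs.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

static const char *path = "/mnt/btrfs/testfile";	/* assumed mount point */

static void *do_fsync(void *arg)
{
	int fd = open(path, O_RDWR | O_CREAT, 0644);
	char buf[4096];

	memset(buf, 0xab, sizeof(buf));
	for (;;) {
		/* Dirty a random range, then fsync it. */
		pwrite(fd, buf, sizeof(buf), rand() % (1 << 20));
		fsync(fd);
	}
	return NULL;
}

static void *do_fallocate(void *arg)
{
	int fd = open(path, O_RDWR | O_CREAT, 0644);

	for (;;)	/* holds extent locks while starting transactions */
		fallocate(fd, 0, rand() % (1 << 20), 4096);
	return NULL;
}

static void *do_fiemap(void *arg)
{
	int fd = open(path, O_RDONLY);
	struct fiemap fm = { .fm_length = ~0ULL };	/* count extents only */

	for (;;)
		ioctl(fd, FS_IOC_FIEMAP, &fm);
	return NULL;
}

int main(void)
{
	pthread_t t[3];

	pthread_create(&t[0], NULL, do_fsync, NULL);
	pthread_create(&t[1], NULL, do_fallocate, NULL);
	pthread_create(&t[2], NULL, do_fiemap, NULL);
	pthread_join(t[0], NULL);	/* runs until the box hangs or ^C */
	return 0;
}

On an unpatched kernel a run like this could eventually wedge with the fsync
task holding its transaction open; the fix below reorders the fsync path so
the handle is dropped before any waiting.
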
--- a/Documentation/filesystems/btrfs.rst
+++ b/Documentation/filesystems/btrfs.rst
@@ -19,13 +19,23 @@ The main Btrfs features include:
     * Subvolumes (separate internal filesystem roots)
     * Object level mirroring and striping
     * Checksums on data and metadata (multiple algorithms available)
-    * Compression
+    * Compression (multiple algorithms available)
+    * Reflink, deduplication
+    * Scrub (on-line checksum verification)
+    * Hierarchical quota groups (subvolume and snapshot support)
     * Integrated multiple device support, with several raid algorithms
     * Offline filesystem check
-    * Efficient incremental backup and FS mirroring
+    * Efficient incremental backup and FS mirroring (send/receive)
+    * Trim/discard
     * Online filesystem defragmentation
+    * Swapfile support
+    * Zoned mode
+    * Read/write metadata verification
     * Online resize (shrink, grow)
 
-    For more information please refer to the wiki
+    For more information please refer to the documentation site or wiki
 
+      https://btrfs.readthedocs.io
+
       https://btrfs.wiki.kernel.org
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -104,6 +104,7 @@ struct btrfs_block_group {
 	unsigned int relocating_repair:1;
 	unsigned int chunk_item_inserted:1;
 	unsigned int zone_is_active:1;
+	unsigned int zoned_data_reloc_ongoing:1;
 
 	int disk_cache_state;
 
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1330,6 +1330,8 @@ struct btrfs_replace_extent_info {
 	 * existing extent into a file range.
 	 */
	bool is_new_extent;
+	/* Indicate if we should update the inode's mtime and ctime. */
+	bool update_times;
 	/* Meaningful only if is_new_extent is true. */
 	int qgroup_reserved;
 	/*
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3832,7 +3832,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
 	       block_group->start == fs_info->data_reloc_bg ||
 	       fs_info->data_reloc_bg == 0);
 
-	if (block_group->ro) {
+	if (block_group->ro || block_group->zoned_data_reloc_ongoing) {
 		ret = 1;
 		goto out;
 	}
@@ -3894,8 +3894,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
 out:
 	if (ret && ffe_ctl->for_treelog)
 		fs_info->treelog_bg = 0;
-	if (ret && ffe_ctl->for_data_reloc)
+	if (ret && ffe_ctl->for_data_reloc &&
+	    fs_info->data_reloc_bg == block_group->start) {
+		/*
+		 * Do not allow further allocations from this block group.
+		 * Compared to increasing the ->ro, setting the
+		 * ->zoned_data_reloc_ongoing flag still allows nocow
+		 * writers to come in. See btrfs_inc_nocow_writers().
+		 *
+		 * We need to disable an allocation to avoid an allocation of
+		 * regular (non-relocation data) extent. With mix of relocation
+		 * extents and regular extents, we can dispatch WRITE commands
+		 * (for relocation extents) and ZONE APPEND commands (for
+		 * regular extents) at the same time to the same zone, which
+		 * easily break the write pointer.
+		 */
+		block_group->zoned_data_reloc_ongoing = 1;
 		fs_info->data_reloc_bg = 0;
+	}
 	spin_unlock(&fs_info->relocation_bg_lock);
 	spin_unlock(&fs_info->treelog_bg_lock);
 	spin_unlock(&block_group->lock);
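
The comment above is the heart of the zoned fix: on a zoned device each zone
has a write pointer, a regular WRITE must land exactly at that pointer, while
ZONE APPEND lets the device pick the location and advance the pointer itself.
Mixing the two streams into one zone makes relocation's pre-computed WRITE
offsets stale. A toy model, with invented names, of why the interleave fails:

#include <stdbool.h>
#include <stdio.h>

/* Toy model of one zone's write pointer; all names are invented. */
struct zone {
	unsigned long long wp;	/* next writable offset in the zone */
};

/* Relocation I/O: a regular WRITE must land exactly at the write pointer. */
static bool zone_write(struct zone *z, unsigned long long offset,
		       unsigned long long len)
{
	if (offset != z->wp) {
		printf("WRITE at %llu rejected, wp is %llu\n", offset, z->wp);
		return false;	/* "unaligned write" error on real hardware */
	}
	z->wp += len;
	return true;
}

/* Regular data I/O: ZONE APPEND writes at wp and advances it. */
static unsigned long long zone_append(struct zone *z, unsigned long long len)
{
	unsigned long long where = z->wp;

	z->wp += len;
	return where;
}

int main(void)
{
	struct zone z = { .wp = 0 };

	/* Relocation planned two consecutive WRITEs at 0 and 4096 ... */
	zone_write(&z, 0, 4096);
	/* ... but a regular extent sneaks in via ZONE APPEND ... */
	zone_append(&z, 4096);
	/* ... so the second planned WRITE no longer matches the wp. */
	zone_write(&z, 4096, 4096);
	return 0;
}

Setting zoned_data_reloc_ongoing keeps regular allocations out of the block
group until every queued relocation extent has been written.
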
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5241,13 +5241,14 @@ int extent_writepages(struct address_space *mapping,
 	 */
 	btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
 	ret = extent_write_cache_pages(mapping, wbc, &epd);
-	btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
 	ASSERT(ret <= 0);
 	if (ret < 0) {
+		btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
 		end_write_bio(&epd, ret);
 		return ret;
 	}
 	flush_write_bio(&epd);
+	btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
 	return ret;
 }
 
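
This change is purely about where the critical section ends: the pages are
only actually submitted to the device by flush_write_bio(), so dropping
btrfs_zoned_data_reloc_lock() right after extent_write_cache_pages() lets
another writer submit to the same zone in between, reordering writes at the
write pointer. The shape of the fix, as a generic sketch with stand-in names:

#include <pthread.h>

static pthread_mutex_t reloc_lock = PTHREAD_MUTEX_INITIALIZER;

static void stage_pages(void) { }	/* analog of extent_write_cache_pages() */
static void submit_bio(void) { }	/* analog of flush_write_bio() */

void writeback_fixed(void)
{
	pthread_mutex_lock(&reloc_lock);
	stage_pages();
	submit_bio();	/* final submission still inside the lock */
	pthread_mutex_unlock(&reloc_lock);
}

int main(void)
{
	writeback_fixed();
	return 0;
}
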
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2323,25 +2323,62 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 */
 	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
 
-	if (ret != BTRFS_NO_LOG_SYNC) {
-		if (!ret) {
-			ret = btrfs_sync_log(trans, root, &ctx);
-			if (!ret) {
-				ret = btrfs_end_transaction(trans);
-				goto out;
-			}
-		}
-		if (!full_sync) {
-			ret = btrfs_wait_ordered_range(inode, start, len);
-			if (ret) {
-				btrfs_end_transaction(trans);
-				goto out;
-			}
-		}
-		ret = btrfs_commit_transaction(trans);
-	} else {
-		ret = btrfs_end_transaction(trans);
-	}
+	if (ret == BTRFS_NO_LOG_SYNC) {
+		ret = btrfs_end_transaction(trans);
+		goto out;
+	}
+
+	/* We successfully logged the inode, attempt to sync the log. */
+	if (!ret) {
+		ret = btrfs_sync_log(trans, root, &ctx);
+		if (!ret) {
+			ret = btrfs_end_transaction(trans);
+			goto out;
+		}
+	}
+
+	/*
+	 * At this point we need to commit the transaction because we had
+	 * btrfs_need_log_full_commit() or some other error.
+	 *
+	 * If we didn't do a full sync we have to stop the trans handle, wait on
+	 * the ordered extents, start it again and commit the transaction. If
+	 * we attempt to wait on the ordered extents here we could deadlock with
+	 * something like fallocate() that is holding the extent lock trying to
+	 * start a transaction while some other thread is trying to commit the
+	 * transaction while we (fsync) are currently holding the transaction
+	 * open.
+	 */
+	if (!full_sync) {
+		ret = btrfs_end_transaction(trans);
+		if (ret)
+			goto out;
+		ret = btrfs_wait_ordered_range(inode, start, len);
+		if (ret)
+			goto out;
+
+		/*
+		 * This is safe to use here because we're only interested in
+		 * making sure the transaction that had the ordered extents is
+		 * committed. We aren't waiting on anything past this point,
+		 * we're purely getting the transaction and committing it.
+		 */
+		trans = btrfs_attach_transaction_barrier(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+
+			/*
+			 * We committed the transaction and there's no currently
+			 * running transaction, this means everything we care
+			 * about made it to disk and we are done.
+			 */
+			if (ret == -ENOENT)
+				ret = 0;
+			goto out;
+		}
+	}
+
+	ret = btrfs_commit_transaction(trans);
 out:
 	ASSERT(list_empty(&ctx.list));
 	err = file_check_and_advance_wb_err(file);
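
The rule the rewrite enforces: never wait for ordered extents while still
holding a transaction handle, because completing an ordered extent may itself
need to join, and the committer may need to finish, that same transaction. A
userspace sketch of the safe ordering, with hypothetical stubs standing in
for the kernel API:

#include <stdio.h>

struct trans { int id; };
static struct trans running = { .id = 1 };

static void end_transaction(struct trans *t)	{ printf("drop handle on trans %d\n", t->id); }
static void wait_ordered_range(void)		{ printf("wait for ordered extents, no handle held\n"); }
static struct trans *attach_transaction(void)	{ return &running; }	/* NULL if none running */
static void commit_transaction(struct trans *t)	{ printf("commit trans %d\n", t->id); }

int main(void)
{
	struct trans *t = &running;

	end_transaction(t);		/* 1. release our handle first        */
	wait_ordered_range();		/* 2. now waiting cannot block a
					 *    commit that we depend on       */
	t = attach_transaction();	/* 3. re-attach to the current
					 *    transaction, if one remains    */
	if (t)
		commit_transaction(t);	/* 4. and commit it                  */
	else
		printf("already committed, nothing to do\n");
	return 0;
}
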
@@ -2719,7 +2756,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
 
 	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
 				      min_size, false);
-	BUG_ON(ret);
+	if (WARN_ON(ret))
+		goto out_trans;
 	trans->block_rsv = rsv;
 
 	cur_offset = start;
@@ -2803,6 +2841,25 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
 			extent_info->file_offset += replace_len;
 		}
 
+		/*
+		 * We are releasing our handle on the transaction, balance the
+		 * dirty pages of the btree inode and flush delayed items, and
+		 * then get a new transaction handle, which may now point to a
+		 * new transaction in case someone else may have committed the
+		 * transaction we used to replace/drop file extent items. So
+		 * bump the inode's iversion and update mtime and ctime except
+		 * if we are called from a dedupe context. This is because a
+		 * power failure/crash may happen after the transaction is
+		 * committed and before we finish replacing/dropping all the
+		 * file extent items we need.
+		 */
+		inode_inc_iversion(&inode->vfs_inode);
+
+		if (!extent_info || extent_info->update_times) {
+			inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
+			inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
+		}
+
 		ret = btrfs_update_inode(trans, root, inode);
 		if (ret)
 			break;
@@ -2819,7 +2876,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
 
 		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
 					      rsv, min_size, false);
-		BUG_ON(ret);	/* shouldn't happen */
+		if (WARN_ON(ret))
+			break;
 		trans->block_rsv = rsv;
 
 		cur_offset = drop_args.drop_end;
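
Both BUG_ON() removals follow the same rule: a failed block reserve migration
is not worth crashing the machine for, since the loop can unwind, release the
transaction and surface the error to the caller. A userspace analog of the
WARN_ON()-and-unwind pattern (the macro here is a simplified stand-in, not
the kernel's):

#include <stdio.h>

/* Simplified stand-in for the kernel's WARN_ON(): report, don't crash. */
#define WARN_ON(cond) ({						\
	int __c = !!(cond);						\
	if (__c)							\
		fprintf(stderr, "warning: %s failed at %s:%d\n",	\
			#cond, __FILE__, __LINE__);			\
	__c;								\
})

static int migrate_reserve(void) { return -1; /* pretend it failed */ }

static int do_one_iteration(void)
{
	int ret = migrate_reserve();

	if (WARN_ON(ret))
		return ret;	/* unwind instead of a BUG_ON() panic */
	/* ... continue replacing extents ... */
	return 0;
}

int main(void)
{
	return do_one_iteration() ? 1 : 0;
}
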
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3195,6 +3195,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 						ordered_extent->file_offset,
 						ordered_extent->file_offset +
 						logical_len);
+		btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
+						  ordered_extent->disk_num_bytes);
 	} else {
 		BUG_ON(root == fs_info->tree_root);
 		ret = insert_ordered_extent_file_extent(trans, ordered_extent);
@@ -9897,6 +9899,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
 	extent_info.file_offset = file_offset;
 	extent_info.extent_buf = (char *)&stack_fi;
 	extent_info.is_new_extent = true;
+	extent_info.update_times = true;
 	extent_info.qgroup_reserved = qgroup_released;
 	extent_info.insertions = 0;
 
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -45,7 +45,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne
 		start_ns = ktime_get_ns();
 
 	down_read_nested(&eb->lock, nest);
-	eb->lock_owner = current->pid;
 	trace_btrfs_tree_read_lock(eb, start_ns);
 }
 
@@ -62,7 +61,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb)
 int btrfs_try_tree_read_lock(struct extent_buffer *eb)
 {
 	if (down_read_trylock(&eb->lock)) {
-		eb->lock_owner = current->pid;
 		trace_btrfs_try_tree_read_lock(eb);
 		return 1;
 	}
@@ -90,7 +88,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
 void btrfs_tree_read_unlock(struct extent_buffer *eb)
 {
 	trace_btrfs_tree_read_unlock(eb);
-	eb->lock_owner = 0;
 	up_read(&eb->lock);
 }
 
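
The locking change removes writes to eb->lock_owner under a read lock:
down_read() admits many readers at once, so two readers could store their
pids to the same field concurrently. That is harmless as a debug hint but a
genuine data race, which is what KCSAN flagged. A userspace analog of the
racy pattern being deleted:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long lock_owner;	/* plain field, no atomics */

static void *reader(void *arg)
{
	for (int i = 0; i < 100000; i++) {
		pthread_rwlock_rdlock(&lock);
		/* Both readers can be here at once: unsynchronized
		 * stores to the same variable, i.e. a data race. */
		lock_owner = (unsigned long)pthread_self();
		pthread_rwlock_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, reader, NULL);
	pthread_create(&b, NULL, reader, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("last owner: %lx\n", lock_owner);
	return 0;
}

Building this with -fsanitize=thread reports the race the same way KCSAN
does in the kernel; the rwlock itself is perfectly fine, only the owner
bookkeeping is racy for readers.
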
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -344,6 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 	int ret;
 	const u64 len = olen_aligned;
 	u64 last_dest_end = destoff;
+	u64 prev_extent_end = off;
 
 	ret = -ENOMEM;
 	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
@@ -363,7 +364,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 	key.offset = off;
 
 	while (1) {
-		u64 next_key_min_offset = key.offset + 1;
 		struct btrfs_file_extent_item *extent;
 		u64 extent_gen;
 		int type;
@@ -431,14 +431,21 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 		 * The first search might have left us at an extent item that
 		 * ends before our target range's start, can happen if we have
 		 * holes and NO_HOLES feature enabled.
+		 *
+		 * Subsequent searches may leave us on a file range we have
+		 * processed before - this happens due to a race with ordered
+		 * extent completion for a file range that is outside our source
+		 * range, but that range was part of a file extent item that
+		 * also covered a leading part of our source range.
 		 */
-		if (key.offset + datal <= off) {
+		if (key.offset + datal <= prev_extent_end) {
 			path->slots[0]++;
 			goto process_slot;
 		} else if (key.offset >= off + len) {
 			break;
 		}
-		next_key_min_offset = key.offset + datal;
+
+		prev_extent_end = key.offset + datal;
 		size = btrfs_item_size(leaf, slot);
 		read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
 				   size);
@@ -489,6 +496,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 			clone_info.file_offset = new_key.offset;
 			clone_info.extent_buf = buf;
 			clone_info.is_new_extent = false;
+			clone_info.update_times = !no_time_update;
 			ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
 					drop_start, new_key.offset + datal - 1,
 					&clone_info, &trans);
@@ -550,7 +558,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 			break;
 
 		btrfs_release_path(path);
-		key.offset = next_key_min_offset;
+		key.offset = prev_extent_end;
 
 		if (fatal_signal_pending(current)) {
 			ret = -EINTR;
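
The reflink fix replaces "resume after the last key we saw" with "resume
after the last file range we processed": an ordered extent completing
elsewhere in the file can split a file extent item, so a later search can
land on an item that still overlaps a range already cloned. btrfs_clone()
therefore tracks prev_extent_end as a monotonically advancing watermark. The
same idea on a plain sorted array, with invented names:

#include <stdio.h>

struct extent { unsigned long long start, len; };

static void clone_range(const struct extent *e)
{
	printf("clone [%llu, %llu)\n", e->start, e->start + e->len);
}

int main(void)
{
	/* The second entry overlaps the first, as if a rescan after a
	 * racing split handed us part of a range we already handled. */
	const struct extent items[] = {
		{ 0, 8192 }, { 4096, 4096 }, { 8192, 4096 },
	};
	unsigned long long prev_extent_end = 0;

	for (int i = 0; i < 3; i++) {
		/* analog of: if (key.offset + datal <= prev_extent_end) */
		if (items[i].start + items[i].len <= prev_extent_end)
			continue;	/* already processed, skip it */
		clone_range(&items[i]);
		prev_extent_end = items[i].start + items[i].len;
	}
	return 0;
}

With the old "key.offset + 1" resume point the overlapping middle item would
be cloned a second time; the watermark makes the skip test immune to rescans.
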
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2139,3 +2139,30 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
 	factor = div64_u64(used * 100, total);
 	return factor >= fs_info->bg_reclaim_threshold;
 }
+
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+				       u64 length)
+{
+	struct btrfs_block_group *block_group;
+
+	if (!btrfs_is_zoned(fs_info))
+		return;
+
+	block_group = btrfs_lookup_block_group(fs_info, logical);
+	/* It should be called on a previous data relocation block group. */
+	ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
+
+	spin_lock(&block_group->lock);
+	if (!block_group->zoned_data_reloc_ongoing)
+		goto out;
+
+	/* All relocation extents are written. */
+	if (block_group->start + block_group->alloc_offset == logical + length) {
+		/* Now, release this block group for further allocations. */
+		block_group->zoned_data_reloc_ongoing = 0;
+	}
+
+out:
+	spin_unlock(&block_group->lock);
+	btrfs_put_block_group(block_group);
+}
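
As the inode.c hunk above shows, this helper runs from ordered extent
completion, and it clears the flag only when the just-finished extent ends
exactly at the block group's allocation offset, i.e. it was the last
relocation write queued to that zone. A small worked check of that
condition, using made-up numbers:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	/* Made-up numbers for the release check. */
	unsigned long long bg_start = 1ULL << 30;	/* block group start */
	unsigned long long alloc_offset = 1ULL << 20;	/* bytes allocated   */
	unsigned long long logical = bg_start + (1ULL << 20) - 4096;
	unsigned long long length = 4096;	/* the finished extent */

	/* All relocation extents are written once the finished extent
	 * ends exactly at start + alloc_offset. */
	bool release = (bg_start + alloc_offset == logical + length);

	printf("release block group: %s\n", release ? "yes" : "no");
	return 0;
}
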
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -77,6 +77,8 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
 void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
 void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
 bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+				       u64 length);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
 				     struct blk_zone *zone)
@@ -243,6 +245,9 @@ static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
 {
 	return false;
 }
+
+static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info,
+						     u64 logical, u64 length) { }
 #endif
 
 static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)