From a7176f74fa570c83a48e66435b77651ad8fc5aee Mon Sep 17 00:00:00 2001 From: Lu Fengqi Date: Sat, 4 Aug 2018 21:10:53 +0800 Subject: [PATCH 01/93] btrfs: simplify the send_in_progress check in btrfs_delete_subvolume Only when send is in progress do we have to do something different, namely call btrfs_warn() and return -EPERM. Therefore, check send_in_progress first and do the error handling right after taking the root_item_lock. This is just for better readability; no functional change. Signed-off-by: Lu Fengqi Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3ea5339603cf..8b3d719e229d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4270,18 +4270,17 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) * again is not run concurrently. */ spin_lock(&dest->root_item_lock); - root_flags = btrfs_root_flags(&dest->root_item); - if (dest->send_in_progress == 0) { - btrfs_set_root_flags(&dest->root_item, - root_flags | BTRFS_ROOT_SUBVOL_DEAD); - spin_unlock(&dest->root_item_lock); - } else { + if (dest->send_in_progress) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", dest->root_key.objectid); return -EPERM; } + root_flags = btrfs_root_flags(&dest->root_item); + btrfs_set_root_flags(&dest->root_item, + root_flags | BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); down_write(&fs_info->subvol_sem); From 3a58417486ca99a3bfef40e309f38adb45a5171d Mon Sep 17 00:00:00 2001 From: Lu Fengqi Date: Sat, 4 Aug 2018 21:10:55 +0800 Subject: [PATCH 02/93] btrfs: switch update_size to bool in btrfs_block_rsv_migrate and btrfs_rsv_add_bytes Using true and false here is closer to the expected semantic than using 0 and 1. No functional change. Signed-off-by: Lu Fengqi Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/delayed-inode.c | 4 ++-- fs/btrfs/extent-tree.c | 16 ++++++++-------- fs/btrfs/file.c | 4 ++-- fs/btrfs/inode.c | 6 +++--- fs/btrfs/relocation.c | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2cddfe7806a4..3dd4d0cb0c7e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2771,7 +2771,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes, - int update_size); + bool update_size); int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *dest, u64 num_bytes, int min_factor); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f51b509f2d9b..584cb103955e 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -559,7 +559,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, * reserved space when starting a transaction. So no need to reserve * qgroup space here.
*/ - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, @@ -647,7 +647,7 @@ static int btrfs_delayed_inode_reserve_metadata( return ret; } - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_inode", btrfs_ino(inode), num_bytes, 1); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2d9074295d7f..26616712de13 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5284,7 +5284,7 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, } static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int update_size) + u64 num_bytes, bool update_size) { spin_lock(&block_rsv->lock); block_rsv->reserved += num_bytes; @@ -5316,7 +5316,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, global_rsv->full = 0; spin_unlock(&global_rsv->lock); - block_rsv_add_bytes(dest, num_bytes, 1); + block_rsv_add_bytes(dest, num_bytes, true); return 0; } @@ -5479,7 +5479,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, struct btrfs_block_rsv *dst, u64 num_bytes, - int update_size) + bool update_size) { int ret; @@ -5540,7 +5540,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 1); + block_rsv_add_bytes(block_rsv, num_bytes, true); return 0; } @@ -5587,7 +5587,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 0); + block_rsv_add_bytes(block_rsv, num_bytes, false); return 0; } @@ -5629,7 +5629,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, return ret; ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 0); + block_rsv_add_bytes(block_rsv, num_bytes, false); trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), num_bytes, 1); @@ -5835,7 +5835,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, BTRFS_RESERVE_FLUSH_ALL); if (ret == -ENOSPC && use_global_rsv) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); if (ret && qgroup_num_bytes) btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); @@ -8215,7 +8215,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, static void unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u32 blocksize) { - block_rsv_add_bytes(block_rsv, blocksize, 0); + block_rsv_add_bytes(block_rsv, blocksize, false); block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2be00e873e92..d254cf94545f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2544,7 +2544,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, - min_size, 0); + min_size, false); BUG_ON(ret); trans->block_rsv = rsv; @@ -2594,7 +2594,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } ret = 
btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, 0); + rsv, min_size, false); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8b3d719e229d..bc3d0e0c43a5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5334,7 +5334,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, * it. */ if (!btrfs_check_space_for_delayed_refs(trans, fs_info) && - !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0)) + !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, false)) return trans; /* If not, commit and try again. */ @@ -9020,7 +9020,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) /* Migrate the slack space for the truncate to our reserve */ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, - min_size, 0); + min_size, false); BUG_ON(ret); /* @@ -9057,7 +9057,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) btrfs_block_rsv_release(fs_info, rsv, -1); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, 0); + rsv, min_size, false); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8783a1776540..995a28a724ce 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4669,7 +4669,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, if (rc->merge_reloc_tree) { ret = btrfs_block_rsv_migrate(&pending->block_rsv, rc->block_rsv, - rc->nodes_relocated, 1); + rc->nodes_relocated, true); if (ret) return ret; } From 684572df940181b070760bf027a6ded6b7b55c3d Mon Sep 17 00:00:00 2001 From: Lu Fengqi Date: Sat, 4 Aug 2018 21:10:57 +0800 Subject: [PATCH 03/93] btrfs: Remove root parameter from btrfs_insert_dir_item All callers pass the root tree of dir, so we can push that down to the function itself. Signed-off-by: Lu Fengqi Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +-- fs/btrfs/dir-item.c | 8 ++++---- fs/btrfs/inode.c | 3 +-- fs/btrfs/ioctl.c | 3 +-- fs/btrfs/transaction.c | 7 +++---- 5 files changed, 10 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3dd4d0cb0c7e..df260a9867be 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3021,8 +3021,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const char *name, int name_len); -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *name, +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index a678b07fcf01..8de74d835dba 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -105,13 +105,13 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, * to use for the second index (if one is created).
* Will return 0 or -ENOMEM */ -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, const char *name, int name_len, - struct btrfs_inode *dir, struct btrfs_key *location, - u8 type, u64 index) +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, + int name_len, struct btrfs_inode *dir, + struct btrfs_key *location, u8 type, u64 index) { int ret = 0; int ret2 = 0; + struct btrfs_root *root = dir->root; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *leaf; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bc3d0e0c43a5..a147e10b12ae 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6389,8 +6389,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = btrfs_insert_dir_item(trans, root, name, name_len, - parent_inode, &key, + ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key, btrfs_inode_type(&inode->vfs_inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d60b6caf09e8..1cf4decaee0d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -686,8 +686,7 @@ static noinline int create_subvol(struct inode *dir, goto fail; } - ret = btrfs_insert_dir_item(trans, root, - name, namelen, BTRFS_I(dir), &key, + ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key, BTRFS_FT_DIR, index); if (ret) { btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3b84f5015029..bd784d8f5215 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1613,10 +1613,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret < 0) goto fail; - ret = btrfs_insert_dir_item(trans, parent_root, - dentry->d_name.name, dentry->d_name.len, - BTRFS_I(parent_inode), &key, - BTRFS_FT_DIR, index); + ret = btrfs_insert_dir_item(trans, dentry->d_name.name, + dentry->d_name.len, BTRFS_I(parent_inode), + &key, BTRFS_FT_DIR, index); /* We have check then name at the beginning, so it is impossible. */ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { From 5a2cb25ab9da5046f10e773c9ecfdb3339f3dd7c Mon Sep 17 00:00:00 2001 From: Lu Fengqi Date: Sat, 4 Aug 2018 21:10:56 +0800 Subject: [PATCH 04/93] btrfs: remove a useless return statement in btrfs_block_rsv_add Since ret must be 0 here, we don't have to return early. No functional change and code readability is not hurt. Signed-off-by: Lu Fengqi Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 26616712de13..8e4a8581e151 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5539,10 +5539,8 @@ int btrfs_block_rsv_add(struct btrfs_root *root, return 0; ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (!ret) { + if (!ret) block_rsv_add_bytes(block_rsv, num_bytes, true); - return 0; - } return ret; } From 4fd786e6c3d67b1348e0ad4f450efe9fc9d7a306 Mon Sep 17 00:00:00 2001 From: Misono Tomohiro Date: Mon, 6 Aug 2018 14:25:24 +0900 Subject: [PATCH 05/93] btrfs: Remove 'objectid' member from struct btrfs_root There are two members in struct btrfs_root which indicate root's objectid: objectid and root_key.objectid. They are both set to the same value in __setup_root(): static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { ...
root->objectid = objectid; ... root->root_key.objectid = objectid; ... } and not changed to other value after initialization. grep in btrfs directory shows both are used in many places: $ grep -rI "root->root_key.objectid" | wc -l 133 $ grep -rI "root->objectid" | wc -l 55 (4.17, inc. some noise) It is confusing to have two similar variable names and it seems that there is no rule about which should be used in a certain case. Since ->root_key itself is needed for tree reloc tree, let's remove the 'objectid' member and unify code to use ->root_key.objectid in all places. Signed-off-by: Misono Tomohiro Reviewed-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 5 +++-- fs/btrfs/btrfs_inode.h | 8 ++++---- fs/btrfs/ctree.c | 2 +- fs/btrfs/ctree.h | 1 - fs/btrfs/delayed-inode.c | 5 +++-- fs/btrfs/disk-io.c | 5 ++--- fs/btrfs/export.c | 4 ++-- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 2 +- fs/btrfs/qgroup.c | 23 ++++++++++++----------- fs/btrfs/ref-verify.c | 8 ++++---- fs/btrfs/relocation.c | 3 ++- fs/btrfs/send.c | 16 ++++++++-------- fs/btrfs/super.c | 6 ++++-- fs/btrfs/transaction.c | 4 ++-- include/trace/events/btrfs.h | 15 ++++++++------- 17 files changed, 58 insertions(+), 53 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ae750b1574a2..84006e3dd105 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1468,7 +1468,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) struct seq_list elem = SEQ_LIST_INIT(elem); int ret = 0; struct share_check shared = { - .root_objectid = root->objectid, + .root_objectid = root->root_key.objectid, .inum = inum, .share_count = 0, }; @@ -2031,7 +2031,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, /* path must be released before calling iterate()!
*/ btrfs_debug(fs_root->fs_info, "following ref at offset %u for inode %llu in tree %llu", - cur, found_key.objectid, fs_root->objectid); + cur, found_key.objectid, + fs_root->root_key.objectid); ret = iterate(parent, name_len, (unsigned long)(iref + 1), eb, ctx); if (ret) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 1343ac57b438..97d91e55b70a 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -206,7 +206,7 @@ static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) static inline unsigned long btrfs_inode_hash(u64 objectid, const struct btrfs_root *root) { - u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME); + u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME); #if BITS_PER_LONG == 32 h = (h >> 32) ^ (h & 0xffffffff); @@ -339,15 +339,15 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, struct btrfs_root *root = inode->root; /* Output minus objectid, which is more meaningful */ - if (root->objectid >= BTRFS_LAST_FREE_OBJECTID) + if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) btrfs_warn_rl(root->fs_info, "csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d", - root->objectid, btrfs_ino(inode), + root->root_key.objectid, btrfs_ino(inode), logical_start, csum, csum_expected, mirror_num); else btrfs_warn_rl(root->fs_info, "csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d", - root->objectid, btrfs_ino(inode), + root->root_key.objectid, btrfs_ino(inode), logical_start, csum, csum_expected, mirror_num); } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d436fb4c002e..1f71695cb0a8 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -207,7 +207,7 @@ static void add_root_to_dirty_list(struct btrfs_root *root) spin_lock(&fs_info->trans_lock); if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { /* Want the extent tree to be the last on the list */ - if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID) + if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID) list_move_tail(&root->dirty_list, &fs_info->dirty_cowonly_roots); else diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index df260a9867be..923ac6cb9784 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1202,7 +1202,6 @@ struct btrfs_root { int last_log_commit; pid_t log_start_pid; - u64 objectid; u64 last_trans; u32 type; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 584cb103955e..47ce74cf134a 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1462,7 +1462,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - name_len, name, delayed_node->root->objectid, + name_len, name, delayed_node->root->root_key.objectid, delayed_node->inode_id, ret); BUG(); } @@ -1533,7 +1533,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - index, node->root->objectid, node->inode_id, ret); + index, node->root->root_key.objectid, + node->inode_id, ret); BUG(); } mutex_unlock(&node->mutex); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 05dc3c17cb62..3611df2ce5c1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ 
-125,8 +125,8 @@ struct async_submit_bio { * Different roots are used for different purposes and may nest inside each * other and they require separate keysets. As lockdep keys should be * static, assign keysets according to the purpose of the root as indicated - * by btrfs_root->objectid. This ensures that all special purpose roots - * have separate keysets. + * by btrfs_root->root_key.objectid. This ensures that all special purpose + * roots have separate keysets. * * Lock-nesting across peer nodes is always done with the immediate parent * node locked thus preventing deadlock. As lockdep doesn't know this, use @@ -1148,7 +1148,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->state = 0; root->orphan_cleanup_state = 0; - root->objectid = objectid; root->last_trans = 0; root->highest_objectid = 0; root->nr_delalloc_inodes = 0; diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1f3755b3a37a..ddf28ecf17f9 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -33,7 +33,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, type = FILEID_BTRFS_WITHOUT_PARENT; fid->objectid = btrfs_ino(BTRFS_I(inode)); - fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid; fid->gen = inode->i_generation; if (parent) { @@ -41,7 +41,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, fid->parent_objectid = BTRFS_I(parent)->location.objectid; fid->parent_gen = parent->i_generation; - parent_root_id = BTRFS_I(parent)->root->objectid; + parent_root_id = BTRFS_I(parent)->root->root_key.objectid; if (parent_root_id != fid->root_objectid) { fid->parent_root_objectid = parent_root_id; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8e4a8581e151..241d54e30294 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8873,7 +8873,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int level; bool root_dropped = false; - btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid); + btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); path = btrfs_alloc_path(); if (!path) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a147e10b12ae..2a8d4d36335e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6582,7 +6582,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ - if (root->objectid != BTRFS_I(inode)->root->objectid) + if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) return -EXDEV; if (inode->i_nlink >= BTRFS_LINK_MAX) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1cf4decaee0d..ef8da14391d2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4392,7 +4392,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } - if (!is_fstree(new_root->objectid)) { + if (!is_fstree(new_root->root_key.objectid)) { ret = -ENOENT; goto out; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d4917c0cddf5..fef30bc1e7b7 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3004,7 +3004,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || - !is_fstree(root->objectid) || len == 0) + !is_fstree(root->root_key.objectid) || len == 0) return 0; /* @reserved parameter is mandatory for qgroup */ @@ -3090,7 +3090,7 @@ static int 
qgroup_free_reserved_data(struct inode *inode, goto out; freed += changeset.bytes_changed; } - btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed, + btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, BTRFS_QGROUP_RSV_DATA); ret = freed; out: @@ -3122,7 +3122,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, changeset.bytes_changed, trace_op); if (free) btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->objectid, + BTRFS_I(inode)->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); ret = changeset.bytes_changed; out: @@ -3215,7 +3215,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid) || num_bytes == 0) + !is_fstree(root->root_key.objectid) || num_bytes == 0) return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); @@ -3240,13 +3240,13 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid)) + !is_fstree(root->root_key.objectid)) return; /* TODO: Update trace point to handle such free */ trace_qgroup_meta_free_all_pertrans(root); /* Special value -1 means to free all reserved space */ - btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1, + btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS); } @@ -3256,7 +3256,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, struct btrfs_fs_info *fs_info = root->fs_info; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid)) + !is_fstree(root->root_key.objectid)) return; /* @@ -3267,7 +3267,8 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); trace_qgroup_meta_reserve(root, type, -(s64)num_bytes); - btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type); + btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, + num_bytes, type); } static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, @@ -3321,13 +3322,13 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) struct btrfs_fs_info *fs_info = root->fs_info; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid)) + !is_fstree(root->root_key.objectid)) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); trace_qgroup_meta_convert(root, num_bytes); - qgroup_convert_meta(fs_info, root->objectid, num_bytes); + qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); } /* @@ -3354,7 +3355,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) inode->i_ino, unode->val, unode->aux); } btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->objectid, + BTRFS_I(inode)->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index e5b9e596bb92..d69fbfb30aa9 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -732,7 +732,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, INIT_LIST_HEAD(&ra->list); ra->action = action; - ra->root = root->objectid; + 
ra->root = root->objectid; + ra->root = root->root_key.objectid; /* * This is an allocation, preallocate the block_entry in case we haven't @@ -787,8 +787,8 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * one we want to lookup below when we modify the * re->num_refs. */ - ref_root = root->objectid; - re->root_objectid = root->objectid; + ref_root = root->root_key.objectid; + re->root_objectid = root->root_key.objectid; re->num_refs = 0; } @@ -862,7 +862,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * didn't thik of some other corner case. */ btrfs_err(fs_info, "failed to find root %llu for %llu", - root->objectid, be->bytenr); + root->root_key.objectid, be->bytenr); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); kfree(ra); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 995a28a724ce..c384b8133407 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -884,7 +884,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, cur->bytenr) { btrfs_err(root->fs_info, "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", - cur->bytenr, level - 1, root->objectid, + cur->bytenr, level - 1, + root->root_key.objectid, node_key->objectid, node_key->type, node_key->offset); err = -ENOENT; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ba8950bfd9c7..bd5756504709 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1186,9 +1186,9 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt) u64 root = (u64)(uintptr_t)key; struct clone_root *cr = (struct clone_root *)elt; - if (root < cr->root->objectid) + if (root < cr->root->root_key.objectid) return -1; - if (root > cr->root->objectid) + if (root > cr->root->root_key.objectid) return 1; return 0; } @@ -1198,9 +1198,9 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) struct clone_root *cr1 = (struct clone_root *)e1; struct clone_root *cr2 = (struct clone_root *)e2; - if (cr1->root->objectid < cr2->root->objectid) + if (cr1->root->root_key.objectid < cr2->root->root_key.objectid) return -1; - if (cr1->root->objectid > cr2->root->objectid) + if (cr1->root->root_key.objectid > cr2->root->root_key.objectid) return 1; return 0; } @@ -2346,7 +2346,7 @@ static int send_subvol_begin(struct send_ctx *sctx) return -ENOMEM; } - key.objectid = send_root->objectid; + key.objectid = send_root->root_key.objectid; key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = 0; @@ -2362,7 +2362,7 @@ static int send_subvol_begin(struct send_ctx *sctx) leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type != BTRFS_ROOT_BACKREF_KEY || - key.objectid != send_root->objectid) { + key.objectid != send_root->root_key.objectid) { ret = -ENOENT; goto out; } @@ -4907,8 +4907,8 @@ static int send_clone(struct send_ctx *sctx, btrfs_debug(sctx->send_root->fs_info, "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu", - offset, len, clone_root->root->objectid, clone_root->ino, - clone_root->offset); + offset, len, clone_root->root->root_key.objectid, + clone_root->ino, clone_root->offset); p = fs_path_alloc(); if (!p) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6601c9aa5e35..b362b45dd757 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2177,8 +2177,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); /* Mask in
the root object ID too, to disambiguate subvols */ - buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32; - buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid; + buf->f_fsid.val[0] ^= + BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32; + buf->f_fsid.val[1] ^= + BTRFS_I(d_inode(dentry))->root->root_key.objectid; return 0; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bd784d8f5215..e7856e15adbf 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -118,7 +118,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans) list_del_init(&root->dirty_list); free_extent_buffer(root->commit_root); root->commit_root = btrfs_root_node(root); - if (is_fstree(root->objectid)) + if (is_fstree(root->root_key.objectid)) btrfs_unpin_free_ino(root); clear_btree_io_tree(&root->dirty_log_pages); } @@ -2329,7 +2329,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); - btrfs_debug(fs_info, "cleaner removing %llu", root->objectid); + btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid); btrfs_kill_all_delayed_nodes(root); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index b401c4e36394..abe3ff774f58 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -316,7 +316,7 @@ DECLARE_EVENT_CLASS(btrfs__file_extent_item_regular, ), TP_fast_assign_btrfs(bi->root->fs_info, - __entry->root_obj = bi->root->objectid; + __entry->root_obj = bi->root->root_key.objectid; __entry->ino = btrfs_ino(bi); __entry->isize = bi->vfs_inode.i_size; __entry->disk_isize = bi->disk_i_size; @@ -367,7 +367,7 @@ DECLARE_EVENT_CLASS( TP_fast_assign_btrfs( bi->root->fs_info, - __entry->root_obj = bi->root->objectid; + __entry->root_obj = bi->root->root_key.objectid; __entry->ino = btrfs_ino(bi); __entry->isize = bi->vfs_inode.i_size; __entry->disk_isize = bi->disk_i_size; @@ -1477,7 +1477,8 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data, ), TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), - __entry->rootid = BTRFS_I(inode)->root->objectid; + __entry->rootid = + BTRFS_I(inode)->root->root_key.objectid; __entry->ino = btrfs_ino(BTRFS_I(inode)); __entry->start = start; __entry->len = len; @@ -1675,7 +1676,7 @@ TRACE_EVENT(qgroup_meta_reserve, ), TP_fast_assign_btrfs(root->fs_info, - __entry->refroot = root->objectid; + __entry->refroot = root->root_key.objectid; __entry->diff = diff; ), @@ -1697,7 +1698,7 @@ TRACE_EVENT(qgroup_meta_convert, ), TP_fast_assign_btrfs(root->fs_info, - __entry->refroot = root->objectid; + __entry->refroot = root->root_key.objectid; __entry->diff = diff; ), @@ -1721,7 +1722,7 @@ TRACE_EVENT(qgroup_meta_free_all_pertrans, ), TP_fast_assign_btrfs(root->fs_info, - __entry->refroot = root->objectid; + __entry->refroot = root->root_key.objectid; spin_lock(&root->qgroup_meta_rsv_lock); __entry->diff = -(s64)root->qgroup_meta_rsv_pertrans; spin_unlock(&root->qgroup_meta_rsv_lock); @@ -1802,7 +1803,7 @@ TRACE_EVENT(btrfs_inode_mod_outstanding_extents, ), TP_fast_assign_btrfs(root->fs_info, - __entry->root_objectid = root->objectid; + __entry->root_objectid = root->root_key.objectid; __entry->ino = ino; __entry->mod = mod; ), From 4b6f8e9695da65e29f9f8ee84b39e0ba5b45e8e9 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 14 Aug 2018 10:46:53 +0800 Subject: [PATCH 06/93] Btrfs: do not unnecessarily pass write_lock_level when processing leaf As we're going to return right after the call, it's not 
necessary to get the updated write_lock_level from unlock_up. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1f71695cb0a8..1124d236291d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2910,7 +2910,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, } if (!p->search_for_split) unlock_up(p, level, lowest_unlock, - min_write_lock_level, &write_lock_level); + min_write_lock_level, NULL); goto done; } } From f8b00e0f06e54efed0dc919518841ad0dd2199cd Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Mon, 13 Aug 2018 14:06:08 +0800 Subject: [PATCH 07/93] btrfs: remove unneeded NULL checks before kfree kfree() already takes a NULL pointer into account, so remove the checks before calling it. The issue is detected with the help of Coccinelle. Signed-off-by: zhong jiang Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0adf38b00fa0..ed097ff023e8 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2110,8 +2110,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, out: if (info) { - if (info->bitmap) - kfree(info->bitmap); + kfree(info->bitmap); kmem_cache_free(btrfs_free_space_cachep, info); } @@ -3601,8 +3600,7 @@ int test_add_free_space_entry(struct btrfs_block_group_cache *cache, if (info) kmem_cache_free(btrfs_free_space_cachep, info); - if (map) - kfree(map); + kfree(map); return 0; } From 16220c467ad34c8f103566a79ac047ed391f51f2 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 10 Aug 2018 13:53:20 +0800 Subject: [PATCH 08/93] btrfs: add assertions where number of devices could go below 0 In preparation for adding a helper function to deduce the num_devices with replace running, use ASSERT instead of BUG_ON or WARN_ON. The number of devices would not normally drop to 0 due to other checks, so the assert is sufficient.
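For reference, btrfs's ASSERT() only expands to a real check when CONFIG_BTRFS_ASSERT is enabled; a sketch of the definition in fs/btrfs/ctree.h around this time, quoted from memory:

	#ifdef CONFIG_BTRFS_ASSERT
	__cold
	static inline void assfail(char *expr, char *file, int line)
	{
		pr_err("assertion failed: %s, file: %s, line: %d\n",
		       expr, file, line);
		BUG();
	}
	#define ASSERT(expr)	\
		(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
	#else
	#define ASSERT(expr)	((void)0)
	#endif

With assertions disabled it compiles away entirely, so unlike BUG_ON() or WARN_ON() it costs nothing in production builds.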
Signed-off-by: Anand Jain Reviewed-by: David Sterba [ update changelog, adjust the assert condition ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4405e430da6..3946fe951b60 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1868,7 +1868,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, num_devices = fs_devices->num_devices; btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { - WARN_ON(num_devices < 1); + ASSERT(num_devices > 1); num_devices--; } btrfs_dev_replace_read_unlock(&fs_info->dev_replace); @@ -3743,7 +3743,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, num_devices = fs_info->fs_devices->num_devices; btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { - BUG_ON(num_devices < 1); + ASSERT(num_devices > 1); num_devices--; } btrfs_dev_replace_read_unlock(&fs_info->dev_replace); From 1da739678e0b7a764a0f1ef46886f919f19f6446 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 10 Aug 2018 13:53:21 +0800 Subject: [PATCH 09/93] btrfs: add helper to obtain number of devices with ongoing dev-replace When the replace is running, fs_devices::num_devices also includes the replaced device; however, some operations like device delete and balance need the actual num_devices without the replaced device. The function btrfs_num_devices() just provides that. And here is a scenario of how balance and replace items could co-exist: consider balance is started and paused, now start the replace, followed by an unmount or system power-cycle. During the following mount, open_ctree() first restarts the balance, so it must check for the device replace, otherwise our num_devices calculation will be wrong. Signed-off-by: Anand Jain Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3946fe951b60..ea9845a5e601 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1854,6 +1854,24 @@ void btrfs_assign_next_active_device(struct btrfs_device *device, fs_info->fs_devices->latest_bdev = next_device->bdev; } +/* + * Return btrfs_fs_devices::num_devices excluding the device that's being + * currently replaced.
+ */ +static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) +{ + u64 num_devices = fs_info->fs_devices->num_devices; + + btrfs_dev_replace_read_lock(&fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { + ASSERT(num_devices > 1); + num_devices--; + } + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + + return num_devices; +} + int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 devid) { @@ -1865,13 +1883,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, mutex_lock(&uuid_mutex); - num_devices = fs_devices->num_devices; - btrfs_dev_replace_read_lock(&fs_info->dev_replace); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { - ASSERT(num_devices > 1); - num_devices--; - } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + num_devices = btrfs_num_devices(fs_info); ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) @@ -3740,13 +3752,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, } } - num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_read_lock(&fs_info->dev_replace); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { - ASSERT(num_devices > 1); - num_devices--; - } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + num_devices = btrfs_num_devices(fs_info); + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); From 32934280967d00dc2b5c4d3b63b21a9c8638326e Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 14 Aug 2018 11:09:52 -0700 Subject: [PATCH 10/93] Btrfs: clean up scrub is_dev_replace parameter struct scrub_ctx has an ->is_dev_replace member, so there's no point in passing around is_dev_replace where sctx is available. 
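For context, the flag is recorded once when the scrub context is created, so every function that receives sctx already has it at hand; a trimmed sketch of scrub_setup_ctx(), quoted from memory with most of the setup elided:

	static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
			struct btrfs_device *dev, int is_dev_replace)
	{
		struct scrub_ctx *sctx;

		sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
		...
		sctx->is_dev_replace = is_dev_replace;
		...
		return sctx;
	}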
Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3be1456b5116..4bcc275f7612 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3022,8 +3022,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, - int num, u64 base, u64 length, - int is_dev_replace) + int num, u64 base, u64 length) { struct btrfs_path *path, *ppath; struct btrfs_fs_info *fs_info = sctx->fs_info; @@ -3299,7 +3298,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, extent_physical = extent_logical - logical + physical; extent_dev = scrub_dev; extent_mirror_num = mirror_num; - if (is_dev_replace) + if (sctx->is_dev_replace) scrub_remap_extent(fs_info, extent_logical, extent_len, &extent_physical, &extent_dev, @@ -3397,8 +3396,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 chunk_offset, u64 length, u64 dev_offset, - struct btrfs_block_group_cache *cache, - int is_dev_replace) + struct btrfs_block_group_cache *cache) { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; @@ -3435,8 +3433,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { ret = scrub_stripe(sctx, map, scrub_dev, i, - chunk_offset, length, - is_dev_replace); + chunk_offset, length); if (ret) goto out; } @@ -3449,8 +3446,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, static noinline_for_stack int scrub_enumerate_chunks(struct scrub_ctx *sctx, - struct btrfs_device *scrub_dev, u64 start, u64 end, - int is_dev_replace) + struct btrfs_device *scrub_dev, u64 start, u64 end) { struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; @@ -3544,7 +3540,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ scrub_pause_on(fs_info); ret = btrfs_inc_block_group_ro(cache); - if (!ret && is_dev_replace) { + if (!ret && sctx->is_dev_replace) { /* * If we are doing a device replace wait for any tasks * that started dellaloc right before we set the block @@ -3609,7 +3605,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_replace->item_needs_writeback = 1; btrfs_dev_replace_write_unlock(&fs_info->dev_replace); ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, - found_key.offset, cache, is_dev_replace); + found_key.offset, cache); /* * flush, submit all pending read and write bios, afterwards @@ -3670,7 +3666,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, btrfs_put_block_group(cache); if (ret) break; - if (is_dev_replace && + if (sctx->is_dev_replace && atomic64_read(&dev_replace->num_write_errors) > 0) { ret = -EIO; break; @@ -3893,8 +3889,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } if (!ret) - ret = scrub_enumerate_chunks(sctx, dev, start, end, - is_dev_replace); + ret = scrub_enumerate_chunks(sctx, dev, start, end); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_dec(&fs_info->scrubs_running); From 9c7b0c2e8dbfbcd80a71e2cbfe02704f26c185c6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 10 Aug 2018 10:20:26 +0800 Subject: [PATCH 11/93] btrfs: qgroup: Dirty all qgroups before rescan [BUG] In the 
following case, rescan won't zero out the number of qgroup 1/0: $ mkfs.btrfs -fq $DEV $ mount $DEV /mnt $ btrfs quota enable /mnt $ btrfs qgroup create 1/0 /mnt $ btrfs sub create /mnt/sub $ btrfs qgroup assign 0/257 1/0 /mnt $ dd if=/dev/urandom of=/mnt/sub/file bs=1k count=1000 $ btrfs sub snap /mnt/sub /mnt/snap $ btrfs quota rescan -w /mnt $ btrfs qgroup show -pcre /mnt qgroupid rfer excl max_rfer max_excl parent child -------- ---- ---- -------- -------- ------ ----- 0/5 16.00KiB 16.00KiB none none --- --- 0/257 1016.00KiB 16.00KiB none none 1/0 --- 0/258 1016.00KiB 16.00KiB none none --- --- 1/0 1016.00KiB 16.00KiB none none --- 0/257 So far so good, but: $ btrfs qgroup remove 0/257 1/0 /mnt WARNING: quotas may be inconsistent, rescan needed $ btrfs quota rescan -w /mnt $ btrfs qgroup show -pcre /mnt qgroupid rfer excl max_rfer max_excl parent child -------- ---- ---- -------- -------- ------ ----- 0/5 16.00KiB 16.00KiB none none --- --- 0/257 1016.00KiB 16.00KiB none none --- --- 0/258 1016.00KiB 16.00KiB none none --- --- 1/0 1016.00KiB 16.00KiB none none --- --- ^^^^^^^^^^ ^^^^^^^^ not cleared [CAUSE] Before rescan we call qgroup_rescan_zero_tracking() to zero out all qgroups' accounting numbers. However, we don't mark all qgroups dirty, but rely on rescan to do so. If we have any high level qgroup without children, it won't be marked dirty during rescan, since we cannot reach that qgroup. This causes the QGROUP_INFO items of childless qgroups to never get updated in the quota tree, thus their numbers will stay the same in "btrfs qgroup show" output. [FIX] Just mark all qgroups dirty in qgroup_rescan_zero_tracking(), so even if we have childless qgroups, their QGROUP_INFO items will still get updated during rescan. Reported-by: Misono Tomohiro CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Qu Wenruo Reviewed-by: Misono Tomohiro Tested-by: Misono Tomohiro Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index fef30bc1e7b7..bdd8c0da6e32 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2897,6 +2897,7 @@ qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) qgroup->rfer_cmpr = 0; qgroup->excl = 0; qgroup->excl_cmpr = 0; + qgroup_dirty(fs_info, qgroup); } spin_unlock(&fs_info->qgroup_lock); } From bee6ec822a6a686d1b8ac65298b07765a48aefad Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 17 Aug 2018 05:05:28 +0800 Subject: [PATCH 12/93] Btrfs: remove always true if branch in btrfs_get_extent @path is always NULL when it comes to the if branch.
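To illustrate: the local path is NULL from its declaration until the branch in question, so the check always succeeds. A trimmed sketch of the surrounding pre-patch code, quoted from memory:

	struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
					    struct page *page, size_t pg_offset,
					    u64 start, u64 len, int create)
	{
		struct btrfs_path *path = NULL;
		...
		/* nothing has assigned path up to this point */
		if (!path) {
			path = btrfs_alloc_path();
			...
		}
		...
	}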
Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2a8d4d36335e..97ccc774f13d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6821,19 +6821,15 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, em->len = (u64)-1; em->block_len = (u64)-1; + path = btrfs_alloc_path(); if (!path) { - path = btrfs_alloc_path(); - if (!path) { - err = -ENOMEM; - goto out; - } - /* - * Chances are we'll be called again, so go ahead and do - * readahead - */ - path->reada = READA_FORWARD; + err = -ENOMEM; + goto out; } + /* Chances are we'll be called again, so go ahead and do readahead */ + path->reada = READA_FORWARD; + ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { err = ret; From 556f3ca88ecb7087e7e3144e429414c60a300984 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 17 Aug 2018 00:37:14 +0800 Subject: [PATCH 13/93] btrfs: change btrfs_free_reserved_bytes to return void btrfs_free_reserved_bytes uses the variable "ret" for return value, but it is not modified after initialization. Further, none of the callers handle the return value, so it is safe to drop the unneeded "ret" and return void. There are no callees that would need the function to handle or pass the value either. Signed-off-by: zhong jiang Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 241d54e30294..67b9f67945b3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6422,11 +6422,10 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, * reserve set to 0 in order to clear the reservation. */ -static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int delalloc) +static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; - int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); @@ -6439,7 +6438,6 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); spin_unlock(&space_info->lock); - return ret; } void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) { From 45128b08f74131753a10a584e3e5c4496b279269 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 17 Aug 2018 00:37:15 +0800 Subject: [PATCH 14/93] btrfs: change btrfs_pin_log_trans to return void btrfs_pin_log_trans defines the variable "ret" for return value, but it is not modified after initialization. Further, none of the callers handle the return value, so it is safe to drop the unneeded "ret" and make it return void.
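As an illustration that nothing consumes the old return value, here is a fragment of one caller, btrfs_rename(), quoted from memory and slightly trimmed:

	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		/* force full log commit if subvolume involved */
		btrfs_set_log_full_commit(fs_info, trans);
	} else {
		btrfs_pin_log_trans(root);
		log_pinned = true;
	}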
Signed-off-by: zhong jiang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 5 +---- fs/btrfs/tree-log.h | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3c2ae0e4f25a..1078c65128c5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -205,14 +205,11 @@ static int join_running_log_trans(struct btrfs_root *root) * until you call btrfs_end_log_trans() or it makes any future * log transactions wait until you call btrfs_end_log_trans() */ -int btrfs_pin_log_trans(struct btrfs_root *root) +void btrfs_pin_log_trans(struct btrfs_root *root) { - int ret = -ENOENT; - mutex_lock(&root->log_mutex); atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); - return ret; } /* diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 7ab9bb88a639..767765031e59 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -65,7 +65,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); -int btrfs_pin_log_trans(struct btrfs_root *root); +void btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, int for_rename); From 65c6e82becec33731f48786e5a30f98662c86b16 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 21 Aug 2018 09:42:03 +0800 Subject: [PATCH 15/93] btrfs: Handle owner mismatch gracefully when walking up tree [BUG] When mounting a certain crafted image, btrfs will trigger a kernel BUG_ON() when trying to recover balance: kernel BUG at fs/btrfs/extent-tree.c:8956! invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 662 Comm: mount Not tainted 4.18.0-rc1-custom+ #10 RIP: 0010:walk_up_proc+0x336/0x480 [btrfs] RSP: 0018:ffffb53540c9b890 EFLAGS: 00010202 Call Trace: walk_up_tree+0x172/0x1f0 [btrfs] btrfs_drop_snapshot+0x3a4/0x830 [btrfs] merge_reloc_roots+0xe1/0x1d0 [btrfs] btrfs_recover_relocation+0x3ea/0x420 [btrfs] open_ctree+0x1af3/0x1dd0 [btrfs] btrfs_mount_root+0x66b/0x740 [btrfs] mount_fs+0x3b/0x16a vfs_kern_mount.part.9+0x54/0x140 btrfs_mount+0x16d/0x890 [btrfs] mount_fs+0x3b/0x16a vfs_kern_mount.part.9+0x54/0x140 do_mount+0x1fd/0xda0 ksys_mount+0xba/0xd0 __x64_sys_mount+0x21/0x30 do_syscall_64+0x60/0x210 entry_SYSCALL_64_after_hwframe+0x49/0xbe [CAUSE] Extent tree corruption. In this particular case, the reloc tree root's owner is DATA_RELOC_TREE (should be TREE_RELOC), thus its backref is corrupted and we fail the owner check in walk_up_tree(). [FIX] It's pretty hard to take care of every extent tree corruption, but at least we can remove such BUG_ON() and exit more gracefully. And since in this particular image, DATA_RELOC_TREE and TREE_RELOC share the same root (which is obviously invalid), we need to make __del_reloc_root() more robust to detect such invalid sharing to avoid a possible NULL dereference, as root->node can be NULL in this case.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=200411 Reported-by: Xu Wen CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 18 ++++++++++++------ fs/btrfs/relocation.c | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 67b9f67945b3..33ab7dd58774 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8759,15 +8759,14 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (eb == root->node) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = eb->start; - else - BUG_ON(root->root_key.objectid != - btrfs_header_owner(eb)); + else if (root->root_key.objectid != btrfs_header_owner(eb)) + goto owner_mismatch; } else { if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = path->nodes[level + 1]->start; - else - BUG_ON(root->root_key.objectid != - btrfs_header_owner(path->nodes[level + 1])); + else if (root->root_key.objectid != + btrfs_header_owner(path->nodes[level + 1])) + goto owner_mismatch; } btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); @@ -8775,6 +8774,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, wc->refs[level] = 0; wc->flags[level] = 0; return 0; + +owner_mismatch: + btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", + btrfs_header_owner(eb), root->root_key.objectid); + return -EUCLEAN; } static noinline int walk_down_tree(struct btrfs_trans_handle *trans, @@ -8828,6 +8832,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, ret = walk_up_proc(trans, root, path, wc); if (ret > 0) return 0; + if (ret < 0) + return ret; if (path->locks[level]) { btrfs_tree_unlock_rw(path->nodes[level], diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index c384b8133407..87afd3c395bb 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1282,7 +1282,7 @@ static void __del_reloc_root(struct btrfs_root *root) struct mapping_node *node = NULL; struct reloc_control *rc = fs_info->reloc_ctl; - if (rc) { + if (rc && root->node) { spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, root->node->start); From b72c3aba09a53fc7c1824250d71180ca154517a7 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 21 Aug 2018 09:53:47 +0800 Subject: [PATCH 16/93] btrfs: locking: Add extra check in btrfs_init_new_buffer() to avoid deadlock [BUG] For a certain crafted image whose csum root leaf has a missing backref, trying to trigger a write with data csums can cause a deadlock with the following kernel WARN_ON(): WARNING: CPU: 1 PID: 41 at fs/btrfs/locking.c:230 btrfs_tree_lock+0x3e2/0x400 CPU: 1 PID: 41 Comm: kworker/u4:1 Not tainted 4.18.0-rc1+ #8 Workqueue: btrfs-endio-write btrfs_endio_write_helper RIP: 0010:btrfs_tree_lock+0x3e2/0x400 Call Trace: btrfs_alloc_tree_block+0x39f/0x770 __btrfs_cow_block+0x285/0x9e0 btrfs_cow_block+0x191/0x2e0 btrfs_search_slot+0x492/0x1160 btrfs_lookup_csum+0xec/0x280 btrfs_csum_file_blocks+0x2be/0xa60 add_pending_csums+0xaf/0xf0 btrfs_finish_ordered_io+0x74b/0xc90 finish_ordered_fn+0x15/0x20 normal_work_helper+0xf6/0x500 btrfs_endio_write_helper+0x12/0x20 process_one_work+0x302/0x770 worker_thread+0x81/0x6d0 kthread+0x180/0x1d0 ret_from_fork+0x35/0x40 [CAUSE] That crafted image has a missing backref for its csum tree root leaf.
And when we try to allocate a new tree block, since there is no EXTENT/METADATA_ITEM for the csum tree root, btrfs considers it a free slot and uses it. The extent tree of the image looks like: Normal image | This fuzzed image ----------------------------------+-------------------------------- BG 29360128 | BG 29360128 One empty slot | One empty slot 29364224: backref to UUID tree | 29364224: backref to UUID tree Two empty slots | Two empty slots 29376512: backref to CSUM tree | One empty slot (bad type) <<< 29380608: backref to D_RELOC tree | 29380608: backref to D_RELOC tree ... | ... Since bytenr 29376512 has no METADATA/EXTENT_ITEM, when btrfs tries to alloc a tree block, it's a valid slot for btrfs. And for finish_ordered_write, when we need to insert a csum, we try to CoW the csum tree root. By accident, the empty slots at bytenr BG_OFFSET, BG_OFFSET + 8K, BG_OFFSET + 12K are already used by tree block COW for other trees; the next empty slot is BG_OFFSET + 16K, which should be the backref for the CSUM tree. But due to the bad type, btrfs can't recognize it and still considers it an empty slot, and will try to use it for csum tree CoW. Then in the following call trace, we will try to lock the new tree block, which turns out to be the old csum tree root which is already locked: btrfs_search_slot() called on csum tree root, which is at 29376512 |- btrfs_cow_block() |- btrfs_set_lock_block() | |- Now locks tree block 29376512 (old csum tree root) |- __btrfs_cow_block() |- btrfs_alloc_tree_block() |- btrfs_reserve_extent() | Now it returns tree block 29376512, which the extent tree | shows as an empty slot, but it's already held by the csum tree |- btrfs_init_new_buffer() |- btrfs_tree_lock() | Triggers WARN_ON(eb->lock_owner == current->pid) |- wait_event() Waits for the lock owner to release the lock, but it's locked by ourselves, so it will deadlock [FIX] This patch adds the lock_owner vs current->pid check in btrfs_init_new_buffer(), so the above deadlock can be avoided. Since such a problem can only happen with a crafted image, we will still trigger a kernel warning for the later aborted transaction, but with a more meaningful warning message. Link: https://bugzilla.kernel.org/show_bug.cgi?id=200405 Reported-by: Xu Wen CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 33ab7dd58774..30c5bb3cb8fb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8115,6 +8115,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (IS_ERR(buf)) return buf; + /* + * Extra safety check in case the extent tree is corrupted and extent + * allocator chooses to use a tree block which is already used and + * locked. + */ + if (buf->lock_owner == current->pid) { + btrfs_err_rl(fs_info, +"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", + buf->start, btrfs_header_owner(buf), current->pid); + free_extent_buffer(buf); + return ERR_PTR(-EUCLEAN); + } + btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); clean_tree_block(fs_info, buf); From 9688e9a99e4b2d60d78fbaa270e7e0c70d90f6c2 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 23 Aug 2018 03:14:53 +0800 Subject: [PATCH 17/93] Btrfs: use next_state in find_first_extent_bit As next_state() is already defined to get the next state, use it in find_first_extent_bit.
No functional changes. Signed-off-by: Liu Bo Reviewed-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dd6faab02bb..43bf4ec891c6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1424,20 +1424,15 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, struct extent_state **cached_state) { struct extent_state *state; - struct rb_node *n; int ret = 1; spin_lock(&tree->lock); if (cached_state && *cached_state) { state = *cached_state; if (state->end == start - 1 && extent_state_in_tree(state)) { - n = rb_next(&state->rb_node); - while (n) { - state = rb_entry(n, struct extent_state, - rb_node); + while ((state = next_state(state)) != NULL) { if (state->state & bits) goto got_it; - n = rb_next(n); } free_extent_state(*cached_state); *cached_state = NULL; From c64142807f5a4dcff5c69bfb0868df844b2e87c7 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 23 Aug 2018 07:36:17 +0800 Subject: [PATCH 18/93] btrfs: free path at an earlier point in btrfs_get_extent trace_btrfs_get_extent() has nothing to do with the path, so place btrfs_free_path ahead of it so that we can unlock the path earlier on error. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 97ccc774f13d..f2ce88d65f90 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6979,10 +6979,10 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); out: + btrfs_free_path(path); trace_btrfs_get_extent(root, inode, em); - btrfs_free_path(path); if (err) { free_extent_map(em); return ERR_PTR(err); From 6aadd9eb74877a004687a5f70ec20797e5b03c5a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 5 Sep 2018 09:55:27 +0800 Subject: [PATCH 19/93] Btrfs: remove confusing tracepoint in btrfs_add_reserved_bytes Here we're not releasing any space, but transferring bytes from ->bytes_may_use to ->bytes_reserved. The last change to the code in commit 18513091af9483ba8 ("btrfs: update btrfs_space_info's bytes_may_use timely") removed a conditional tracepoint and the logic changed too, but the tracepoint remained. Signed-off-by: Liu Bo Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 30c5bb3cb8fb..9292e7c0ee81 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6397,10 +6397,6 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, } else { cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; - - trace_btrfs_space_reservation(cache->fs_info, - "space_info", space_info->flags, - ram_bytes, 0); space_info->bytes_may_use -= ram_bytes; if (delalloc) cache->delalloc_bytes += num_bytes; From 28c4a3e21ad030d7571ee9b1b246a5cbfd886627 Mon Sep 17 00:00:00 2001 From: Su Yue Date: Wed, 5 Sep 2018 11:07:33 +0800 Subject: [PATCH 20/93] btrfs: defrag: use btrfs_mod_outstanding_extents in cluster_pages_for_defrag Since commit 8b62f87bad9c ("Btrfs: rework outstanding_extents"), manual operations on outstanding_extents in btrfs_inode are replaced by btrfs_mod_outstanding_extents().
The one in cluster_pages_for_defrag seems to have been missed, so replace it with btrfs_mod_outstanding_extents(). Fixes: 8b62f87bad9c ("Btrfs: rework outstanding_extents") Signed-off-by: Su Yue Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ef8da14391d2..4905d13dee0a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1323,7 +1323,7 @@ static int cluster_pages_for_defrag(struct inode *inode, if (i_done != page_cnt) { spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; + btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, From d005dbeca0814551ed243fc2d51480d0cd925d67 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 28 Aug 2018 11:45:21 +0100 Subject: [PATCH 21/93] btrfs: remove unused pointer inode in relink_file_extents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pointer inode is being assigned but is never used, hence it is redundant and can be removed. It's been unused since the introduction in 38c227d87c49a ("Btrfs: snapshot-aware defrag"). Cleans up clang warning: variable ‘inode’ set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f2ce88d65f90..56e523774049 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2750,12 +2750,9 @@ static void relink_file_extents(struct new_sa_defrag_extent *new) struct btrfs_path *path; struct sa_defrag_extent_backref *backref; struct sa_defrag_extent_backref *prev = NULL; - struct inode *inode; struct rb_node *node; int ret; - inode = new->inode; - path = btrfs_alloc_path(); if (!path) return; From 29c5e5d4966d8d56e18e008da01108167d8d6fd1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 28 Aug 2018 14:21:01 +0100 Subject: [PATCH 22/93] btrfs: remove unused pointer 'tree' in btrfs_submit_compressed_read Pointer 'tree' is being assigned but is never used, hence it is redundant and can be removed. This is a leftover from cleanup patch 00032d38eaa89c76de7 ("btrfs: drop extent_io_ops::merge_bio_hook callback").
Cleans up clang warning: warning: variable 'tree' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 9bfa66592aa7..8703ce68fe9d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -528,7 +528,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *tree; struct extent_map_tree *em_tree; struct compressed_bio *cb; unsigned long compressed_len; @@ -545,7 +544,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int faili = 0; u32 *sums; - tree = &BTRFS_I(inode)->io_tree; em_tree = &BTRFS_I(inode)->extent_tree; /* we need the actual starting offset of this extent in the file */ From de2c6615dcddf2af868c5cbd1db2e9e73b4beb58 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Sat, 25 Aug 2018 13:47:59 +0800 Subject: [PATCH 23/93] Btrfs: fix alignment in declaration and prototype of btrfs_get_extent This fixes btrfs_get_extent to be consistent with our existing declaration style. Note: For the record, two indentation styles are accepted: aligning under the opening '(' and tab or double-tab indentation on the next line. Preferably do not split the type or long expressions in the argument lists. Signed-off-by: Liu Bo [ add note ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/inode.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 923ac6cb9784..441610de9908 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3178,8 +3178,8 @@ void __cold btrfs_destroy_cachep(void); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root, int *was_new); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 end, int create); + struct page *page, size_t pg_offset, + u64 start, u64 end, int create); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 56e523774049..80e92c4448bb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6772,9 +6772,9 @@ static noinline int uncompress_inline(struct btrfs_path *path, * This also copies inline extents directly into the page. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) + struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret; From e49aabd973fd0b3e8e949bd9e7e05acbb9008244 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Sat, 25 Aug 2018 13:47:09 +0800 Subject: [PATCH 24/93] Btrfs: set leave_spinning in btrfs_get_extent Unless it's going to read inline extents from a btree leaf into a page, btrfs_get_extent won't sleep while holding the path lock. This sets leave_spinning up front and switches the path to blocking mode right before reading an inline extent, if that's the case. The benefit is that a path in spinning mode typically has a lower impact (is faster) on waiters than one in blocking mode.
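A sketch of the resulting locking pattern in btrfs_get_extent (simplified from the diff below; error handling omitted):

    path->reada = READA_FORWARD;
    /* we don't expect to sleep while holding tree locks ... */
    path->leave_spinning = 1;
    ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
    ...
    /* ... unless an inline extent has to be copied/uncompressed */
    btrfs_set_path_blocking(path);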
Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 80e92c4448bb..e8a954c47f65 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6827,6 +6827,12 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, /* Chances are we'll be called again, so go ahead and do readahead */ path->reada = READA_FORWARD; + /* + * Unless we're going to uncompress the inline extent, no sleep would + * happen. + */ + path->leave_spinning = 1; + ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { err = ret; @@ -6929,6 +6935,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, em->orig_block_len = em->len; em->orig_start = em->start; ptr = btrfs_file_extent_inline_start(item) + extent_offset; + + btrfs_set_path_blocking(path); if (!PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { From 380fd06640a7afe972e45310012c3e22d9c4783d Mon Sep 17 00:00:00 2001 From: Misono Tomohiro Date: Thu, 30 Aug 2018 10:59:16 +0900 Subject: [PATCH 25/93] btrfs: remove redundant variable from btrfs_cross_ref_exist Since commit d7df2c796d7e ("Btrfs attach delayed ref updates to delayed ref heads"), check_delayed_ref() won't return -ENOENT. In btrfs_cross_ref_exist(), two variables 'ret' and 'ret2' are originally used to handle -ENOENT error case. Since the code is not needed anymore, let's just remove 'ret2'. Signed-off-by: Misono Tomohiro Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9292e7c0ee81..528f3308c32b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3139,7 +3139,6 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, { struct btrfs_path *path; int ret; - int ret2; path = btrfs_alloc_path(); if (!path) @@ -3151,17 +3150,9 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, if (ret && ret != -ENOENT) goto out; - ret2 = check_delayed_ref(root, path, objectid, - offset, bytenr); - } while (ret2 == -EAGAIN); + ret = check_delayed_ref(root, path, objectid, offset, bytenr); + } while (ret == -EAGAIN); - if (ret2 && ret2 != -ENOENT) { - ret = ret2; - goto out; - } - - if (ret != -ENOENT || ret2 != -ENOENT) - ret = 0; out: btrfs_free_path(path); if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) From 374b0e2d6ba5da7fd1cadb3247731ff27d011f6f Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 6 Sep 2018 16:59:33 -0400 Subject: [PATCH 26/93] btrfs: fix error handling in free_log_tree When we hit an I/O error in free_log_tree->walk_log_tree during file system shutdown we can crash due to there not being a valid transaction handle. Use btrfs_handle_fs_error when there's no transaction handle to use. 
BUG: unable to handle kernel NULL pointer dereference at 0000000000000060 IP: free_log_tree+0xd2/0x140 [btrfs] PGD 0 P4D 0 Oops: 0000 [#1] SMP DEBUG_PAGEALLOC PTI Modules linked in: CPU: 2 PID: 23544 Comm: umount Tainted: G W 4.12.14-kvmsmall #9 SLE15 (unreleased) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 task: ffff96bfd3478880 task.stack: ffffa7cf40d78000 RIP: 0010:free_log_tree+0xd2/0x140 [btrfs] RSP: 0018:ffffa7cf40d7bd10 EFLAGS: 00010282 RAX: 00000000fffffffb RBX: 00000000fffffffb RCX: 0000000000000002 RDX: 0000000000000000 RSI: ffff96c02f07d4c8 RDI: 0000000000000282 RBP: ffff96c013cf1000 R08: ffff96c02f07d4c8 R09: ffff96c02f07d4d0 R10: 0000000000000000 R11: 0000000000000002 R12: 0000000000000000 R13: ffff96c005e800c0 R14: ffffa7cf40d7bdb8 R15: 0000000000000000 FS: 00007f17856bcfc0(0000) GS:ffff96c03f600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000060 CR3: 0000000045ed6002 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? wait_for_writer+0xb0/0xb0 [btrfs] btrfs_free_log+0x17/0x30 [btrfs] btrfs_drop_and_free_fs_root+0x9a/0xe0 [btrfs] btrfs_free_fs_roots+0xc0/0x130 [btrfs] ? wait_for_completion+0xf2/0x100 close_ctree+0xea/0x2e0 [btrfs] ? kthread_stop+0x161/0x260 generic_shutdown_super+0x6c/0x120 kill_anon_super+0xe/0x20 btrfs_kill_super+0x13/0x100 [btrfs] deactivate_locked_super+0x3f/0x70 cleanup_mnt+0x3b/0x70 task_work_run+0x78/0x90 exit_to_usermode_loop+0x77/0xa6 do_syscall_64+0x1c5/0x1e0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 RIP: 0033:0x7f1784f90827 RSP: 002b:00007ffdeeb03118 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 0000556a60c62970 RCX: 00007f1784f90827 RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000556a60c62b50 RBP: 0000000000000000 R08: 0000000000000005 R09: 00000000ffffffff R10: 0000556a60c63900 R11: 0000000000000246 R12: 0000556a60c62b50 R13: 00007f17854a81c4 R14: 0000000000000000 R15: 0000000000000000 RIP: free_log_tree+0xd2/0x140 [btrfs] RSP: ffffa7cf40d7bd10 CR2: 0000000000000060 Fixes: 681ae50917df9 ("Btrfs: cleanup reserved space when freeing tree log on error") CC: # v3.13 Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1078c65128c5..c7914f293cb6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3193,9 +3193,12 @@ static void free_log_tree(struct btrfs_trans_handle *trans, }; ret = walk_log_tree(trans, log, &wc); - /* I don't think this can happen but just in case */ - if (ret) - btrfs_abort_transaction(trans, ret); + if (ret) { + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(log->fs_info, ret, NULL); + } while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, From b444ad46b2db4f8c682c99ffb994e532a4713012 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 3 Sep 2018 12:46:12 +0300 Subject: [PATCH 27/93] btrfs: Make btrfs_find_device_by_path return struct btrfs_device Currently this function returns an error code as well as uses one of its arguments as a return value for struct btrfs_device. Change the function so that it returns btrfs_device directly and use the usual "encode error in pointer" mechanics if something goes wrong. No functional changes. 
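The "encode error in pointer" mechanics are the standard helpers from include/linux/err.h; a caller sketch under this convention:

    struct btrfs_device *device;

    device = btrfs_find_device_by_path(fs_info, device_path);
    if (IS_ERR(device))             /* errno encoded in the pointer */
            return PTR_ERR(device); /* decode back to a negative errno */
    /* otherwise device is a valid pointer */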
Signed-off-by: Nikolay Borisov Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ea9845a5e601..3e21e07d3f03 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2108,9 +2108,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) call_rcu(&tgtdev->rcu, free_device_rcu); } -static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device **device) +static struct btrfs_device *btrfs_find_device_by_path( + struct btrfs_fs_info *fs_info, const char *device_path) { int ret = 0; struct btrfs_super_block *disk_super; @@ -2118,21 +2117,21 @@ static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, u8 *dev_uuid; struct block_device *bdev; struct buffer_head *bh; + struct btrfs_device *device; - *device = NULL; ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, fs_info->bdev_holder, 0, &bdev, &bh); if (ret) - return ret; + return ERR_PTR(ret); disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); + device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); brelse(bh); - if (!*device) - ret = -ENOENT; + if (!device) + device = ERR_PTR(-ENOENT); blkdev_put(bdev, FMODE_READ); - return ret; + return device; } int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, @@ -2155,11 +2154,13 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, if (!*device) return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; - - return 0; } else { - return btrfs_find_device_by_path(fs_info, device_path, device); + *device = btrfs_find_device_by_path(fs_info, device_path); + if (IS_ERR(*device)) + return PTR_ERR(*device); } + + return 0; } /* From 6c05040702e7793e8ee63bb8384ac2f9235c926e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 3 Sep 2018 12:46:13 +0300 Subject: [PATCH 28/93] btrfs: Make btrfs_find_device_missing_or_by_path return directly a device This function returns a numeric error value and additionally the device found in one of its input parameters. Simplify this by making the function directly return a pointer to btrfs_device. Additionally, adjust the caller so that when we want to remove the 'missing' device and ENOENT is returned, the expected positive error value (parsed by progs) is returned. Finally, unexport the function since it's not called outside of volumes.c. No functional changes.
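A sketch of the caller-side translation described above (mirroring the change visible in the diff that follows):

    *device = btrfs_find_device_missing_or_by_path(fs_info, devpath);
    if (IS_ERR(*device)) {
            /* progs expect the positive "missing not found" code */
            if (PTR_ERR(*device) == -ENOENT &&
                strcmp(devpath, "missing") == 0)
                    ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
            else
                    ret = PTR_ERR(*device);
    }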
Signed-off-by: Nikolay Borisov Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 32 +++++++++++++++++--------------- fs/btrfs/volumes.h | 3 --- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3e21e07d3f03..b39e4bd6457f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2134,11 +2134,10 @@ static struct btrfs_device *btrfs_find_device_by_path( return device; } -int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device **device) +static struct btrfs_device *btrfs_find_device_missing_or_by_path( + struct btrfs_fs_info *fs_info, const char *device_path) { - *device = NULL; + struct btrfs_device *device = NULL; if (strcmp(device_path, "missing") == 0) { struct list_head *devices; struct btrfs_device *tmp; @@ -2147,20 +2146,18 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, list_for_each_entry(tmp, devices, dev_list) { if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &tmp->dev_state) && !tmp->bdev) { - *device = tmp; + device = tmp; break; } } - if (!*device) - return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; + if (!device) + return ERR_PTR(-ENOENT); } else { - *device = btrfs_find_device_by_path(fs_info, device_path); - if (IS_ERR(*device)) - return PTR_ERR(*device); + device = btrfs_find_device_by_path(fs_info, device_path); } - return 0; + return device; } /* @@ -2170,10 +2167,9 @@ int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, const char *devpath, struct btrfs_device **device) { - int ret; + int ret = 0; if (devid) { - ret = 0; *device = btrfs_find_device(fs_info, devid, NULL, NULL); if (!*device) ret = -ENOENT; @@ -2181,8 +2177,14 @@ int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, if (!devpath || !devpath[0]) return -EINVAL; - ret = btrfs_find_device_missing_or_by_path(fs_info, devpath, - device); + *device = btrfs_find_device_missing_or_by_path(fs_info, devpath); + if (IS_ERR(*device)) { + if (PTR_ERR(*device) == -ENOENT && + strcmp(devpath, "missing") == 0) + ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; + else + ret = PTR_ERR(*device); + } } return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 23e9285d88de..e7811473024d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -410,9 +410,6 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev); -int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device **device); int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, const char *devpath, struct btrfs_device **device); From a27a94c2b0c727517c17cf2ca3a9f7291caadfbc Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 3 Sep 2018 12:46:14 +0300 Subject: [PATCH 29/93] btrfs: Make btrfs_find_device_by_devspec return btrfs_device directly Instead of returning an error value and using one of the parameters for returning the actual object we are interested in just refactor the function to directly return btrfs_device *. Also bubble up the error handling for the special BTRFS_ERROR_DEV_MISSING_NOT_FOUND value into btrfs_rm_device. No functional changes. 
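With this and the previous patch, resolving a device spec becomes a single call; a usage sketch:

    /*
     * devid != 0 selects the device by id; devid == 0 falls back to
     * the path, where the literal "missing" picks a device that has
     * no backing bdev.
     */
    device = btrfs_find_device_by_devspec(fs_info, devid, devpath);
    if (IS_ERR(device))
            return PTR_ERR(device);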
Signed-off-by: Nikolay Borisov Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 8 ++++---- fs/btrfs/volumes.c | 39 ++++++++++++++++++--------------------- fs/btrfs/volumes.h | 6 +++--- 3 files changed, 25 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index dec01970d8c5..4e2b67d06305 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -409,10 +409,10 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - ret = btrfs_find_device_by_devspec(fs_info, srcdevid, - srcdev_name, &src_device); - if (ret) - return ret; + src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, + srcdev_name); + if (IS_ERR(src_device)) + return PTR_ERR(src_device); ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, src_device, &tgt_device); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b39e4bd6457f..86c2330ac83f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1889,10 +1889,16 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (ret) goto out; - ret = btrfs_find_device_by_devspec(fs_info, devid, device_path, - &device); - if (ret) + device = btrfs_find_device_by_devspec(fs_info, devid, device_path); + + if (IS_ERR(device)) { + if (PTR_ERR(device) == -ENOENT && + strcmp(device_path, "missing") == 0) + ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; + else + ret = PTR_ERR(device); goto out; + } if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; @@ -2163,30 +2169,21 @@ static struct btrfs_device *btrfs_find_device_missing_or_by_path( /* * Lookup a device given by device id, or the path if the id is 0. 
*/ -int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, - const char *devpath, - struct btrfs_device **device) +struct btrfs_device *btrfs_find_device_by_devspec( + struct btrfs_fs_info *fs_info, u64 devid, const char *devpath) { - int ret = 0; + struct btrfs_device *device; if (devid) { - *device = btrfs_find_device(fs_info, devid, NULL, NULL); - if (!*device) - ret = -ENOENT; + device = btrfs_find_device(fs_info, devid, NULL, NULL); + if (!device) + return ERR_PTR(-ENOENT); } else { if (!devpath || !devpath[0]) - return -EINVAL; - - *device = btrfs_find_device_missing_or_by_path(fs_info, devpath); - if (IS_ERR(*device)) { - if (PTR_ERR(*device) == -ENOENT && - strcmp(devpath, "missing") == 0) - ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; - else - ret = PTR_ERR(*device); - } + return ERR_PTR(-EINVAL); + device = btrfs_find_device_missing_or_by_path(fs_info, devpath); } - return ret; + return device; } /* diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index e7811473024d..aefce895e994 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -410,9 +410,9 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev); -int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, - const char *devpath, - struct btrfs_device **device); +struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, + u64 devid, + const char *devpath); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); From 3b2fd8016069de59f73caca06f7ad60c25f3eb92 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Sat, 8 Sep 2018 04:41:40 +0800 Subject: [PATCH 30/93] Btrfs: use args in the correct order for kcalloc in btrfsic_read_block kcalloc is defined as: kcalloc(size_t n, size_t size, gfp_t flags) Although this won't cause problems in practice, btrfsic_read_block() uses kcalloc with n and size in the opposite order. Reviewed-by: Omar Sandoval Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 833cf3c35b4d..2e43fba44035 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1594,6 +1594,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, { unsigned int num_pages; unsigned int i; + size_t size; u64 dev_bytenr; int ret; @@ -1608,9 +1609,8 @@ static int btrfsic_read_block(struct btrfsic_state *state, num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> PAGE_SHIFT; - block_ctx->mem_to_free = kcalloc(sizeof(*block_ctx->datav) + - sizeof(*block_ctx->pagev), - num_pages, GFP_NOFS); + size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev); + block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS); if (!block_ctx->mem_to_free) return -ENOMEM; block_ctx->datav = block_ctx->mem_to_free; From 3cf5068f3d068d788871b0da1a0348048e68b9d6 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 12 Sep 2018 06:06:26 +0800 Subject: [PATCH 31/93] Btrfs: unify error handling of btrfs_lookup_dir_item Unify the error handling of directory item lookups using IS_ERR_OR_NULL. No functional changes. 
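The unified pattern collapses the separate NULL and IS_ERR branches into one; a sketch:

    di = btrfs_lookup_dir_item(NULL, root, path, dir, name, name_len, 0);
    if (IS_ERR_OR_NULL(di)) {
            /* NULL means "not found", an ERR_PTR carries the real errno */
            ret = di ? PTR_ERR(di) : -ENOENT;
            goto out;
    }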
Signed-off-by: Liu Bo Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 21 +++++---------------- fs/btrfs/send.c | 8 ++------ 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e8a954c47f65..f2c38f5ff6e2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3907,12 +3907,8 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, path->leave_spinning = 1; di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto err; - } - if (!di) { - ret = -ENOENT; + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; goto err; } leaf = path->nodes[0]; @@ -4072,10 +4068,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); if (IS_ERR_OR_NULL(di)) { - if (!di) - ret = -ENOENT; - else - ret = PTR_ERR(di); + ret = di ? PTR_ERR(di) : -ENOENT; goto out; } @@ -5467,12 +5460,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), name, namelen, 0); - if (!di) { - ret = -ENOENT; - goto out; - } - if (IS_ERR(di)) { - ret = PTR_ERR(di); + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; goto out; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index bd5756504709..094cc1444a90 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1693,12 +1693,8 @@ static int lookup_dir_item_inode(struct btrfs_root *root, di = btrfs_lookup_dir_item(NULL, root, path, dir, name, name_len, 0); - if (!di) { - ret = -ENOENT; - goto out; - } - if (IS_ERR(di)) { - ret = PTR_ERR(di); + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; goto out; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); From 98e6b1eb4022f2eb9845f0da5f16c179e5f32b9f Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 12 Sep 2018 06:06:23 +0800 Subject: [PATCH 32/93] Btrfs: remove unnecessary level check in balance_level In the callchain: btrfs_search_slot() if (level != 0) setup_nodes_for_search() balance_level() It is just impossible to have level=0 in balance_level, we can drop the check. Signed-off-by: Liu Bo Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1124d236291d..6178fadf80a1 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1815,8 +1815,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, int orig_slot = path->slots[level]; u64 orig_ptr; - if (level == 0) - return 0; + ASSERT(level > 0); mid = path->nodes[level]; From 51995c399b73dacb9d84375ad5a8fda3aced03ab Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 14 Sep 2018 01:46:08 +0800 Subject: [PATCH 33/93] Btrfs: assert page dirty bit on extent buffer pages Just in case that someone breaks the rule that pages are dirty as long as eb is dirty. The next patch will dirty the pages conditionally. 
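For context, set_extent_buffer_dirty() learns whether the buffer was already dirty from the eb flag bit; the unchanged head of the function is, if memory serves, roughly:

    was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);

which is the flag the asserted rule is keyed on: once EXTENT_BUFFER_DIRTY is set, every page of the eb must be PageDirty.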
Signed-off-by: Liu Bo Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 43bf4ec891c6..ce2aa692b444 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5176,6 +5176,12 @@ int set_extent_buffer_dirty(struct extent_buffer *eb) for (i = 0; i < num_pages; i++) set_page_dirty(eb->pages[i]); + +#ifdef CONFIG_BTRFS_DEBUG + for (i = 0; i < num_pages; i++) + ASSERT(PageDirty(eb->pages[i])); +#endif + return was_dirty; } From abb57ef3ff9720c42bbc06bcd1788da9ce1a3eb8 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 14 Sep 2018 01:44:42 +0800 Subject: [PATCH 34/93] Btrfs: skip set_page_dirty if eb pages are already dirty As long as @eb is marked with EXTENT_BUFFER_DIRTY, all of its pages are dirty, so there is no need to set the pages dirty again. Ftrace showed that the loop took 10us on my dev box, so removing this can save us at least 10us if eb is already dirty, and otherwise avoids potentially expensive calls to set_page_dirty. Signed-off-by: Liu Bo Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 +++++---- fs/btrfs/extent_io.h | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ce2aa692b444..ba2a13e908a9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5160,11 +5160,11 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); } -int set_extent_buffer_dirty(struct extent_buffer *eb) +bool set_extent_buffer_dirty(struct extent_buffer *eb) { int i; int num_pages; - int was_dirty = 0; + bool was_dirty; check_buffer_tree_ref(eb); @@ -5174,8 +5174,9 @@ int set_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); - for (i = 0; i < num_pages; i++) - set_page_dirty(eb->pages[i]); + if (!was_dirty) + for (i = 0; i < num_pages; i++) + set_page_dirty(eb->pages[i]); #ifdef CONFIG_BTRFS_DEBUG for (i = 0; i < num_pages; i++) ASSERT(PageDirty(eb->pages[i])); #endif diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b4d03e677e1d..f2ab42d57f02 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -479,7 +479,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); void clear_extent_buffer_dirty(struct extent_buffer *eb); -int set_extent_buffer_dirty(struct extent_buffer *eb); +bool set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_under_io(struct extent_buffer *eb); From 4183c52ce8871544ea94054ee47c685ecae714f6 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 12 Sep 2018 06:06:22 +0800 Subject: [PATCH 35/93] Btrfs: remove wait_ordered_range in btrfs_evict_inode When we delete an inode, btrfs_evict_inode() { truncate_inode_pages_final() truncate_inode_pages_range() lock_page() truncate_cleanup_page() btrfs_invalidatepage() wait_on_page_writeback btrfs_lookup_ordered_range() cancel_dirty_page() unlock_page() ... btrfs_wait_ordered_range() ...
As the VFS has called ->invalidatepage() to get all ordered extents done (if there are any) and truncated all page cache pages (no dirty pages to writeback after this step), btrfs_wait_ordered_range() is just a noop. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f2c38f5ff6e2..e80304e4f16a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5362,9 +5362,6 @@ void btrfs_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; - /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ - if (!special_file(inode->i_mode)) - btrfs_wait_ordered_range(inode, 0, (u64)-1); btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); From 28bee489828ca7da9f9a0247a3fbe695fc0df6b4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 13 Sep 2018 11:35:00 +0300 Subject: [PATCH 36/93] btrfs: Remove logically dead code from btrfs_orphan_cleanup In btrfs_orphan_cleanup the final 'if (ret) goto out' cannot ever be executed. This is due to the last assignment to 'ret' depending on the return value of btrfs_iget. If an error other than -ENOENT is returned then the loop is prematurely terminated by 'goto out'. On the other hand, if the error value is ENOENT then a subsequent if branch is executed that always re-assigns 'ret' and in case it's an error just terminates the loop. No functional changes. Coverity-id: 1437392 Signed-off-by: Nikolay Borisov Reviewed-by: Lu Fengqi Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e80304e4f16a..8ba7e5d5071e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3468,8 +3468,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* this will do delete_inode and everything for us */ iput(inode); - if (ret) - goto out; } /* release the path since we're done with it */ btrfs_release_path(path); From 315bed43fea532650933e7bba316a7601d439edf Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 13 Sep 2018 11:35:10 +0300 Subject: [PATCH 37/93] btrfs: handle error of get_old_root In btrfs_search_old_slot get_old_root is always used with the assumption it cannot fail. However, this is not true: in rare circumstances it can fail and return NULL. This will lead to a NULL pointer dereference when the header is read. Fix this by checking the return value and properly handling NULL by setting ret to -EIO and returning gracefully. Coverity-id: 1087503 Signed-off-by: Nikolay Borisov Reviewed-by: Lu Fengqi Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6178fadf80a1..0a6c645fab0a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2960,6 +2960,10 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, again: b = get_old_root(root, time_seq); + if (!b) { + ret = -EIO; + goto done; + } level = btrfs_header_level(b); p->locks[level] = BTRFS_READ_LOCK; From c1766dd7829802b3a451682234497a8539763bd1 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Thu, 13 Sep 2018 11:01:15 +0800 Subject: [PATCH 38/93] btrfs: change remove_extent_mapping to return void remove_extent_mapping uses the variable "ret" for the return value, but it is not modified after initialization.
Further, none of the callers handles the return value, and the callees are only simple functions, so the return value does not need to be passed. Signed-off-by: zhong jiang Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 5 +---- fs/btrfs/extent_map.h | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6648d55e5339..2382b949cef8 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -428,16 +428,13 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, * Removes @em from @tree. No reference counts are dropped, and no checks * are done to see if the range is in use */ -int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { - int ret = 0; - WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); rb_erase(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) list_del_init(&em->list); RB_CLEAR_NODE(&em->rb_node); - return ret; } void replace_extent_mapping(struct extent_map_tree *tree, diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 25d985e7532a..a7ceb30449fc 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -78,7 +78,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); -int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); +void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); void replace_extent_mapping(struct extent_map_tree *tree, struct extent_map *cur, struct extent_map *new, From 5c06147128fbbdf7a84232c5f0d808f53153defe Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 6 Sep 2018 15:52:17 -0400 Subject: [PATCH 39/93] btrfs: fix error handling in btrfs_dev_replace_start When we fail to start a transaction in btrfs_dev_replace_start, we leave dev_replace->replace_state set to STARTED but clear ->srcdev and ->tgtdev. Later, that can result in an Oops in btrfs_dev_replace_progress, because having the state set to STARTED or SUSPENDED implies that ->srcdev is valid. Also fix error handling when the state is already STARTED or SUSPENDED while starting. That, too, will clear ->srcdev and ->tgtdev even though it doesn't own them. This should be an impossible case to hit since we should be protected by the BTRFS_FS_EXCL_OP bit being set. Let's add an ASSERT there while we're at it.
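A sketch of the state-machine guard after this patch (matching the diff below; ASSERT is btrfs's debug-build-only check):

    case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
    case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
            ASSERT(0); /* unreachable: BTRFS_FS_EXCL_OP excludes a second start */
            ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
            goto leave;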
Fixes: e93c89c1aaaaa (Btrfs: add new sources for device replace code) CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 4e2b67d06305..ff01740158aa 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -440,6 +440,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + ASSERT(0); ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; goto leave; } @@ -482,6 +483,10 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_dev_replace_write_lock(dev_replace); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; goto leave; } @@ -503,8 +508,6 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret; leave: - dev_replace->srcdev = NULL; - dev_replace->tgtdev = NULL; btrfs_dev_replace_write_unlock(dev_replace); btrfs_destroy_dev_replace_tgtdev(tgt_device); return ret; From 93bba24d4b5ad1e5cd8b43f64e66ff9d6355dd20 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 7 Sep 2018 14:16:23 +0800 Subject: [PATCH 40/93] btrfs: Enhance btrfs_trim_fs function to handle error better Function btrfs_trim_fs() doesn't handle errors in a consistent way. If an error happens when trimming existing block groups, it skips the remaining block groups and continues to trim unallocated space for each device. The return value will only reflect the final error from device trimming. This patch fixes such behavior by: 1) Recording the last error from block group or device trimming The return value will also reflect the last error during trimming, making developers more aware of the problem. 2) Continuing trimming if possible If we failed to trim one block group or device, we could still try the next block group or device. 3) Reporting the number of failures during block group and device trimming This is less noisy, but still gives the user a brief summary of what went wrong. Such behavior avoids confusion in cases like a failure to trim the first block group, after which only unallocated space would be trimmed. Reported-by: Chris Murphy CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba [ add bg_ret and dev_ret to the messages ] Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 49 ++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 528f3308c32b..5dbb3f713125 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10833,6 +10833,15 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, return ret; } +/* + * Trim the whole filesystem by: + * 1) trimming the free space in each block group + * 2) trimming the unallocated space on each device + * + * This will also continue trimming even if a block group or device encounters + * an error. The return value will be the last error, or 0 if nothing bad + * happens.
+ */ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) { struct btrfs_block_group_cache *cache = NULL; @@ -10843,6 +10852,10 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) u64 end; u64 trimmed = 0; u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); + u64 bg_failed = 0; + u64 dev_failed = 0; + int bg_ret = 0; + int dev_ret = 0; int ret = 0; /* @@ -10853,7 +10866,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) else cache = btrfs_lookup_block_group(fs_info, range->start); - while (cache) { + for (; cache; cache = next_block_group(fs_info, cache)) { if (cache->key.objectid >= (range->start + range->len)) { btrfs_put_block_group(cache); break; @@ -10867,13 +10880,15 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) if (!block_group_cache_done(cache)) { ret = cache_block_group(cache, 0); if (ret) { - btrfs_put_block_group(cache); - break; + bg_failed++; + bg_ret = ret; + continue; } ret = wait_block_group_cache_done(cache); if (ret) { - btrfs_put_block_group(cache); - break; + bg_failed++; + bg_ret = ret; + continue; } } ret = btrfs_trim_block_group(cache, @@ -10884,28 +10899,40 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) trimmed += group_trimmed; if (ret) { - btrfs_put_block_group(cache); - break; + bg_failed++; + bg_ret = ret; + continue; } } - - cache = next_block_group(fs_info, cache); } + if (bg_failed) + btrfs_warn(fs_info, + "failed to trim %llu block group(s), last error %d", + bg_failed, bg_ret); mutex_lock(&fs_info->fs_devices->device_list_mutex); devices = &fs_info->fs_devices->alloc_list; list_for_each_entry(device, devices, dev_alloc_list) { ret = btrfs_trim_free_extents(device, range->minlen, &group_trimmed); - if (ret) + if (ret) { + dev_failed++; + dev_ret = ret; break; + } trimmed += group_trimmed; } mutex_unlock(&fs_info->fs_devices->device_list_mutex); + if (dev_failed) + btrfs_warn(fs_info, + "failed to trim %llu device(s), last error %d", + dev_failed, dev_ret); range->len = trimmed; - return ret; + if (bg_ret) + return bg_ret; + return dev_ret; } /* From 6ba9fc8e628becf0e3ec94083450d089b0dec5f5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 7 Sep 2018 14:16:24 +0800 Subject: [PATCH 41/93] btrfs: Ensure btrfs_trim_fs can trim the whole filesystem [BUG] fstrim on some btrfs only trims the unallocated space, not trimming any space in existing block groups. [CAUSE] Before fstrim_range is passed to btrfs_trim_fs(), it gets truncated to the range [0, super->total_bytes). So later btrfs_trim_fs() will only be able to trim block groups in the range [0, super->total_bytes). For btrfs, however, any bytenr aligned to sectorsize is valid, since btrfs uses its logical address space and there is nothing limiting the location where we put block groups. For a filesystem with frequent balance, it's quite easy to relocate all block groups so that the bytenr of block groups starts beyond super->total_bytes. In that case, btrfs will not trim existing block groups. [FIX] Just remove the truncation in btrfs_ioctl_fitrim(), so btrfs_trim_fs() can get the unmodified range, which is normally set to [0, U64_MAX].
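For reference, the range comes in through the generic FITRIM ioctl; a minimal self-contained userspace example (the mount point is illustrative):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>

    int main(void)
    {
            /* trim everything the filesystem can reach */
            struct fstrim_range range = { .start = 0, .len = UINT64_MAX, .minlen = 0 };
            int fd = open("/mnt/btrfs", O_RDONLY);

            if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
                    perror("FITRIM");
                    return 1;
            }
            printf("trimmed %llu bytes\n", (unsigned long long)range.len);
            return 0;
    }

With the fix, such a [0, U64_MAX] range reaches btrfs_trim_fs() untruncated, so block groups located beyond super->total_bytes are trimmed too.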
Reported-by: Chris Murphy Fixes: f4c697e6406d ("btrfs: return EINVAL if start > total_bytes in fitrim ioctl") CC: # v4.4+ Signed-off-by: Qu Wenruo Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 10 +--------- fs/btrfs/ioctl.c | 11 +++++++---- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5dbb3f713125..da3257585e29 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10851,21 +10851,13 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) u64 start; u64 end; u64 trimmed = 0; - u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); u64 bg_failed = 0; u64 dev_failed = 0; int bg_ret = 0; int dev_ret = 0; int ret = 0; - /* - * try to trim all FS space, our block group may start from non-zero. - */ - if (range->len == total_bytes) - cache = btrfs_lookup_first_block_group(fs_info, range->start); - else - cache = btrfs_lookup_block_group(fs_info, range->start); - + cache = btrfs_lookup_first_block_group(fs_info, range->start); for (; cache; cache = next_block_group(fs_info, cache)) { if (cache->key.objectid >= (range->start + range->len)) { btrfs_put_block_group(cache); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4905d13dee0a..a990a9045139 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -491,7 +491,6 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; - u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -515,11 +514,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) return -EOPNOTSUPP; if (copy_from_user(&range, arg, sizeof(range))) return -EFAULT; - if (range.start > total_bytes || - range.len < fs_info->sb->s_blocksize) + + /* + * NOTE: Don't truncate the range using super->total_bytes. Bytenr of + * block group is in the logical address space, which can be any + * sectorsize aligned bytenr in the range [0, U64_MAX]. + */ + if (range.len < fs_info->sb->s_blocksize) return -EINVAL; - range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(fs_info, &range); if (ret < 0) From d4e329de5e5e21594df2e0dd59da9acee71f133b Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 6 Sep 2018 17:18:14 -0400 Subject: [PATCH 42/93] btrfs: iterate all devices during trim, instead of fs_devices::alloc_list btrfs_trim_fs iterates over the fs_devices->alloc_list while holding the device_list_mutex. The problem is that ->alloc_list is protected by the chunk mutex. We don't want to hold the chunk mutex over the trim of the entire file system. Fortunately, the ->dev_list list is protected by the dev_list mutex and while it will give us all devices, including read-only devices, we already just skip the read-only devices. Then we can continue to take and release the chunk mutex while scanning each device. 
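A sketch of the iteration after this change (matching the diff that follows):

    mutex_lock(&fs_info->fs_devices->device_list_mutex);
    devices = &fs_info->fs_devices->devices; /* all devices, not just ->alloc_list */
    list_for_each_entry(device, devices, dev_list) {
            /* read-only devices are skipped inside the helper anyway */
            ret = btrfs_trim_free_extents(device, range->minlen,
                                          &group_trimmed);
            if (ret)
                    break;
            trimmed += group_trimmed;
    }
    mutex_unlock(&fs_info->fs_devices->device_list_mutex);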
Fixes: 499f377f49f ("btrfs: iterate over unused chunk space in FITRIM") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index da3257585e29..41fb24da7afa 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10903,8 +10903,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) "failed to trim %llu block group(s), last error %d", bg_failed, bg_ret); mutex_lock(&fs_info->fs_devices->device_list_mutex); - devices = &fs_info->fs_devices->alloc_list; - list_for_each_entry(device, devices, dev_alloc_list) { + devices = &fs_info->fs_devices->devices; + list_for_each_entry(device, devices, dev_list) { ret = btrfs_trim_free_extents(device, range->minlen, &group_trimmed); if (ret) { From 0be88e367fd8fbdb45257615d691f4675dda062f Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 6 Sep 2018 17:18:15 -0400 Subject: [PATCH 43/93] btrfs: don't attempt to trim devices that don't support it We check whether any device the file system is using supports discard in the ioctl call, but then we attempt to trim free extents on every device regardless of whether discard is supported. Due to the way we mask off EOPNOTSUPP, we can end up issuing the trim operations on each free range on devices that don't support it, just wasting time. Fixes: 499f377f49f08 ("btrfs: iterate over unused chunk space in FITRIM") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 41fb24da7afa..6e1c2a22874f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10772,6 +10772,10 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, *trimmed = 0; + /* Discard not supported = nothing to do. */ + if (!blk_queue_discard(bdev_get_queue(device->bdev))) + return 0; + /* Not writeable = nothing to do. */ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return 0; From fee7acc361314df6561208c2d3c0882d663dd537 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 6 Sep 2018 17:18:16 -0400 Subject: [PATCH 44/93] btrfs: keep trim from interfering with transaction commits Commit 499f377f49f08 (btrfs: iterate over unused chunk space in FITRIM) fixed free space trimming, but introduced latency when it was running. This is due to it pinning the transaction using both an incremented refcount and holding the commit root sem for the duration of a single trim operation. This was to ensure safety but it's unnecessary. We already hold the chunk mutex so we know that the chunk we're using can't be allocated while we're trimming it. In order to check against chunks allocated already in this transaction, we need to check the pending chunks list. To do that safely without joining the transaction (or attaching and then having to commit it) we need to ensure that the dev root's commit root doesn't change underneath us and the pending chunks list stays around until we're done with it. We can ensure the former by holding the commit root sem and the latter by pinning the transaction. We do this now, but the critical section covers the trim operation itself and we don't need to do that.
This patch moves the pinning and unpinning logic into helpers and unpins the transaction after performing the search and check for pending chunks. Limiting the critical section of the transaction pinning improves the latency substantially on slower storage (e.g. image files over NFS). Fixes: 499f377f49f08 ("btrfs: iterate over unused chunk space in FITRIM") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6e1c2a22874f..f0524bdf49b3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10755,14 +10755,16 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, * We don't want a transaction for this since the discard may take a * substantial amount of time. We don't require that a transaction be * running, but we do need to take a running transaction into account - * to ensure that we're not discarding chunks that were released in - * the current transaction. + * to ensure that we're not discarding chunks that were released or + * allocated in the current transaction. * * Holding the chunks lock will prevent other threads from allocating * or releasing chunks, but it won't prevent a running transaction * from committing and releasing the memory that the pending chunks * list head uses. For that, we need to take a reference to the - * transaction. + * transaction and hold the commit root sem. We only need to hold + * it while performing the free space search since we have already + * held back allocations. */ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 minlen, u64 *trimmed) @@ -10793,9 +10795,13 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, ret = mutex_lock_interruptible(&fs_info->chunk_mutex); if (ret) - return ret; + break; - down_read(&fs_info->commit_root_sem); + ret = down_read_killable(&fs_info->commit_root_sem); + if (ret) { + mutex_unlock(&fs_info->chunk_mutex); + break; + } spin_lock(&fs_info->trans_lock); trans = fs_info->running_transaction; @@ -10803,13 +10809,17 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); + if (!trans) + up_read(&fs_info->commit_root_sem); + ret = find_free_dev_extent_start(trans, device, minlen, start, &start, &len); - if (trans) + if (trans) { + up_read(&fs_info->commit_root_sem); btrfs_put_transaction(trans); + } if (ret) { - up_read(&fs_info->commit_root_sem); mutex_unlock(&fs_info->chunk_mutex); if (ret == -ENOSPC) ret = 0; @@ -10817,7 +10827,6 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, } ret = btrfs_issue_discard(device->bdev, start, len, &bytes); - up_read(&fs_info->commit_root_sem); mutex_unlock(&fs_info->chunk_mutex); if (ret) From 3aa7c7a31c26321696b92841d5103461c6f3f517 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 12 Sep 2018 10:45:45 -0400 Subject: [PATCH 45/93] btrfs: wait on caching when putting the bg cache While testing my backport I noticed there was a panic if I ran generic/416 generic/417 generic/418 all in a row. This just happened to uncover a race where we had outstanding IO after we destroy all of our workqueues, and then we'd go to queue the endio work on those free'd workqueues. 
This is because we aren't waiting for the caching threads to finish before freeing everything up, so to fix this, make sure we wait on any outstanding caching before we free up the block group. That way we're sure to be done with all IO by the time we get to btrfs_stop_all_workers(). This fixes the panic I was seeing consistently in testing. ------------[ cut here ]------------ kernel BUG at fs/btrfs/volumes.c:6112! SMP PTI Modules linked in: CPU: 1 PID: 27165 Comm: kworker/u4:7 Not tainted 4.16.0-02155-g3553e54a578d-dirty #875 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 Workqueue: btrfs-cache btrfs_cache_helper RIP: 0010:btrfs_map_bio+0x346/0x370 RSP: 0000:ffffc900061e79d0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff880071542e00 RCX: 0000000000533000 RDX: ffff88006bb74380 RSI: 0000000000000008 RDI: ffff880078160000 RBP: 0000000000000001 R08: ffff8800781cd200 R09: 0000000000503000 R10: ffff88006cd21200 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: ffff8800781cd200 R15: ffff880071542e00 FS: 0000000000000000(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000817ffc4 CR3: 0000000078314000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btree_submit_bio_hook+0x8a/0xd0 submit_one_bio+0x5d/0x80 read_extent_buffer_pages+0x18a/0x320 btree_read_extent_buffer_pages+0xbc/0x200 ? alloc_extent_buffer+0x359/0x3e0 read_tree_block+0x3d/0x60 read_block_for_search.isra.30+0x1a5/0x360 btrfs_search_slot+0x41b/0xa10 btrfs_next_old_leaf+0x212/0x470 caching_thread+0x323/0x490 normal_work_helper+0xc5/0x310 process_one_work+0x141/0x340 worker_thread+0x44/0x3c0 kthread+0xf8/0x130 ? process_one_work+0x340/0x340 ? kthread_bind+0x10/0x10 ret_from_fork+0x35/0x40 RIP: btrfs_map_bio+0x346/0x370 RSP: ffffc900061e79d0 ---[ end trace 827eb13e50846033 ]--- Kernel panic - not syncing: Fatal exception Kernel Offset: disabled ---[ end Kernel panic - not syncing: Fatal exception CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Josef Bacik Reviewed-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f0524bdf49b3..686542318215 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9615,6 +9615,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) block_group = btrfs_lookup_first_block_group(info, last); while (block_group) { + wait_block_group_cache_done(block_group); spin_lock(&block_group->lock); if (block_group->iref) break; From 5c9d028b3b174e5cf3678a7b0c14e21e51665793 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 23 Aug 2018 03:51:49 +0800 Subject: [PATCH 46/93] Btrfs: delayed-refs: use rb_first_cached for href_root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rb_first_cached() trades an extra pointer "leftmost" for doing the same job as rb_first() but in O(1). Functions manipulating href_root need to get the first entry; this converts href_root to use rb_first_cached(). This patch is the first in a sequence of similar updates to other rbtrees, and this is an analysis of the expected behaviour and improvements.
There's a common pattern:

	while (node = rb_first) {
		entry = rb_entry(node)
		next = rb_next(node)
		rb_erase(node)
		cleanup(entry)
	}

rb_first needs to traverse the tree up to logN depth, while rb_erase can completely reshuffle the tree. With the caching we'll skip the traversal in rb_first. That's a cached memory access vs looped pointer dereference trade-off that IMHO has a clear winner. Measurements show there's not much difference in a sample tree with 10000 nodes: 4.5s / rb_first and 4.8s / rb_first_cached. Real effects of caching and pointer chasing are unpredictable though. Further optimizations can be done to avoid the expensive rb_erase step. In some cases it's ok to process the nodes in any order, so the tree can be traversed in post-order, not rebalancing the child nodes and just calling free. Care must be taken regarding the next node. Tested-by: Holger Hoffstätte Signed-off-by: Liu Bo Reviewed-by: David Sterba [ update changelog from mail discussions ] Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 30 +++++++++++++++++------------- fs/btrfs/delayed-ref.h | 2 +- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/extent-tree.c | 6 +++--- fs/btrfs/transaction.c | 5 +++-- 5 files changed, 26 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 62ff545ba1f7..f07952e16a3b 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -101,14 +101,15 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, } /* insert a new ref to head ref rbtree */ -static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, +static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root, struct rb_node *node) { - struct rb_node **p = &root->rb_node; + struct rb_node **p = &root->rb_root.rb_node; struct rb_node *parent_node = NULL; struct btrfs_delayed_ref_head *entry; struct btrfs_delayed_ref_head *ins; u64 bytenr; + bool leftmost = true; ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); bytenr = ins->bytenr; @@ -117,16 +118,18 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, href_node); - if (bytenr < entry->bytenr) + if (bytenr < entry->bytenr) { p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) + } else if (bytenr > entry->bytenr) { p = &(*p)->rb_right; - else + leftmost = false; + } else { return entry; + } } rb_link_node(node, parent_node, p); - rb_insert_color(node, root); + rb_insert_color_cached(node, root, leftmost); return NULL; } @@ -164,10 +167,11 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root, * If return_bigger is given, the next bigger entry is returned if no exact * match is found.
*/ -static struct btrfs_delayed_ref_head * -find_ref_head(struct rb_root *root, u64 bytenr, - int return_bigger) +static struct btrfs_delayed_ref_head* find_ref_head( + struct btrfs_delayed_ref_root *dr, u64 bytenr, + int return_bigger) { + struct rb_root *root = &dr->href_root.rb_root; struct rb_node *n; struct btrfs_delayed_ref_head *entry; @@ -187,7 +191,7 @@ find_ref_head(struct rb_root *root, u64 bytenr, if (bytenr > entry->bytenr) { n = rb_next(&entry->href_node); if (!n) - n = rb_first(root); + n = rb_first_cached(&dr->href_root); entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); return entry; @@ -357,12 +361,12 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans) again: start = delayed_refs->run_delayed_start; - head = find_ref_head(&delayed_refs->href_root, start, 1); + head = find_ref_head(delayed_refs, start, 1); if (!head && !loop) { delayed_refs->run_delayed_start = 0; start = 0; loop = true; - head = find_ref_head(&delayed_refs->href_root, start, 1); + head = find_ref_head(delayed_refs, start, 1); if (!head) return NULL; } else if (!head && loop) { @@ -903,7 +907,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) { - return find_ref_head(&delayed_refs->href_root, bytenr, 0); + return find_ref_head(delayed_refs, bytenr, 0); } void __cold btrfs_delayed_ref_exit(void) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d9f2a4ebd5db..88438b6cee45 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -148,7 +148,7 @@ struct btrfs_delayed_data_ref { struct btrfs_delayed_ref_root { /* head ref rbtree */ - struct rb_root href_root; + struct rb_root_cached href_root; /* dirty extent records */ struct rb_root dirty_extent_root; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3611df2ce5c1..a21a96786813 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4203,7 +4203,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } - while ((node = rb_first(&delayed_refs->href_root)) != NULL) { + while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; struct rb_node *n; bool pin_bytes = false; @@ -4239,7 +4239,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (head->processing == 0) delayed_refs->num_heads_ready--; atomic_dec(&delayed_refs->num_entries); - rb_erase(&head->href_node, &delayed_refs->href_root); + rb_erase_cached(&head->href_node, &delayed_refs->href_root); RB_CLEAR_NODE(&head->href_node); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 686542318215..30b3d8561768 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2454,7 +2454,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, return 1; } delayed_refs->num_heads--; - rb_erase(&head->href_node, &delayed_refs->href_root); + rb_erase_cached(&head->href_node, &delayed_refs->href_root); RB_CLEAR_NODE(&head->href_node); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); @@ -2940,7 +2940,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); - node = rb_first(&delayed_refs->href_root); + node = rb_first_cached(&delayed_refs->href_root); if (!node) { spin_unlock(&delayed_refs->lock); goto out; @@ -6929,7 +6929,7 @@ static noinline 
int check_ref_cleanup(struct btrfs_trans_handle *trans, * at this point we have a head with no other entries. Go * ahead and process it. */ - rb_erase(&head->href_node, &delayed_refs->href_root); + rb_erase_cached(&head->href_node, &delayed_refs->href_root); RB_CLEAR_NODE(&head->href_node); atomic_dec(&delayed_refs->num_entries); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e7856e15adbf..3b1cc978d409 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -44,7 +44,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) WARN_ON(refcount_read(&transaction->use_count) == 0); if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); - WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); + WARN_ON(!RB_EMPTY_ROOT( + &transaction->delayed_refs.href_root.rb_root)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, "pending csums is %llu", @@ -245,7 +246,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); - cur_trans->delayed_refs.href_root = RB_ROOT; + cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); From e3d039656384288bbe952413d8d404b3035fe7d7 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 23 Aug 2018 03:51:50 +0800 Subject: [PATCH 47/93] Btrfs: delayed-refs: use rb_first_cached for ref_tree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rb_first_cached() trades an extra pointer "leftmost" for doing the same job as rb_first() but in O(1). Functions manipulating href->ref_tree need to get the first entry; this converts href->ref_tree to use rb_first_cached(). For more details about the optimization see patch "Btrfs: delayed-refs: use rb_first_cached for href_root".
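The conversion pattern repeated across these patches is mechanical enough to summarize in a standalone sketch. The struct item below is a hypothetical example type, not btrfs code; the API calls (RB_ROOT_CACHED, rb_insert_color_cached, rb_first_cached, rb_erase_cached) are the ones these patches rely on. Insertion tracks whether the new node became the leftmost one, so rb_first_cached() degrades to a single pointer read:

#include <linux/rbtree.h>
#include <linux/types.h>

struct item {
	struct rb_node node;
	u64 key;
};

static struct rb_root_cached tree = RB_ROOT_CACHED;

static bool item_insert(struct item *ins)
{
	struct rb_node **p = &tree.rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	while (*p) {
		struct item *entry = rb_entry(*p, struct item, node);

		parent = *p;
		if (ins->key < entry->key) {
			p = &(*p)->rb_left;
		} else if (ins->key > entry->key) {
			/* we went right at least once: not the leftmost */
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			return false;	/* already present */
		}
	}

	rb_link_node(&ins->node, parent, p);
	rb_insert_color_cached(&ins->node, &tree, leftmost);
	return true;
}

/* O(1) smallest-entry lookup; rb_erase_cached() keeps the cache coherent */
static struct item *item_pop_first(void)
{
	struct rb_node *n = rb_first_cached(&tree);

	if (!n)
		return NULL;
	rb_erase_cached(n, &tree);
	RB_CLEAR_NODE(n);
	return rb_entry(n, struct item, node);
}

The extra rb_leftmost pointer maintained by the cached variants is what turns every rb_first() at the conversion sites into a plain load.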
Tested-by: Holger Hoffstätte Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 2 +- fs/btrfs/delayed-ref.c | 24 ++++++++++++++---------- fs/btrfs/delayed-ref.h | 2 +- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/extent-tree.c | 13 +++++++------ 5 files changed, 25 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 84006e3dd105..1854835e082b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -769,7 +769,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key); spin_lock(&head->lock); - for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) { + for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) { node = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); if (node->seq > seq) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index f07952e16a3b..7f567c944fec 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -133,13 +133,14 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root, return NULL; } -static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root, +static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { - struct rb_node **p = &root->rb_node; + struct rb_node **p = &root->rb_root.rb_node; struct rb_node *node = &ins->ref_node; struct rb_node *parent_node = NULL; struct btrfs_delayed_ref_node *entry; + bool leftmost = true; while (*p) { int comp; @@ -148,16 +149,18 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root, entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, ref_node); comp = comp_refs(ins, entry, true); - if (comp < 0) + if (comp < 0) { p = &(*p)->rb_left; - else if (comp > 0) + } else if (comp > 0) { p = &(*p)->rb_right; - else + leftmost = false; + } else { return entry; + } } rb_link_node(node, parent_node, p); - rb_insert_color(node, root); + rb_insert_color_cached(node, root, leftmost); return NULL; } @@ -231,7 +234,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref) { lockdep_assert_held(&head->lock); - rb_erase(&ref->ref_node, &head->ref_tree); + rb_erase_cached(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); @@ -300,7 +303,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, lockdep_assert_held(&head->lock); - if (RB_EMPTY_ROOT(&head->ref_tree)) + if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) return; /* We don't have too many refs to merge for data. 
*/ @@ -318,7 +321,8 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, spin_unlock(&fs_info->tree_mod_seq_lock); again: - for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { + for (node = rb_first_cached(&head->ref_tree); node; + node = rb_next(node)) { ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; @@ -573,7 +577,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; head_ref->is_system = is_system; - head_ref->ref_tree = RB_ROOT; + head_ref->ref_tree = RB_ROOT_CACHED; INIT_LIST_HEAD(&head_ref->ref_add_list); RB_CLEAR_NODE(&head_ref->href_node); head_ref->processing = 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 88438b6cee45..c3e3486a126c 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -79,7 +79,7 @@ struct btrfs_delayed_ref_head { struct mutex mutex; spinlock_t lock; - struct rb_root ref_tree; + struct rb_root_cached ref_tree; /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */ struct list_head ref_add_list; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a21a96786813..517484ecad96 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4221,11 +4221,11 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } spin_lock(&head->lock); - while ((n = rb_first(&head->ref_tree)) != NULL) { + while ((n = rb_first_cached(&head->ref_tree)) != NULL) { ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); ref->in_tree = 0; - rb_erase(&ref->ref_node, &head->ref_tree); + rb_erase_cached(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 30b3d8561768..26fb9cbf3807 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2374,7 +2374,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; - if (RB_EMPTY_ROOT(&head->ref_tree)) + if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) return NULL; /* @@ -2387,7 +2387,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) return list_first_entry(&head->ref_add_list, struct btrfs_delayed_ref_node, add_list); - ref = rb_entry(rb_first(&head->ref_tree), + ref = rb_entry(rb_first_cached(&head->ref_tree), struct btrfs_delayed_ref_node, ref_node); ASSERT(list_empty(&ref->add_list)); return ref; @@ -2448,7 +2448,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&head->lock); spin_lock(&delayed_refs->lock); spin_lock(&head->lock); - if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) { + if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) { spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); return 1; @@ -2597,7 +2597,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, actual_count++; ref->in_tree = 0; - rb_erase(&ref->ref_node, &locked_ref->ref_tree); + rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); @@ -3040,7 +3040,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root, * XXX: We should replace this with a proper search function in the * future. 
*/ - for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { + for (node = rb_first_cached(&head->ref_tree); node; + node = rb_next(node)) { ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { @@ -6908,7 +6909,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out_delayed_unlock; spin_lock(&head->lock); - if (!RB_EMPTY_ROOT(&head->ref_tree)) + if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) goto out; if (head->extent_op) { From 03a1d4c891634dd5b98da865fb783e8b22d4d027 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 23 Aug 2018 03:51:51 +0800 Subject: [PATCH 48/93] Btrfs: delayed-inode: use rb_first_cached for ins_root and del_root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rb_first_cached() trades an extra pointer "leftmost" for doing the same job as rb_first() but in O(1). Functions manipulating delayed_item need to get the first entry; this converts it to use rb_first_cached(). For more details about the optimization see patch "Btrfs: delayed-refs: use rb_first_cached for href_root". Tested-by: Holger Hoffstätte Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 29 ++++++++++++++++------------- fs/btrfs/delayed-inode.h | 4 ++-- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 47ce74cf134a..15dcbd6ef531 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -42,8 +42,8 @@ static inline void btrfs_init_delayed_node( delayed_node->root = root; delayed_node->inode_id = inode_id; refcount_set(&delayed_node->refs, 0); - delayed_node->ins_root = RB_ROOT; - delayed_node->del_root = RB_ROOT; + delayed_node->ins_root = RB_ROOT_CACHED; + delayed_node->del_root = RB_ROOT_CACHED; mutex_init(&delayed_node->mutex); INIT_LIST_HEAD(&delayed_node->n_list); INIT_LIST_HEAD(&delayed_node->p_list); @@ -390,7 +390,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( struct btrfs_delayed_node *delayed_node, struct btrfs_key *key) { - return __btrfs_lookup_delayed_item(&delayed_node->ins_root, key, + return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key, NULL, NULL); } @@ -400,9 +400,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, { struct rb_node **p, *node; struct rb_node *parent_node = NULL; - struct rb_root *root; + struct rb_root_cached *root; struct btrfs_delayed_item *item; int cmp; + bool leftmost = true; if (action == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; @@ -410,7 +411,7 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, root = &delayed_node->del_root; else BUG(); - p = &root->rb_node; + p = &root->rb_root.rb_node; node = &ins->rb_node; while (*p) { @@ -419,16 +420,18 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, rb_node); cmp = btrfs_comp_cpu_keys(&item->key, &ins->key); - if (cmp < 0) + if (cmp < 0) { p = &(*p)->rb_right; - else if (cmp > 0) + leftmost = false; + } else if (cmp > 0) { p = &(*p)->rb_left; - else + } else { return -EEXIST; + } } rb_link_node(node, parent_node, p); - rb_insert_color(node, root); + rb_insert_color_cached(node, root, leftmost); ins->delayed_node = delayed_node; ins->ins_or_del = action; @@ -468,7 +471,7 @@ static void finish_one_item(struct btrfs_delayed_root
*delayed_root) static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { - struct rb_root *root; + struct rb_root_cached *root; struct btrfs_delayed_root *delayed_root; delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; @@ -482,7 +485,7 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) else root = &delayed_item->delayed_node->del_root; - rb_erase(&delayed_item->rb_node, root); + rb_erase_cached(&delayed_item->rb_node, root); delayed_item->delayed_node->count--; finish_one_item(delayed_root); @@ -503,7 +506,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( struct rb_node *p; struct btrfs_delayed_item *item = NULL; - p = rb_first(&delayed_node->ins_root); + p = rb_first_cached(&delayed_node->ins_root); if (p) item = rb_entry(p, struct btrfs_delayed_item, rb_node); @@ -516,7 +519,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( struct rb_node *p; struct btrfs_delayed_item *item = NULL; - p = rb_first(&delayed_node->del_root); + p = rb_first_cached(&delayed_node->del_root); if (p) item = rb_entry(p, struct btrfs_delayed_item, rb_node); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 33536cd681d4..74ae226ffaf0 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -50,8 +50,8 @@ struct btrfs_delayed_node { * is waiting to be dealt with by the async worker. */ struct list_head p_list; - struct rb_root ins_root; - struct rb_root del_root; + struct rb_root_cached ins_root; + struct rb_root_cached del_root; struct mutex mutex; struct btrfs_inode_item inode_item; refcount_t refs; From 07e1ce096db3605f3e0c98695df66a51e2be9f05 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 23 Aug 2018 03:51:52 +0800 Subject: [PATCH 49/93] Btrfs: extent_map: use rb_first_cached MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rb_first_cached() trades an extra pointer "leftmost" for doing the same job as rb_first() but in O(1). As evict_inode_truncate_pages() removes all extent mappings by always looking for the first rb entry, it's helpful to use rb_first_cached instead. For more details about the optimization see patch "Btrfs: delayed-refs: use rb_first_cached for href_root".
Tested-by: Holger Hoffstätte Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 27 +++++++++++++++------------ fs/btrfs/extent_map.h | 2 +- fs/btrfs/inode.c | 4 ++-- fs/btrfs/tests/extent-map-tests.c | 4 ++-- fs/btrfs/volumes.c | 4 ++-- 5 files changed, 22 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 2382b949cef8..7eea8b6e2cd3 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -34,7 +34,7 @@ void __cold extent_map_exit(void) */ void extent_map_tree_init(struct extent_map_tree *tree) { - tree->map = RB_ROOT; + tree->map = RB_ROOT_CACHED; INIT_LIST_HEAD(&tree->modified_extents); rwlock_init(&tree->lock); } @@ -90,24 +90,27 @@ static u64 range_end(u64 start, u64 len) return start + len; } -static int tree_insert(struct rb_root *root, struct extent_map *em) +static int tree_insert(struct rb_root_cached *root, struct extent_map *em) { - struct rb_node **p = &root->rb_node; + struct rb_node **p = &root->rb_root.rb_node; struct rb_node *parent = NULL; struct extent_map *entry = NULL; struct rb_node *orig_parent = NULL; u64 end = range_end(em->start, em->len); + bool leftmost = true; while (*p) { parent = *p; entry = rb_entry(parent, struct extent_map, rb_node); - if (em->start < entry->start) + if (em->start < entry->start) { p = &(*p)->rb_left; - else if (em->start >= extent_map_end(entry)) + } else if (em->start >= extent_map_end(entry)) { p = &(*p)->rb_right; - else + leftmost = false; + } else { return -EEXIST; + } } orig_parent = parent; @@ -130,7 +133,7 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) return -EEXIST; rb_link_node(&em->rb_node, orig_parent, p); - rb_insert_color(&em->rb_node, root); + rb_insert_color_cached(&em->rb_node, root, leftmost); return 0; } @@ -242,7 +245,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); - rb_erase(&merge->rb_node, &tree->map); + rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); free_extent_map(merge); } @@ -254,7 +257,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) if (rb && mergable_maps(em, merge)) { em->len += merge->len; em->block_len += merge->block_len; - rb_erase(&merge->rb_node, &tree->map); + rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); @@ -367,7 +370,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree, struct rb_node *next = NULL; u64 end = range_end(start, len); - rb_node = __tree_search(&tree->map, start, &prev, &next); + rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next); if (!rb_node) { if (prev) rb_node = prev; @@ -431,7 +434,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - rb_erase(&em->rb_node, &tree->map); + rb_erase_cached(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) list_del_init(&em->list); RB_CLEAR_NODE(&em->rb_node); @@ -446,7 +449,7 @@ void replace_extent_mapping(struct extent_map_tree *tree, ASSERT(extent_map_in_tree(cur)); if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) list_del_init(&cur->list); - rb_replace_node(&cur->rb_node, 
&new->rb_node, &tree->map); + rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map); RB_CLEAR_NODE(&cur->rb_node); setup_extent_mapping(tree, new, modified); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index a7ceb30449fc..31977ffd6190 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -49,7 +49,7 @@ struct extent_map { }; struct extent_map_tree { - struct rb_root map; + struct rb_root_cached map; struct list_head modified_extents; rwlock_t lock; }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8ba7e5d5071e..dd2140d3e354 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5222,10 +5222,10 @@ static void evict_inode_truncate_pages(struct inode *inode) truncate_inode_pages_final(&inode->i_data); write_lock(&map_tree->lock); - while (!RB_EMPTY_ROOT(&map_tree->map)) { + while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) { struct extent_map *em; - node = rb_first(&map_tree->map); + node = rb_first_cached(&map_tree->map); em = rb_entry(node, struct extent_map, rb_node); clear_bit(EXTENT_FLAG_PINNED, &em->flags); clear_bit(EXTENT_FLAG_LOGGING, &em->flags); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 385a5316e4bf..bf15d3a7f20e 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -12,8 +12,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) struct extent_map *em; struct rb_node *node; - while (!RB_EMPTY_ROOT(&em_tree->map)) { - node = rb_first(&em_tree->map); + while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) { + node = rb_first_cached(&em_tree->map); em = rb_entry(node, struct extent_map, rb_node); remove_extent_mapping(em_tree, em); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 86c2330ac83f..909c578506ee 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1613,7 +1613,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info) em_tree = &fs_info->mapping_tree.map_tree; read_lock(&em_tree->lock); - n = rb_last(&em_tree->map); + n = rb_last(&em_tree->map.rb_root); if (n) { em = rb_entry(n, struct extent_map, rb_node); ret = em->start + em->len; @@ -7445,7 +7445,7 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) int ret = 0; read_lock(&em_tree->lock); - for (node = rb_first(&em_tree->map); node; node = rb_next(node)) { + for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { em = rb_entry(node, struct extent_map, rb_node); if (em->map_lookup->num_stripes != em->map_lookup->verified_stripes) { From ecf160b424ee648a14116079ff72d7d1241e377d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 23 Aug 2018 03:51:53 +0800 Subject: [PATCH 50/93] Btrfs: preftree: use rb_first_cached MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rb_first_cached() trades an extra pointer "leftmost" for doing the same job as rb_first() but in O(1). While resolving indirect refs and missing refs, the code always looks for the first rb entry in a while loop; it's helpful to use rb_first_cached instead. For more details about the optimization see patch "Btrfs: delayed-refs: use rb_first_cached for href_root".
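The prelim_release() hunk in the diff below also shows the cheaper teardown alluded to earlier in the series: when every node is being freed anyway, a post-order walk can skip rb_erase()'s rebalancing entirely. A rough sketch of that pattern, reusing the hypothetical struct item from the earlier example:

#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/types.h>

struct item {
	struct rb_node node;
	u64 key;
};

static void tree_release(struct rb_root_cached *root)
{
	struct item *entry, *next;

	/*
	 * Post-order traversal frees every node without rebalancing;
	 * the _safe variant caches the next entry before the current
	 * one is freed.
	 */
	rbtree_postorder_for_each_entry_safe(entry, next, &root->rb_root, node)
		kfree(entry);

	*root = RB_ROOT_CACHED;	/* reset both the root and the cached leftmost */
}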
Tested-by: Holger Hoffstätte Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 1854835e082b..68ebe188446a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -112,11 +112,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb, } struct preftree { - struct rb_root root; + struct rb_root_cached root; unsigned int count; }; -#define PREFTREE_INIT { .root = RB_ROOT, .count = 0 } +#define PREFTREE_INIT { .root = RB_ROOT_CACHED, .count = 0 } struct preftrees { struct preftree direct; /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */ @@ -225,14 +225,15 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, struct prelim_ref *newref, struct share_check *sc) { - struct rb_root *root; + struct rb_root_cached *root; struct rb_node **p; struct rb_node *parent = NULL; struct prelim_ref *ref; int result; + bool leftmost = true; root = &preftree->root; - p = &root->rb_node; + p = &root->rb_root.rb_node; while (*p) { parent = *p; @@ -242,6 +243,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, p = &(*p)->rb_left; } else if (result > 0) { p = &(*p)->rb_right; + leftmost = false; } else { /* Identical refs, merge them and free @newref */ struct extent_inode_elem *eie = ref->inode_list; @@ -272,7 +274,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, preftree->count++; trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); rb_link_node(&newref->rbnode, parent, p); - rb_insert_color(&newref->rbnode, root); + rb_insert_color_cached(&newref->rbnode, root, leftmost); } /* @@ -283,11 +285,11 @@ static void prelim_release(struct preftree *preftree) { struct prelim_ref *ref, *next_ref; - rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root, - rbnode) + rbtree_postorder_for_each_entry_safe(ref, next_ref, + &preftree->root.rb_root, rbnode) free_pref(ref); - preftree->root = RB_ROOT; + preftree->root = RB_ROOT_CACHED; preftree->count = 0; } @@ -627,7 +629,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, * freeing the entire indirect tree when we're done. In some test * cases, the tree can grow quite large (~200k objects). 
*/ - while ((rnode = rb_first(&preftrees->indirect.root))) { + while ((rnode = rb_first_cached(&preftrees->indirect.root))) { struct prelim_ref *ref; ref = rb_entry(rnode, struct prelim_ref, rbnode); @@ -637,7 +639,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, goto out; } - rb_erase(&ref->rbnode, &preftrees->indirect.root); + rb_erase_cached(&ref->rbnode, &preftrees->indirect.root); preftrees->indirect.count--; if (ref->count == 0) { @@ -717,9 +719,9 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, struct preftree *tree = &preftrees->indirect_missing_keys; struct rb_node *node; - while ((node = rb_first(&tree->root))) { + while ((node = rb_first_cached(&tree->root))) { ref = rb_entry(node, struct prelim_ref, rbnode); - rb_erase(node, &tree->root); + rb_erase_cached(node, &tree->root); BUG_ON(ref->parent); /* should not be a direct ref */ BUG_ON(ref->key_for_search.type); @@ -1229,14 +1231,14 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, if (ret) goto out; - WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root)); + WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, extent_item_pos, total_refs, sc, ignore_offset); if (ret) goto out; - WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root)); + WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root)); /* * This walks the tree of merged and resolved refs. Tree blocks are * * We release the entire tree in one go before returning. */ - node = rb_first(&preftrees.direct.root); + node = rb_first_cached(&preftrees.direct.root); while (node) { ref = rb_entry(node, struct prelim_ref, rbnode); node = rb_next(&ref->rbnode); From 9c36396c2a788facd4282a2b0646a1c4ac19847a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 Aug 2018 17:28:06 +0200 Subject: [PATCH 51/93] btrfs: tests: add separate stub for find_lock_delalloc_range The helper find_lock_delalloc_range is now conditionally built static, depending on whether the self-tests are enabled or not. There's a macro that is supposed to hide the export, used only once. To discourage further use, drop it and add a public wrapper for the helper needed by tests.
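Distilled to its shape, the change keeps the helper static and exposes it to the self-tests through a wrapper that is compiled only when the tests are configured in. The names in this sketch are illustrative placeholders, not the actual btrfs ones:

static int helper(int arg)
{
	return arg * 2;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* Public stub for the self-tests; the helper itself stays static */
int btrfs_test_helper(int arg)
{
	return helper(arg);
}
#endif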
Reviewed-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ------ fs/btrfs/extent_io.c | 13 ++++++++++++- fs/btrfs/extent_io.h | 2 +- fs/btrfs/tests/extent-io-tests.c | 10 +++++----- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 441610de9908..f4d5eeb2be20 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -41,12 +41,6 @@ extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; struct btrfs_ordered_sum; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -#define STATIC noinline -#else -#define STATIC static noinline -#endif - #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ #define BTRFS_MAX_MIRRORS 3 diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ba2a13e908a9..6877a74c7469 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1563,7 +1563,7 @@ static noinline int lock_delalloc_pages(struct inode *inode, * * 1 is returned if we find something, 0 if nothing was in the tree */ -STATIC u64 find_lock_delalloc_range(struct inode *inode, +static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end, u64 max_bytes) @@ -1643,6 +1643,17 @@ STATIC u64 find_lock_delalloc_range(struct inode *inode, return found; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +u64 btrfs_find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end, u64 max_bytes) +{ + return find_lock_delalloc_range(inode, tree, locked_page, start, end, + max_bytes); +} +#endif + static int __process_pages_contig(struct address_space *mapping, struct page *locked_page, pgoff_t start_index, pgoff_t end_index, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index f2ab42d57f02..369daa5d4f73 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -546,7 +546,7 @@ int free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -noinline u64 find_lock_delalloc_range(struct inode *inode, +u64 btrfs_find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end, u64 max_bytes); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index d9269a531a4d..9e0f4a01be14 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -106,7 +106,7 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { test_err("should have found at least one delalloc"); @@ -137,7 +137,7 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { test_err("couldn't find delalloc in our range"); @@ -171,7 +171,7 @@ static int test_find_delalloc(u32 sectorsize) } start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, 
max_bytes); if (found) { test_err("found range when we shouldn't have"); @@ -192,7 +192,7 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { test_err("didn't find our range"); @@ -233,7 +233,7 @@ static int test_find_delalloc(u32 sectorsize) * this changes at any point in the future we will need to fix this * tests expected behavior. */ - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { test_err("didn't find our range"); From 57ec5fb478a3c27073ebc8f8f47a91eaaa083c13 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 Aug 2018 17:38:12 +0200 Subject: [PATCH 52/93] btrfs: tests: move testing members of struct btrfs_root to the end The data used only for tests are better placed at the end of the structure so that they don't change the structure layout. All new members of btrfs_root should be placed before them. Reviewed-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f4d5eeb2be20..0e637cfc90d1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1202,11 +1202,6 @@ struct btrfs_root { u64 highest_objectid; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */ - u64 alloc_bytenr; -#endif - u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; @@ -1279,6 +1274,10 @@ struct btrfs_root { spinlock_t qgroup_meta_rsv_lock; u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + u64 alloc_bytenr; +#endif }; struct btrfs_file_private { From a654666a3474312ee76aa2bb9744376a46da3307 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 Aug 2018 17:44:13 +0200 Subject: [PATCH 53/93] btrfs: tests: group declarations of self-test helpers Reviewed-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0e637cfc90d1..4653c87d0ff5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3192,9 +3192,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); extern const struct dentry_operations btrfs_dentry_operations; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode); -#endif /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3707,6 +3704,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_inode_set_ops(struct inode *inode); void btrfs_test_destroy_inode(struct inode *inode); #endif From b2fa11547bc1693a8dd8e7379dd4e43d31f49c50 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 Aug 2018 17:48:13 +0200 Subject: [PATCH 54/93] btrfs: tests: polish ifdefs around testing helper Avoid the inline ifdefs and use two sections for self-tests enabled and disabled.
Though the ifdef could be dropped and the test_bit of BTRFS_FS_STATE_DUMMY_FS_INFO done unconditionally, the static inline can help to optimize out any code that would otherwise depend on the result of btrfs_is_testing(). As this is only for the testing code, drop the unlikely(). Reviewed-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4653c87d0ff5..36ade217e462 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3706,17 +3706,17 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_inode_set_ops(struct inode *inode); void btrfs_test_destroy_inode(struct inode *inode); -#endif static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, - &fs_info->fs_state))) - return 1; -#endif + return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); +} +#else +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +{ return 0; } +#endif static inline void cond_wake_up(struct wait_queue_head *wq) { From b1cdbcb53a6edd84d50b72117d49a350575bbe6a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 15 Aug 2018 10:39:54 +0300 Subject: [PATCH 55/93] btrfs: Factor out ref head locking code in __btrfs_run_delayed_refs This is in preparation to refactor the giant loop in __btrfs_run_delayed_refs. As a first step, define a new function which implements acquiring a reference to a btrfs_delayed_ref_head and use it. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 53 ++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 26fb9cbf3807..ee027a7fe7f4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2502,6 +2502,39 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, return 0; } +static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( + struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_delayed_ref_head *head = NULL; + int ret; + + spin_lock(&delayed_refs->lock); + head = btrfs_select_ref_head(trans); + if (!head) { + spin_unlock(&delayed_refs->lock); + return head; + } + + /* + * Grab the lock that says we are going to process all the refs for + * this head + */ + ret = btrfs_delayed_ref_lock(trans, head); + spin_unlock(&delayed_refs->lock); + + /* + * We may have dropped the spin lock to get the head mutex lock, and + * that might have given someone else time to free the head. If that's + * true, it has been removed from our list and we can move on. + */ + if (ret == -EAGAIN) + head = ERR_PTR(-EAGAIN); + + return head; +} + /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction.
@@ -2526,24 +2559,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (count >= nr) break; - spin_lock(&delayed_refs->lock); - locked_ref = btrfs_select_ref_head(trans); - if (!locked_ref) { - spin_unlock(&delayed_refs->lock); + locked_ref = btrfs_obtain_ref_head(trans); + if (!locked_ref) break; - } - - /* grab the lock that says we are going to process - * all the refs for this head */ - ret = btrfs_delayed_ref_lock(trans, locked_ref); - spin_unlock(&delayed_refs->lock); - /* - * we may have dropped the spin lock to get the head - * mutex lock, and that might have given someone else - * time to free the head. If that's true, it has been - * removed from our list and we can move on. - */ - if (ret == -EAGAIN) { + else if (PTR_ERR(locked_ref) == -EAGAIN) { locked_ref = NULL; count++; continue; From e726138676f896146a55a98305665e81d34d038c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 15 Aug 2018 10:39:55 +0300 Subject: [PATCH 56/93] btrfs: Factor out loop processing all refs of a head This patch introduces a new helper encompassing the implicit inner loop in __btrfs_run_delayed_refs which processes all the refs for a given head. The code is mostly copy/paste; the only difference is that if we detect a newer reference then -EAGAIN is returned so that callers can react correctly. Also, at the end of the loop the head is relocked and btrfs_merge_delayed_refs is run again to retain the pre-refactoring semantics. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 77 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ee027a7fe7f4..979702f361cc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2535,6 +2535,83 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( return head; } +static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *locked_ref, + unsigned long *run_refs) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_extent_op *extent_op; + struct btrfs_delayed_ref_node *ref; + int must_insert_reserved = 0; + int ret; + + delayed_refs = &trans->transaction->delayed_refs; + + while ((ref = select_delayed_ref(locked_ref))) { + if (ref->seq && + btrfs_check_delayed_seq(fs_info, ref->seq)) { + spin_unlock(&locked_ref->lock); + unselect_delayed_ref_head(delayed_refs, locked_ref); + return -EAGAIN; + } + + (*run_refs)++; + ref->in_tree = 0; + rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); + /* + * When we play the delayed ref, also correct the ref_mod on + * head + */ + switch (ref->action) { + case BTRFS_ADD_DELAYED_REF: + case BTRFS_ADD_DELAYED_EXTENT: + locked_ref->ref_mod -= ref->ref_mod; + break; + case BTRFS_DROP_DELAYED_REF: + locked_ref->ref_mod += ref->ref_mod; + break; + default: + WARN_ON(1); + } + atomic_dec(&delayed_refs->num_entries); + + /* + * Record the must_insert_reserved flag before we drop the + * spin lock.
+ */ + must_insert_reserved = locked_ref->must_insert_reserved; + locked_ref->must_insert_reserved = 0; + + extent_op = locked_ref->extent_op; + locked_ref->extent_op = NULL; + spin_unlock(&locked_ref->lock); + + ret = run_one_delayed_ref(trans, ref, extent_op, + must_insert_reserved); + + btrfs_free_delayed_extent_op(extent_op); + if (ret) { + unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_put_delayed_ref(ref); + btrfs_debug(fs_info, "run_one_delayed_ref returned %d", + ret); + return ret; + } + + btrfs_put_delayed_ref(ref); + cond_resched(); + + spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + } + + return 0; +} + /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. From 0110a4c43451533de1ea1bbdc57b5d452f9d8b25 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 15 Aug 2018 10:39:56 +0300 Subject: [PATCH 57/93] btrfs: refactor __btrfs_run_delayed_refs loop Refactor the delayed refs loop by using the newly introduced btrfs_run_delayed_refs_for_head function. This greatly simplifies __btrfs_run_delayed_refs and makes it more obvious what is happening. We now have one loop which iterates over the existing delayed heads, and each selected ref head is then processed by the new helper. All existing semantics of the code are preserved, so there are no functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 108 +++++++++++------------------------------ 1 file changed, 28 insertions(+), 80 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 979702f361cc..8798fa029ebe 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2548,6 +2548,9 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; + lockdep_assert_held(&locked_ref->mutex); + lockdep_assert_held(&locked_ref->lock); + while ((ref = select_delayed_ref(locked_ref))) { if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { @@ -2621,31 +2624,25 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; - struct btrfs_delayed_extent_op *extent_op; ktime_t start = ktime_get(); int ret; unsigned long count = 0; unsigned long actual_count = 0; - int must_insert_reserved = 0; delayed_refs = &trans->transaction->delayed_refs; - while (1) { + do { if (!locked_ref) { - if (count >= nr) - break; - locked_ref = btrfs_obtain_ref_head(trans); - if (!locked_ref) - break; - else if (PTR_ERR(locked_ref) == -EAGAIN) { - locked_ref = NULL; - count++; - continue; + if (IS_ERR_OR_NULL(locked_ref)) { + if (PTR_ERR(locked_ref) == -EAGAIN) { + continue; + } else { + break; + } } + count++; } - /* * We need to try and merge add/drops of the same ref since we * can run into issues with relocate dropping the implicit ref @@ -2661,23 +2658,19 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); - ref = select_delayed_ref(locked_ref); - - if (ref && ref->seq && - btrfs_check_delayed_seq(fs_info, ref->seq)) { - spin_unlock(&locked_ref->lock); - unselect_delayed_ref_head(delayed_refs, locked_ref); - locked_ref = NULL; -
cond_resched(); - count++; - continue; - } - - /* - * We're done processing refs in this ref_head, clean everything - * up and move on to the next ref_head. - */ - if (!ref) { + ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, + &actual_count); + if (ret < 0 && ret != -EAGAIN) { + /* + * Error, btrfs_run_delayed_refs_for_head already + * unlocked everything so just bail out + */ + return ret; + } else if (!ret) { + /* + * Success, perform the usual cleanup of a processed + * head + */ ret = cleanup_ref_head(trans, locked_ref); if (ret > 0 ) { /* We dropped our lock, we need to loop. */ @@ -2686,61 +2679,16 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } else if (ret) { return ret; } - locked_ref = NULL; - count++; - continue; } - actual_count++; - ref->in_tree = 0; - rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); - RB_CLEAR_NODE(&ref->ref_node); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); - /* - * When we play the delayed ref, also correct the ref_mod on - * head - */ - switch (ref->action) { - case BTRFS_ADD_DELAYED_REF: - case BTRFS_ADD_DELAYED_EXTENT: - locked_ref->ref_mod -= ref->ref_mod; - break; - case BTRFS_DROP_DELAYED_REF: - locked_ref->ref_mod += ref->ref_mod; - break; - default: - WARN_ON(1); - } - atomic_dec(&delayed_refs->num_entries); - /* - * Record the must-insert_reserved flag before we drop the spin - * lock. + * Either success case or btrfs_run_delayed_refs_for_head + * returned -EAGAIN, meaning we need to select another head */ - must_insert_reserved = locked_ref->must_insert_reserved; - locked_ref->must_insert_reserved = 0; - extent_op = locked_ref->extent_op; - locked_ref->extent_op = NULL; - spin_unlock(&locked_ref->lock); - - ret = run_one_delayed_ref(trans, ref, extent_op, - must_insert_reserved); - - btrfs_free_delayed_extent_op(extent_op); - if (ret) { - unselect_delayed_ref_head(delayed_refs, locked_ref); - btrfs_put_delayed_ref(ref); - btrfs_debug(fs_info, "run_one_delayed_ref returned %d", - ret); - return ret; - } - - btrfs_put_delayed_ref(ref); - count++; + locked_ref = NULL; cond_resched(); - } + } while ((nr != -1 && count < nr) || locked_ref); /* * We don't want to include ref heads since we can have empty ref heads From 818255feece6e2a432328020d78c8a81a153ce65 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 21 Sep 2018 14:26:34 +0200 Subject: [PATCH 58/93] btrfs: use common helper instead of open coding a bit test The helper does the same math, and we take care of the special case when flags is 0 too.
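A short worked example of the bit test being replaced may help; this is a sketch of the equivalence, not code from the patch. is_power_of_2() (from include/linux/log2.h) computes n != 0 && (n & (n - 1)) == 0, and clearing the lowest set bit with n & (n - 1) yields zero exactly when at most one bit was set:

/*
 *   flags     = 0b01000000        flags     = 0b01000100
 *   flags - 1 = 0b00111111        flags - 1 = 0b01000011
 *   AND       = 0 (one bit set)   AND       = 0b01000000 (two bits set)
 *
 * The open-coded test is also true for flags == 0, while
 * is_power_of_2(0) is false; alloc_profile_is_valid() returns early
 * for flags == 0, so the behaviour does not change.
 */
static inline bool exactly_one_bit_set(u64 flags)
{
	return flags != 0 && (flags & (flags - 1)) == 0;
}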
Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 909c578506ee..26eb388db343 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3691,7 +3691,7 @@ static int alloc_profile_is_valid(u64 flags, int extended) return !extended; /* "0" is valid for usual profiles */ /* true if exactly one bit set */ - return (flags & (flags - 1)) == 0; + return is_power_of_2(flags); } static inline int balance_need_close(struct btrfs_fs_info *fs_info) From 7703bdd8d23e6ef057af3253958a793ec6066b28 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 20 Jun 2018 07:56:11 -0700 Subject: [PATCH 59/93] Btrfs: don't clean dirty pages during buffered writes During buffered writes, we follow this basic series of steps:

again:
	lock all the pages
	wait for writeback on all the pages
	Take the extent range lock
	wait for ordered extents on the whole range
	clean all the pages

	if (copy_from_user_in_atomic() hits a fault) {
		drop our locks
		goto again;
	}

	dirty all the pages
	release all the locks

The extra waiting, cleaning and locking are there to make sure we don't modify pages in flight to the drive, after they've been crc'd. If some of the pages in the range were already dirty when the write began, and we need to goto again, we create a window where a dirty page has been cleaned and unlocked. It may be reclaimed before we're able to lock it again, which means we'll read the old contents off the drive and lose any modifications that had been pending writeback. We don't actually need to clean the pages. All of the other locking in place makes sure we don't start IO on the pages, so we can just leave them dirty for the duration of the write. Fixes: 73d59314e6ed (the original btrfs merge) CC: stable@vger.kernel.org # v4.4+ Signed-off-by: Chris Mason Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d254cf94545f..15b925142793 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -531,6 +531,14 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, end_of_last_block = start_pos + num_bytes - 1; + /* + * The pages may have already been dirty, clear out old accounting so + * we can set things up properly + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached); + if (!btrfs_is_free_space_inode(BTRFS_I(inode))) { if (start_pos >= isize && !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) { @@ -1500,18 +1508,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } if (ordered) btrfs_put_ordered_extent(ordered); - clear_extent_bit(&inode->io_tree, start_pos, last_pos, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, cached_state); + *lockstart = start_pos; *lockend = last_pos; ret = 1; } + /* + * It's possible the pages are dirty right now, but we don't want + * to clean them yet because copy_from_user may catch a page fault + * and we might have to fall back to one page at a time. If that + * happens, we'll unlock these pages and we'd have a window where + * reclaim could sneak in and drop the once-dirty page on the floor + * without writing it.
+ * + * We have the pages locked and the extent range locked, so there's + * no way someone can start IO on any dirty pages in this range. + * + * We'll call btrfs_dirty_pages() later on, and that will flip around + * delalloc bits and dirty the pages as required. + */ for (i = 0; i < num_pages; i++) { - if (clear_page_dirty_for_io(pages[i])) - account_page_redirty(pages[i]); set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } From 4779cc04248deff676c56ff9f3b2c679388a7d5e Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 24 Sep 2018 15:16:55 -0700 Subject: [PATCH 60/93] Btrfs: get rid of btrfs_symlink_aops The only aops we define for symlinks are identical to the aops for regular files. This has been the case since symlink support was added in commit 2b8d99a723a3 ("Btrfs: symlinks and hard links"). As far as I can tell, there wasn't a good reason to have separate aops then, and there isn't now, so let's just do what most other filesystems do and reuse the same structure. Signed-off-by: Omar Sandoval Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dd2140d3e354..ae74c740f3e1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -64,7 +64,6 @@ static const struct inode_operations btrfs_dir_ro_inode_operations; static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; -static const struct address_space_operations btrfs_symlink_aops; static const struct file_operations btrfs_dir_file_operations; static const struct extent_io_ops btrfs_extent_io_ops; @@ -3733,7 +3732,7 @@ static int btrfs_read_locked_inode(struct inode *inode) case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->a_ops = &btrfs_aops; break; default: inode->i_op = &btrfs_special_inode_operations; @@ -10174,7 +10173,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->a_ops = &btrfs_aops; inode_set_bytes(inode, name_len); btrfs_i_size_write(BTRFS_I(inode), name_len); err = btrfs_update_inode(trans, root, inode); @@ -10550,13 +10549,6 @@ static const struct address_space_operations btrfs_aops = { .error_remove_page = generic_error_remove_page, }; -static const struct address_space_operations btrfs_symlink_aops = { - .readpage = btrfs_readpage, - .writepage = btrfs_writepage, - .invalidatepage = btrfs_invalidatepage, - .releasepage = btrfs_releasepage, -}; - static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, From fa6ac71524f206c33accfc12294fd35e17478fe0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 25 Sep 2018 14:37:46 +0800 Subject: [PATCH 61/93] btrfs: relocation: Add basic extent backref related comments for build_backref_tree fs/btrfs/relocation.c:build_backref_tree() is code from the 2009 era; although it works fine, it's not that easy to understand, especially combined with the complex btrfs backref format. This patch adds some basic comments to the backref build part of the code, making it easier to read, at least for the backref searching part.
Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/relocation.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 87afd3c395bb..a5c5e9b3aceb 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -648,8 +648,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
 					int level, u64 bytenr)
 {
 	struct backref_cache *cache = &rc->backref_cache;
-	struct btrfs_path *path1;
-	struct btrfs_path *path2;
+	struct btrfs_path *path1; /* For searching extent root */
+	struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */
 	struct extent_buffer *eb;
 	struct btrfs_root *root;
 	struct backref_node *cur;
@@ -662,7 +662,7 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
 	struct btrfs_key key;
 	unsigned long end;
 	unsigned long ptr;
-	LIST_HEAD(list);
+	LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */
 	LIST_HEAD(useless);
 	int cowonly;
 	int ret;
@@ -778,6 +778,10 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
 			       key.type != BTRFS_SHARED_BLOCK_REF_KEY);
 		}
 
+		/*
+		 * Parent node found and matches current inline ref, no need to
+		 * rebuild this node for this inline ref.
+		 */
 		if (exist &&
 		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
 		      exist->owner == key.offset) ||
@@ -787,11 +791,12 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
 			goto next;
 		}
 
+		/* SHARED_BLOCK_REF means key.offset is the parent bytenr */
 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
 			if (key.objectid == key.offset) {
 				/*
-				 * only root blocks of reloc trees use
-				 * backref of this type.
+				 * Only root blocks of reloc trees use backref
+				 * pointing to itself.
 				 */
 				root = find_reloc_root(rc, cur->bytenr);
 				ASSERT(root);
@@ -840,7 +845,11 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
 			goto next;
 		}
 
-		/* key.type == BTRFS_TREE_BLOCK_REF_KEY */
+		/*
+		 * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
+		 * means the root objectid. We need to search the tree to get
+		 * its parent bytenr.
+		 */
 		root = read_fs_root(rc->extent_root->fs_info, key.offset);
 		if (IS_ERR(root)) {
 			err = PTR_ERR(root);
@@ -863,10 +872,7 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
 
 		level = cur->level + 1;
 
-		/*
-		 * searching the tree to find upper level blocks
-		 * reference the block.
-		 */
+		/* Search the tree to find parent blocks referring to the block. */
 		path2->search_commit_root = 1;
 		path2->skip_locking = 1;
 		path2->lowest_level = level;
@@ -893,6 +899,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
 		}
 		lower = cur;
 		need_check = true;
+
+		/* Add all nodes and edges in the path */
 		for (; level < BTRFS_MAX_LEVEL; level++) {
 			if (!path2->nodes[level]) {
 				ASSERT(btrfs_root_bytenr(&root->root_item) ==

From c337e7b02f71c4b2f6f2138807a284d2c4e1ac5e Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 27 Sep 2018 14:42:29 +0800
Subject: [PATCH 62/93] btrfs: qgroup: Introduce trace event to analyse the number of dirty extents accounted

The number of qgroup dirty extents is directly linked to the performance
overhead, so add a new trace event, trace_qgroup_num_dirty_extents(), to
record how many dirty extents are processed in
btrfs_qgroup_account_extents().

This will be pretty handy for analyzing later balance performance
improvements.
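As a usage note (not part of the patch itself): once merged, the event
should be consumable through tracefs like any other btrfs tracepoint,
along these lines, assuming the usual debugfs mount point; the event
name and the transid/num_dirty_extents fields come from the TRACE_EVENT
definition below, the exact record prefix will vary:

	# cd /sys/kernel/debug/tracing
	# echo 1 > events/btrfs/qgroup_num_dirty_extents/enable
	# cat trace_pipe
	... qgroup_num_dirty_extents: <fsid> transid=1234 num_dirty_extents=42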
Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/qgroup.c            |  4 ++++
 include/trace/events/btrfs.h | 21 +++++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index bdd8c0da6e32..8a03adc11f53 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2132,6 +2132,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct ulist *new_roots = NULL;
 	struct rb_node *node;
+	u64 num_dirty_extents = 0;
 	u64 qgroup_to_skip;
 	int ret = 0;
 
@@ -2141,6 +2142,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
 		record = rb_entry(node, struct btrfs_qgroup_extent_record,
 				  node);
 
+		num_dirty_extents++;
 		trace_btrfs_qgroup_account_extents(fs_info, record);
 
 		if (!ret) {
@@ -2186,6 +2188,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
 		kfree(record);
 
 	}
+	trace_qgroup_num_dirty_extents(fs_info, trans->transid,
+				       num_dirty_extents);
 	return ret;
 }

diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index abe3ff774f58..8568946f491d 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1576,6 +1576,27 @@ DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent,
 	TP_ARGS(fs_info, rec)
 );
 
+TRACE_EVENT(qgroup_num_dirty_extents,
+
+	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 transid,
+		 u64 num_dirty_extents),
+
+	TP_ARGS(fs_info, transid, num_dirty_extents),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64, transid			)
+		__field(	u64, num_dirty_extents		)
+	),
+
+	TP_fast_assign_btrfs(fs_info,
+		__entry->transid	   = transid;
+		__entry->num_dirty_extents = num_dirty_extents;
+	),
+
+	TP_printk_btrfs("transid=%llu num_dirty_extents=%llu",
+		  __entry->transid, __entry->num_dirty_extents)
+);
+
 TRACE_EVENT(btrfs_qgroup_account_extent,
 
 	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 transid, u64 bytenr,

From 25982561db7f43f29305704f9f24ff36ea7d5671 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 27 Sep 2018 14:42:30 +0800
Subject: [PATCH 63/93] btrfs: qgroup: Introduce function to trace two swapped extents

Introduce a new function, qgroup_trace_extent_swap(), which will be used
later for balance qgroup speedup.

The basic idea of balance is swapping tree blocks between reloc tree and
the real file tree.

The swap will happen in highest tree block, but there may be a lot of
tree blocks involved.

For example:

 OO = Old tree blocks
 NN = New tree blocks allocated during balance

        File tree (257)                  Reloc tree for 257
 L2              OO                              NN
               /    \                          /    \
 L1          OO      OO (a)                  OO      NN (a)
            / \     / \                     / \     / \
 L0       OO   OO OO   OO                 OO   OO NN   NN
                  (b)  (c)                        (b)  (c)

When calling qgroup_trace_extent_swap(), we will pass:
@src_eb = OO(a)
@dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
@dst_level = 0
@root_level = 1

In that case, qgroup_trace_extent_swap() will search from OO(a) to
reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.

The main work of qgroup_trace_extent_swap() can be split into 3 parts:

1) Tree search from @src_eb
   It should act as a simplified btrfs_search_slot().
   The key for search can be extracted from @dst_path->nodes[dst_level]
   (first key).

2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
   NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
   They should be marked during previous (@dst_level = 1) iteration.

3) Mark file extents in leaves dirty
   We don't have a good way to pick out new file extents only.
   So we still follow the old method by scanning all file extents in
   the leaf.
This function can free us from keeping two paths, thus later we only
need to care about how to iterate all new tree blocks in reloc tree.

Signed-off-by: Qu Wenruo
[ copy changelog to function comment ]
Signed-off-by: David Sterba
---
 fs/btrfs/qgroup.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 162 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 8a03adc11f53..7086353153f3 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1712,6 +1712,168 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
 	return 0;
 }
 
+/*
+ * Helper function to trace a subtree tree block swap.
+ *
+ * The swap will happen in highest tree block, but there may be a lot of
+ * tree blocks involved.
+ *
+ * For example:
+ *  OO = Old tree blocks
+ *  NN = New tree blocks allocated during balance
+ *
+ *        File tree (257)                  Reloc tree for 257
+ * L2              OO                              NN
+ *               /    \                          /    \
+ * L1          OO      OO (a)                  OO      NN (a)
+ *            / \     / \                     / \     / \
+ * L0       OO   OO OO   OO                 OO   OO NN   NN
+ *                  (b)  (c)                        (b)  (c)
+ *
+ * When calling qgroup_trace_extent_swap(), we will pass:
+ * @src_eb = OO(a)
+ * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
+ * @dst_level = 0
+ * @root_level = 1
+ *
+ * In that case, qgroup_trace_extent_swap() will search from OO(a) to
+ * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
+ *
+ * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
+ *
+ * 1) Tree search from @src_eb
+ *    It should act as a simplified btrfs_search_slot().
+ *    The key for search can be extracted from @dst_path->nodes[dst_level]
+ *    (first key).
+ *
+ * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
+ *    NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
+ *    They should be marked during previous (@dst_level = 1) iteration.
+ *
+ * 3) Mark file extents in leaves dirty
+ *    We don't have a good way to pick out new file extents only.
+ *    So we still follow the old method by scanning all file extents in
+ *    the leaf.
+ *
+ * This function can free us from keeping two paths, thus later we only need
+ * to care about how to iterate all new tree blocks in reloc tree.
+ */
+static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
+				    struct extent_buffer *src_eb,
+				    struct btrfs_path *dst_path,
+				    int dst_level, int root_level)
+{
+	struct btrfs_key key;
+	struct btrfs_path *src_path;
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	u32 nodesize = fs_info->nodesize;
+	int cur_level = root_level;
+	int ret;
+
+	BUG_ON(dst_level > root_level);
+	/* Level mismatch */
+	if (btrfs_header_level(src_eb) != root_level)
+		return -EINVAL;
+
+	src_path = btrfs_alloc_path();
+	if (!src_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (dst_level)
+		btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+	else
+		btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+
+	/* For src_path */
+	extent_buffer_get(src_eb);
+	src_path->nodes[root_level] = src_eb;
+	src_path->slots[root_level] = dst_path->slots[root_level];
+	src_path->locks[root_level] = 0;
+
+	/* A simplified version of btrfs_search_slot() */
+	while (cur_level >= dst_level) {
+		struct btrfs_key src_key;
+		struct btrfs_key dst_key;
+
+		if (src_path->nodes[cur_level] == NULL) {
+			struct btrfs_key first_key;
+			struct extent_buffer *eb;
+			int parent_slot;
+			u64 child_gen;
+			u64 child_bytenr;
+
+			eb = src_path->nodes[cur_level + 1];
+			parent_slot = src_path->slots[cur_level + 1];
+			child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+			child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+			btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+			eb = read_tree_block(fs_info, child_bytenr, child_gen,
+					     cur_level, &first_key);
+			if (IS_ERR(eb)) {
+				ret = PTR_ERR(eb);
+				goto out;
+			} else if (!extent_buffer_uptodate(eb)) {
+				free_extent_buffer(eb);
+				ret = -EIO;
+				goto out;
+			}
+
+			src_path->nodes[cur_level] = eb;
+
+			btrfs_tree_read_lock(eb);
+			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+			src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+		}
+
+		src_path->slots[cur_level] = dst_path->slots[cur_level];
+		if (cur_level) {
+			btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
+					&dst_key, dst_path->slots[cur_level]);
+			btrfs_node_key_to_cpu(src_path->nodes[cur_level],
+					&src_key, src_path->slots[cur_level]);
+		} else {
+			btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
+					&dst_key, dst_path->slots[cur_level]);
+			btrfs_item_key_to_cpu(src_path->nodes[cur_level],
+					&src_key, src_path->slots[cur_level]);
+		}
+		/* Content mismatch, something went wrong */
+		if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
+			ret = -ENOENT;
+			goto out;
+		}
+		cur_level--;
+	}
+
+	/*
+	 * Now both @dst_path and @src_path have been populated, record the tree
+	 * blocks for qgroup accounting.
+	 */
+	ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
+					nodesize, GFP_NOFS);
+	if (ret < 0)
+		goto out;
+	ret = btrfs_qgroup_trace_extent(trans,
+					dst_path->nodes[dst_level]->start,
+					nodesize, GFP_NOFS);
+	if (ret < 0)
+		goto out;
+
+	/* Record leaf file extents */
+	if (dst_level == 0) {
+		ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
+		if (ret < 0)
+			goto out;
+		ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
+	}
+out:
+	btrfs_free_path(src_path);
+	return ret;
+}
+
 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 			       struct extent_buffer *root_eb,
 			       u64 root_gen, int root_level)

From ea49f3e73c4b7252c1569906c1b2cd54605af3c9 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 27 Sep 2018 14:42:31 +0800
Subject: [PATCH 64/93] btrfs: qgroup: Introduce function to find all new tree blocks of reloc tree

Introduce a new function, qgroup_trace_new_subtree_blocks(), to iterate
all new tree blocks in a reloc tree, so that qgroup can skip unrelated
tree blocks during balance, which should hugely speed up balance when
quota is enabled.

The function qgroup_trace_new_subtree_blocks() itself only cares about
new tree blocks in a reloc tree. Its main tasks are:

1) Read out tree blocks according to parent pointers

2) Do recursive depth-first search
   Will call the same function on all its children tree blocks, with
   search level set to current level - 1.
   And will also skip all children whose generation is smaller than
   @last_snapshot.

3) Call qgroup_trace_extent_swap() to trace tree blocks

So although we have a parameter list related to the source file tree,
it's not used at all, but only passed to qgroup_trace_extent_swap().
Thus despite the tree read code, the core should be pretty short and
all about recursive depth-first search.

Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/qgroup.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 7086353153f3..0b49575698da 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1874,6 +1874,141 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
 	return ret;
 }
 
+/*
+ * Helper function to do recursive generation-aware depth-first search, to
+ * locate all new tree blocks in a subtree of reloc tree.
+ *
+ * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
+ *         reloc tree
+ * L2         NN (a)
+ *           /    \
+ * L1    OO        NN (b)
+ *      /  \      /  \
+ * L0  OO  OO    OO  NN
+ *               (c)  (d)
+ * If we pass:
+ * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
+ * @cur_level = 1
+ * @root_level = 1
+ *
+ * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace
+ * the above tree blocks along with their counterparts in the file tree.
+ * During the search, old tree block OO(c) will be skipped as tree block swap
+ * won't affect OO(c).
+ */
+static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
+					   struct extent_buffer *src_eb,
+					   struct btrfs_path *dst_path,
+					   int cur_level, int root_level,
+					   u64 last_snapshot)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct extent_buffer *eb;
+	bool need_cleanup = false;
+	int ret = 0;
+	int i;
+
+	/* Level sanity check */
+	if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL ||
+	    root_level < 0 || root_level >= BTRFS_MAX_LEVEL ||
+	    root_level < cur_level) {
+		btrfs_err_rl(fs_info,
+			"%s: bad levels, cur_level=%d root_level=%d",
+			__func__, cur_level, root_level);
+		return -EUCLEAN;
+	}
+
+	/* Read the tree block if needed */
+	if (dst_path->nodes[cur_level] == NULL) {
+		struct btrfs_key first_key;
+		int parent_slot;
+		u64 child_gen;
+		u64 child_bytenr;
+
+		/*
+		 * dst_path->nodes[root_level] must be initialized before
+		 * calling this function.
+		 */
+		if (cur_level == root_level) {
+			btrfs_err_rl(fs_info,
+	"%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
+			     __func__, root_level, root_level, cur_level);
+			return -EUCLEAN;
+		}
+
+		/*
+		 * We need to get child blockptr/gen from parent before we can
+		 * read it.
+		 */
+		eb = dst_path->nodes[cur_level + 1];
+		parent_slot = dst_path->slots[cur_level + 1];
+		child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+		child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+		btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+		/* This node is old, no need to trace */
+		if (child_gen < last_snapshot)
+			goto out;
+
+		eb = read_tree_block(fs_info, child_bytenr, child_gen,
+				     cur_level, &first_key);
+		if (IS_ERR(eb)) {
+			ret = PTR_ERR(eb);
+			goto out;
+		} else if (!extent_buffer_uptodate(eb)) {
+			free_extent_buffer(eb);
+			ret = -EIO;
+			goto out;
+		}
+
+		dst_path->nodes[cur_level] = eb;
+		dst_path->slots[cur_level] = 0;
+
+		btrfs_tree_read_lock(eb);
+		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+		need_cleanup = true;
+	}
+
+	/* Now record this tree block and its counter part for qgroups */
+	ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
+				       root_level);
+	if (ret < 0)
+		goto cleanup;
+
+	eb = dst_path->nodes[cur_level];
+
+	if (cur_level > 0) {
+		/* Iterate all child tree blocks */
+		for (i = 0; i < btrfs_header_nritems(eb); i++) {
+			/* Skip old tree blocks as they won't be swapped */
+			if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
+				continue;
+			dst_path->slots[cur_level] = i;
+
+			/* Recursive call (at most 7 times) */
+			ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
+					dst_path, cur_level - 1, root_level,
+					last_snapshot);
+			if (ret < 0)
+				goto cleanup;
+		}
+	}
+
+cleanup:
+	if (need_cleanup) {
+		/* Clean up */
+		btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
+				     dst_path->locks[cur_level]);
+		free_extent_buffer(dst_path->nodes[cur_level]);
+		dst_path->nodes[cur_level] = NULL;
+		dst_path->slots[cur_level] = 0;
+		dst_path->locks[cur_level] = 0;
+	}
+out:
+	return ret;
+}
+
 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 			       struct extent_buffer *root_eb,
 			       u64 root_gen, int root_level)

From 5f527822be40104e9056c981ff06c7750153a10a Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 27 Sep 2018 14:42:32 +0800
Subject: [PATCH 65/93] btrfs: qgroup: Use generation-aware subtree swap to mark dirty extents

Before this patch, with quota enabled during balance, we need to mark
the whole subtree dirty for quota.

E.g.
OO = Old tree blocks (from file tree)
NN = New tree blocks (from reloc tree)

          File tree (src)            Reloc tree (dst)
              OO (a)                     NN (a)
             /    \                     /    \
       (b) OO      OO (c)         (b) NN      NN (c)
          /  \    /  \               /  \    /  \
        OO   OO  OO   OO (d)       OO   OO  OO   NN (d)

For the old balance + quota case, quota will mark the whole src and dst
tree dirty, including all the 3 old tree blocks in the reloc tree.

It's doable for a small file tree, or when the new tree blocks are all
located at a lower level. But for a large file tree, or when the new
tree blocks are all located at a higher level, this will mark the whole
tree dirty, and be unbelievably slow.

This patch changes how we handle such balance with quota enabled. Now
we will search from (b) and (c) for any new tree blocks whose generation
is equal to @last_snapshot, and only mark them dirty.

In the above case, we only need to trace tree blocks NN(b), NN(c) and
NN(d). (NN(a) will be traced when COW happens for nodeptr modification.)
And also for tree blocks OO(b), OO(c), OO(d). (OO(a) will be traced when
COW happens for nodeptr modification.)

For the above case, we could skip 3 tree blocks, but for a larger tree,
we can skip tons of unmodified tree blocks, and hugely speed up balance.

This patch introduces a new function, btrfs_qgroup_trace_subtree_swap(),
which will do the following main work:

1) Read out real root eb
   And setup basic dst_path for later calls

2) Call qgroup_trace_new_subtree_blocks()
   To trace all new tree blocks in the reloc tree and their counterparts
   in the file tree.

Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/qgroup.c     | 104 ++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/qgroup.h     |   5 ++
 fs/btrfs/relocation.c |  11 ++---
 3 files changed, 112 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 0b49575698da..6b35b3481085 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2009,6 +2009,110 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
 	return ret;
 }
 
+/*
+ * Inform qgroup to trace subtree swap used in balance.
+ *
+ * Unlike btrfs_qgroup_trace_subtree(), this function will only trace
+ * new tree blocks whose generation is equal to (or larger than) @last_snapshot.
+ *
+ * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
+ * @dst_slot), and find any tree blocks whose generation is at least
+ * @last_snapshot, and then go down @src_eb (pointed by @src_parent and
+ * @src_slot) to find the counterpart of the tree block, then mark both tree
+ * blocks as qgroup dirty, and skip all tree blocks whose generation is smaller
+ * than last_snapshot.
+ *
+ * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
+ * which could be the cause of very slow balance if the file tree is large.
+ *
+ * @src_parent, @src_slot: pointer to src (file tree) eb.
+ * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
+ */
+int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+		struct extent_buffer *src_parent, int src_slot,
+		struct extent_buffer *dst_parent, int dst_slot,
+		u64 last_snapshot)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_path *dst_path = NULL;
+	struct btrfs_key first_key;
+	struct extent_buffer *src_eb = NULL;
+	struct extent_buffer *dst_eb = NULL;
+	u64 child_gen;
+	u64 child_bytenr;
+	int level;
+	int ret;
+
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+		return 0;
+
+	/* Check parameter order */
+	if (btrfs_node_ptr_generation(src_parent, src_slot) >
+	    btrfs_node_ptr_generation(dst_parent, dst_slot)) {
+		btrfs_err_rl(fs_info,
+		"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
+			btrfs_node_ptr_generation(src_parent, src_slot),
+			btrfs_node_ptr_generation(dst_parent, dst_slot));
+		return -EUCLEAN;
+	}
+
+	/* Read out real @src_eb, pointed by @src_parent and @src_slot */
+	child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
+	child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
+	btrfs_node_key_to_cpu(src_parent, &first_key, src_slot);
+
+	src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
+			btrfs_header_level(src_parent) - 1, &first_key);
+	if (IS_ERR(src_eb)) {
+		ret = PTR_ERR(src_eb);
+		goto out;
+	}
+
+	/* Read out real @dst_eb, pointed by @dst_parent and @dst_slot */
+	child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
+	child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
+	btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot);
+
+	dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
+			btrfs_header_level(dst_parent) - 1, &first_key);
+	if (IS_ERR(dst_eb)) {
+		ret = PTR_ERR(dst_eb);
+		goto out;
+	}
+
+	if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	level = btrfs_header_level(dst_eb);
+	dst_path = btrfs_alloc_path();
+	if (!dst_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* For dst_path */
+	extent_buffer_get(dst_eb);
+	dst_path->nodes[level] = dst_eb;
+	dst_path->slots[level] = 0;
+	dst_path->locks[level] = 0;
+
+	/* Do the generation-aware depth-first search */
+	ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
+					      level, last_snapshot);
+	if (ret < 0)
+		goto out;
+	ret = 0;
+
+out:
+	free_extent_buffer(src_eb);
+	free_extent_buffer(dst_eb);
+	btrfs_free_path(dst_path);
+	if (ret < 0)
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+	return ret;
+}
+
 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 			       struct extent_buffer *root_eb,
 			       u64 root_gen, int root_level)
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 54b8bb282c0e..1aaf4c276900 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -236,6 +236,11 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 			       struct extent_buffer *root_eb,
 			       u64 root_gen, int root_level);
+
+int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+		struct extent_buffer *src_parent, int src_slot,
+		struct extent_buffer *dst_parent, int dst_slot,
+		u64 last_snapshot);
 int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
 				u64 num_bytes, struct ulist *old_roots,
 				struct ulist *new_roots);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index a5c5e9b3aceb..d10357122aa1 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1888,14 +1888,9 @@ int replace_path(struct btrfs_trans_handle *trans,
 		 * and tree block numbers, if current trans doesn't free
 		 * data reloc tree inode.
		 */
-		ret = btrfs_qgroup_trace_subtree(trans, parent,
-				btrfs_header_generation(parent),
-				btrfs_header_level(parent));
-		if (ret < 0)
-			break;
-		ret = btrfs_qgroup_trace_subtree(trans, path->nodes[level],
-				btrfs_header_generation(path->nodes[level]),
-				btrfs_header_level(path->nodes[level]));
+		ret = btrfs_qgroup_trace_subtree_swap(trans, parent, slot,
+				path->nodes[level], path->slots[level],
+				last_snapshot);
 		if (ret < 0)
 			break;

From 2cd86d309bd1203241ceb833effe90787f3564a1 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 27 Sep 2018 14:42:33 +0800
Subject: [PATCH 66/93] btrfs: qgroup: Don't trace subtree if we're dropping reloc tree

Reloc tree doesn't contribute to qgroup numbers, as we have accounted
them at balance time (see replace_path()).

Skipping the unneeded subtree tracing should reduce the overhead.

[[Benchmark]]
Hardware:
	VM 4G vRAM, 8 vCPUs, disk is using 'unsafe' cache mode,
	backing device is SAMSUNG 850 evo SSD.
	Host has 16G ram.

Mkfs parameter:
	--nodesize 4K (To bump up tree size)

Initial subvolume contents:
	4G data copied from /usr and /lib.
	(With enough regular small files)

Snapshots:
	16 snapshots of the original subvolume.
	Each snapshot has 3 random files modified.

balance parameter:
	-m

So the content should be pretty similar to a real world root fs layout.

                     | v4.19-rc1 | w/ patchset | diff (*)
---------------------------------------------------------------
relocated extents    |     22929 |       22900 |  -0.1%
qgroup dirty extents |    227757 |      167139 | -26.6%
time (sys)           |   65.253s |     50.123s | -23.2%
time (real)          |   74.032s |     52.551s | -29.0%

Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/extent-tree.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8798fa029ebe..d68ee91c00fc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8683,7 +8683,13 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 			parent = 0;
 		}
 
-		if (need_account) {
+		/*
+		 * Reloc tree doesn't contribute to qgroup numbers, and we have
+		 * already accounted them at merge time (replace_path),
+		 * thus we could skip expensive subtree trace here.
+		 */
+		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+		    need_account) {
 			ret = btrfs_qgroup_trace_subtree(trans, next,
 							 generation, level - 1);
 			if (ret) {

From 3d0174f78e72301324a5b0ba7d67676474e36fff Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 27 Sep 2018 14:42:35 +0800
Subject: [PATCH 67/93] btrfs: qgroup: Only trace data extents in leaves if we're relocating data block group

For qgroup_trace_extent_swap(), if we find one leaf that needs to be
traced, we will also iterate all file extents and trace them.

This is OK if we're relocating data block groups, but if we're
relocating metadata block groups, balance code itself has ensured that
both subtree of file tree and reloc tree contain the same contents.

That's to say, if we're relocating metadata block groups, all file
extents in reloc and file tree should match, thus no need to trace them.
This should reduce the total number of dirty extents processed in
metadata block group balance.

[[Benchmark]] (with all previous enhancement)
Hardware:
	VM 4G vRAM, 8 vCPUs, disk is using 'unsafe' cache mode,
	backing device is SAMSUNG 850 evo SSD.
	Host has 16G ram.

Mkfs parameter:
	--nodesize 4K (To bump up tree size)

Initial subvolume contents:
	4G data copied from /usr and /lib.
	(With enough regular small files)

Snapshots:
	16 snapshots of the original subvolume.
	Each snapshot has 3 random files modified.

balance parameter:
	-m

So the content should be pretty similar to a real world root fs layout.

                     | v4.19-rc1 | w/ patchset | diff (*)
---------------------------------------------------------------
relocated extents    |     22929 |       22851 |  -0.3%
qgroup dirty extents |    227757 |      140886 | -38.1%
time (sys)           |   65.253s |     37.464s | -42.6%
time (real)          |   74.032s |     44.722s | -39.6%

Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/qgroup.c     | 21 +++++++++++++++------
 fs/btrfs/qgroup.h     |  1 +
 fs/btrfs/relocation.c | 10 +++++-----
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 6b35b3481085..ac9690f36a94 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1761,7 +1761,8 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
 static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
 				    struct extent_buffer *src_eb,
 				    struct btrfs_path *dst_path,
-				    int dst_level, int root_level)
+				    int dst_level, int root_level,
+				    bool trace_leaf)
 {
 	struct btrfs_key key;
 	struct btrfs_path *src_path;
@@ -1863,7 +1864,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
 		goto out;
 
 	/* Record leaf file extents */
-	if (dst_level == 0) {
+	if (dst_level == 0 && trace_leaf) {
 		ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
 		if (ret < 0)
@@ -1900,7 +1901,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
 					   struct extent_buffer *src_eb,
 					   struct btrfs_path *dst_path,
 					   int cur_level, int root_level,
-					   u64 last_snapshot)
+					   u64 last_snapshot, bool trace_leaf)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct extent_buffer *eb;
@@ -1972,7 +1973,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
 
 	/* Now record this tree block and its counter part for qgroups */
 	ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
-				       root_level);
+				       root_level, trace_leaf);
 	if (ret < 0)
 		goto cleanup;
 
@@ -1989,7 +1990,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
 			/* Recursive call (at most 7 times) */
 			ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
 					dst_path, cur_level - 1, root_level,
-					last_snapshot);
+					last_snapshot, trace_leaf);
 			if (ret < 0)
 				goto cleanup;
 
@@ -2028,6 +2029,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
  * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
  */
 int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+		struct btrfs_block_group_cache *bg_cache,
 		struct extent_buffer *src_parent, int src_slot,
 		struct extent_buffer *dst_parent, int dst_slot,
 		u64 last_snapshot)
@@ -2037,6 +2039,7 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
 	struct btrfs_key first_key;
 	struct extent_buffer *src_eb = NULL;
 	struct extent_buffer *dst_eb = NULL;
+	bool trace_leaf = false;
 	u64 child_gen;
 	u64 child_bytenr;
 	int level;
@@ -2055,6 +2058,12 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
 		return -EUCLEAN;
 	}
 
+	/*
+	 * Only trace leaf if we're relocating data block groups, this could
+	 * reduce tons of data extents tracing for meta/sys bg relocation.
+	 */
+	if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
+		trace_leaf = true;
 
 	/* Read out real @src_eb, pointed by @src_parent and @src_slot */
 	child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
 	child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
@@ -2099,7 +2108,7 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
 
 	/* Do the generation-aware depth-first search */
 	ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
-					      level, last_snapshot);
+					      level, last_snapshot, trace_leaf);
 	if (ret < 0)
 		goto out;
 	ret = 0;
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 1aaf4c276900..80ebeb3ab5ba 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -238,6 +238,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 			       u64 root_gen, int root_level);
 
 int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+		struct btrfs_block_group_cache *bg_cache,
 		struct extent_buffer *src_parent, int src_slot,
 		struct extent_buffer *dst_parent, int dst_slot,
 		u64 last_snapshot);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d10357122aa1..3e6e3d93caad 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1744,7 +1744,7 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
  * errors, a negative error number is returned.
  */
 static noinline_for_stack
-int replace_path(struct btrfs_trans_handle *trans,
+int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
 		 struct btrfs_root *dest, struct btrfs_root *src,
 		 struct btrfs_path *path, struct btrfs_key *next_key,
 		 int lowest_level, int max_level)
@@ -1888,9 +1888,9 @@ int replace_path(struct btrfs_trans_handle *trans,
 		 * and tree block numbers, if current trans doesn't free
 		 * data reloc tree inode.
 		 */
-		ret = btrfs_qgroup_trace_subtree_swap(trans, parent, slot,
-				path->nodes[level], path->slots[level],
-				last_snapshot);
+		ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group,
+				parent, slot, path->nodes[level],
+				path->slots[level], last_snapshot);
 		if (ret < 0)
 			break;
 
@@ -2209,7 +2209,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 			   btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
 			ret = 0;
 		} else {
-			ret = replace_path(trans, root, reloc_root, path,
+			ret = replace_path(trans, rc, root, reloc_root, path,
 					   &next_key, level, max_level);
 		}
 		if (ret < 0) {

From f556faa46eb4e96d0d0772e74ecf66781e132f72 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Fri, 28 Sep 2018 07:59:34 +0800
Subject: [PATCH 68/93] btrfs: tree-checker: Check level for leaves and nodes

Although we have a tree level check at tree read runtime, it's
completely based on the parent's level. We still need an accurate level
check to prevent invalid tree blocks from sneaking into kernel space.

The check itself is simple: for a leaf, the level should always be 0;
for nodes, the level should be in the range [1, BTRFS_MAX_LEVEL - 1].
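To restate the rule in a compilable form, a minimal standalone sketch
(this is an illustration, not the tree-checker code itself; it assumes
BTRFS_MAX_LEVEL is 8 as in the kernel headers):

	#include <stdbool.h>

	#define BTRFS_MAX_LEVEL 8

	/* A leaf must be level 0; a node must be level 1..BTRFS_MAX_LEVEL-1. */
	static bool header_level_valid(int level, bool is_leaf)
	{
		if (is_leaf)
			return level == 0;
		return level >= 1 && level < BTRFS_MAX_LEVEL;
	}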
Signed-off-by: Qu Wenruo
Reviewed-by: Su Yue
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/tree-checker.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index db835635372f..cab0b1f1f741 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -487,6 +487,13 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
 	u32 nritems = btrfs_header_nritems(leaf);
 	int slot;
 
+	if (btrfs_header_level(leaf) != 0) {
+		generic_err(fs_info, leaf, 0,
+			"invalid level for leaf, have %d expect 0",
+			btrfs_header_level(leaf));
+		return -EUCLEAN;
+	}
+
 	/*
 	 * Extent buffers from a relocation tree have a owner field that
 	 * corresponds to the subvolume tree they are based on. So just from an
@@ -645,9 +652,16 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node)
 	unsigned long nr = btrfs_header_nritems(node);
 	struct btrfs_key key, next_key;
 	int slot;
+	int level = btrfs_header_level(node);
 	u64 bytenr;
 	int ret = 0;
 
+	if (level <= 0 || level >= BTRFS_MAX_LEVEL) {
+		generic_err(fs_info, node, 0,
+			"invalid level for node, have %d expect [1, %d]",
+			level, BTRFS_MAX_LEVEL - 1);
+		return -EUCLEAN;
+	}
 	if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) {
 		btrfs_crit(fs_info,
"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",

From 3280f874576d31b03fe19cbcc23585d96feb4ceb Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 24 Aug 2018 17:32:29 +0200
Subject: [PATCH 69/93] btrfs: remove btrfs_dev_replace::read_locks

This member seems to be copied from the extent_buffer locking scheme and
is, at least in some way, used to assert that the read lock/unlock is
properly nested. While the _inc/_dec are called inside the read lock
section, the asserts are both inside and outside, so the ordering is not
guaranteed and we can see read/inc/dec ordered in any way
(theoretically).

A missing call of btrfs_dev_replace_clear_lock_blocking could cause an
unexpected read_locks count, so this at least looks like a valid
assertion, but this will become unnecessary with later updates.
Reviewed-by: Anand Jain
Signed-off-by: David Sterba
---
 fs/btrfs/ctree.h       | 1 -
 fs/btrfs/dev-replace.c | 5 -----
 fs/btrfs/disk-io.c     | 1 -
 3 files changed, 7 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 36ade217e462..daa273c7e150 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -361,7 +361,6 @@ struct btrfs_dev_replace {
 
 	struct mutex lock_finishing_cancel_unmount;
 	rwlock_t lock;
-	atomic_t read_locks;
 	atomic_t blocking_readers;
 	wait_queue_head_t read_lock_wq;
 
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index ff01740158aa..702fe6a65075 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -964,13 +964,10 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
 void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace)
 {
 	read_lock(&dev_replace->lock);
-	atomic_inc(&dev_replace->read_locks);
 }
 
 void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace)
 {
-	ASSERT(atomic_read(&dev_replace->read_locks) > 0);
-	atomic_dec(&dev_replace->read_locks);
 	read_unlock(&dev_replace->lock);
 }
 
@@ -997,7 +994,6 @@ void btrfs_dev_replace_set_lock_blocking(
 					struct btrfs_dev_replace *dev_replace)
 {
 	/* only set blocking for read lock */
-	ASSERT(atomic_read(&dev_replace->read_locks) > 0);
 	atomic_inc(&dev_replace->blocking_readers);
 	read_unlock(&dev_replace->lock);
 }
@@ -1007,7 +1003,6 @@ void btrfs_dev_replace_clear_lock_blocking(
 					struct btrfs_dev_replace *dev_replace)
 {
 	/* only set blocking for read lock */
-	ASSERT(atomic_read(&dev_replace->read_locks) > 0);
 	ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
 	read_lock(&dev_replace->lock);
 	/* Barrier implied by atomic_dec_and_test */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 517484ecad96..79903243877b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2155,7 +2155,6 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
 {
 	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
 	rwlock_init(&fs_info->dev_replace.lock);
-	atomic_set(&fs_info->dev_replace.read_locks, 0);
 	atomic_set(&fs_info->dev_replace.blocking_readers, 0);
 	init_waitqueue_head(&fs_info->replace_wait);
 	init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);

From 7fb2eced105f67676eb86473d5b1ce6a96f6eab4 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 24 Aug 2018 17:33:58 +0200
Subject: [PATCH 70/93] btrfs: open code btrfs_dev_replace_clear_lock_blocking

There's a single caller and the function name does not say it's actually
taking the lock, so open coding makes it more explicit.

For now, btrfs_dev_replace_read_lock is used instead of read_lock so
it's paired with the unlocking wrapper in the same block.
Reviewed-by: Anand Jain
Signed-off-by: David Sterba
---
 fs/btrfs/dev-replace.c | 12 ------------
 fs/btrfs/dev-replace.h |  2 --
 fs/btrfs/volumes.c     |  6 +++++-
 3 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 702fe6a65075..a79e8d67f768 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -998,18 +998,6 @@ void btrfs_dev_replace_set_lock_blocking(
 	read_unlock(&dev_replace->lock);
 }
 
-/* acquire read lock and dec blocking cnt */
-void btrfs_dev_replace_clear_lock_blocking(
-					struct btrfs_dev_replace *dev_replace)
-{
-	/* only set blocking for read lock */
-	ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
-	read_lock(&dev_replace->lock);
-	/* Barrier implied by atomic_dec_and_test */
-	if (atomic_dec_and_test(&dev_replace->blocking_readers))
-		cond_wake_up_nomb(&dev_replace->read_lock_wq);
-}
-
 void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
 {
 	percpu_counter_inc(&fs_info->bio_counter);
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index b6d4206188bb..39f022c457ff 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -28,8 +28,6 @@ void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace);
 void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace);
 void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace);
 void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_clear_lock_blocking(
-					struct btrfs_dev_replace *dev_replace);
 
 static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
 {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 26eb388db343..f435d397019e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5904,7 +5904,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 	}
 out:
 	if (dev_replace_is_ongoing) {
-		btrfs_dev_replace_clear_lock_blocking(dev_replace);
+		ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
+		btrfs_dev_replace_read_lock(dev_replace);
+		/* Barrier implied by atomic_dec_and_test */
+		if (atomic_dec_and_test(&dev_replace->blocking_readers))
+			cond_wake_up_nomb(&dev_replace->read_lock_wq);
 		btrfs_dev_replace_read_unlock(dev_replace);
 	}
 	free_extent_map(em);

From e37abe9725bf45b292d5653cc02bbd791773c205 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 4 Apr 2018 17:20:52 +0200
Subject: [PATCH 71/93] btrfs: open code btrfs_dev_replace_stats_inc

The wrapper is too trivial, open coding does not make it less readable.
Reviewed-by: Omar Sandoval
Reviewed-by: Anand Jain
Signed-off-by: David Sterba
---
 fs/btrfs/dev-replace.h |  5 -----
 fs/btrfs/scrub.c       | 11 ++++-------
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 39f022c457ff..1c91bf9ef39a 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -29,9 +29,4 @@ void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace);
 void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace);
 void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
 
-static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
-{
-	atomic64_inc(stat_value);
-}
-
 #endif
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4bcc275f7612..902819d3cf41 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1124,7 +1124,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
 			if (scrub_write_page_to_dev_replace(sblock_other,
 							    page_num) != 0) {
-				btrfs_dev_replace_stats_inc(
+				atomic64_inc(
 					&fs_info->dev_replace.num_write_errors);
 				success = 0;
 			}
@@ -1564,8 +1564,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 		if (btrfsic_submit_bio_wait(bio)) {
 			btrfs_dev_stat_inc_and_print(page_bad->dev,
 				BTRFS_DEV_STAT_WRITE_ERRS);
-			btrfs_dev_replace_stats_inc(
-				&fs_info->dev_replace.num_write_errors);
+			atomic64_inc(&fs_info->dev_replace.num_write_errors);
 			bio_put(bio);
 			return -EIO;
 		}
@@ -1592,8 +1591,7 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
 
 		ret = scrub_write_page_to_dev_replace(sblock, page_num);
 		if (ret)
-			btrfs_dev_replace_stats_inc(
-				&fs_info->dev_replace.num_write_errors);
+			atomic64_inc(&fs_info->dev_replace.num_write_errors);
 	}
 }
 
@@ -1726,8 +1724,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
 			struct scrub_page *spage = sbio->pagev[i];
 
 			spage->io_error = 1;
-			btrfs_dev_replace_stats_inc(&dev_replace->
-						    num_write_errors);
+			atomic64_inc(&dev_replace->num_write_errors);
 		}
 	}
 
From 9f6cbcbb09d0f2a73ccb9998f6ac34606da9c938 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 24 Aug 2018 17:41:17 +0200
Subject: [PATCH 72/93] btrfs: open code btrfs_after_dev_replace_commit

Too trivial, the purpose can be simply documented in a comment.
Reviewed-by: Omar Sandoval
Reviewed-by: Anand Jain
Signed-off-by: David Sterba
---
 fs/btrfs/dev-replace.c | 8 --------
 fs/btrfs/dev-replace.h | 1 -
 fs/btrfs/transaction.c | 5 ++++-
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index a79e8d67f768..264105a26e7e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -382,14 +382,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
-
-	dev_replace->committed_cursor_left =
-		dev_replace->cursor_left_last_write_of_item;
-}
-
 static char* btrfs_dev_name(struct btrfs_device *device)
 {
 	if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 1c91bf9ef39a..795c551f5b5e 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -11,7 +11,6 @@ struct btrfs_ioctl_dev_replace_args;
 int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
 int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
 			  struct btrfs_fs_info *fs_info);
-void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
 int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
 			       struct btrfs_ioctl_dev_replace_args *args);
 int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3b1cc978d409..cadc747292d9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1198,7 +1198,10 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
 
 	list_add_tail(&fs_info->extent_root->dirty_list,
 		      &trans->transaction->switch_commits);
-	btrfs_after_dev_replace_commit(fs_info);
+
+	/* Update dev-replace pointer once everything is committed */
+	fs_info->dev_replace.committed_cursor_left =
+		fs_info->dev_replace.cursor_left_last_write_of_item;
 
 	return 0;
 }

From aa144bfeaa7f87c536ab323edfe2692285343e68 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 24 Aug 2018 17:44:05 +0200
Subject: [PATCH 73/93] btrfs: dev-replace: avoid useless lock on error handling path

The exit sequence in btrfs_dev_replace_start does not allow simply
adding a label at the right place for the error handling after a failed
transaction start to jump to. Currently there's a lock that pairs with
the unlock in the section, which is unnecessary and only raises
questions. Add a variable to track the locking status and avoid the
extra locking.
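The resulting shape of the function is a common C idiom; a minimal
compilable userspace sketch of the same pattern (the pthread lock and
the two helpers are hypothetical stand-ins, not btrfs code):

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_rwlock_t state_lock = PTHREAD_RWLOCK_INITIALIZER;

	static int setup_state(void)   { return 0; }	/* hypothetical */
	static int do_heavy_work(void) { return 0; }	/* hypothetical */

	static int start_operation(void)
	{
		bool need_unlock = true;
		int ret;

		pthread_rwlock_wrlock(&state_lock);
		ret = setup_state();
		if (ret)
			goto leave;
		pthread_rwlock_unlock(&state_lock);
		need_unlock = false;	/* unlocked on the success path */

		ret = do_heavy_work();
		if (ret)
			goto leave;	/* lock already dropped, skip unlock */
		return 0;

	leave:
		if (need_unlock)
			pthread_rwlock_unlock(&state_lock);
		return ret;
	}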
Reviewed-by: Omar Sandoval
Reviewed-by: Anand Jain
Signed-off-by: David Sterba
---
 fs/btrfs/dev-replace.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 264105a26e7e..c7f2d6b91a6f 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -400,6 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 	int ret;
 	struct btrfs_device *tgt_device = NULL;
 	struct btrfs_device *src_device = NULL;
+	bool need_unlock;
 
 	src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
 						  srcdev_name);
@@ -424,6 +425,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 		return PTR_ERR(trans);
 	}
 
+	need_unlock = true;
 	btrfs_dev_replace_write_lock(dev_replace);
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -463,6 +465,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 	atomic64_set(&dev_replace->num_write_errors, 0);
 	atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
 	btrfs_dev_replace_write_unlock(dev_replace);
+	need_unlock = false;
 
 	ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
 	if (ret)
@@ -474,6 +477,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
+		need_unlock = true;
 		btrfs_dev_replace_write_lock(dev_replace);
 		dev_replace->replace_state =
 				BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
@@ -500,7 +504,8 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 	return ret;
 
 leave:
-	btrfs_dev_replace_write_unlock(dev_replace);
+	if (need_unlock)
+		btrfs_dev_replace_write_unlock(dev_replace);
 	btrfs_destroy_dev_replace_tgtdev(tgt_device);
 	return ret;
 }

From 7f8d236ae132a8886a0186008c828e21f7460474 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Thu, 5 Apr 2018 01:04:49 +0200
Subject: [PATCH 74/93] btrfs: dev-replace: move replace members out of fs_info

The replace_wait and bio_counter were mistakenly added to fs_info in
commit c404e0dc2c843b154f ("Btrfs: fix use-after-free in the finishing
procedure of the device replace"), but they logically belong to
fs_info::dev_replace. Besides, bio_counter is a very generic name and is
confusing in bare fs_info context.

Reviewed-by: Anand Jain
Signed-off-by: David Sterba
---
 fs/btrfs/ctree.h       |  6 +++---
 fs/btrfs/dev-replace.c | 16 ++++++++--------
 fs/btrfs/disk-io.c     |  9 +++++----
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index daa273c7e150..e8244c0b0597 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -365,6 +365,9 @@ struct btrfs_dev_replace {
 	wait_queue_head_t read_lock_wq;
 
 	struct btrfs_scrub_progress scrub_progress;
+
+	struct percpu_counter bio_counter;
+	wait_queue_head_t replace_wait;
 };
 
 /* For raid type sysfs entries */
@@ -1087,9 +1090,6 @@ struct btrfs_fs_info {
 	/* device replace state */
 	struct btrfs_dev_replace dev_replace;
 
-	struct percpu_counter bio_counter;
-	wait_queue_head_t replace_wait;
-
 	struct semaphore uuid_tree_rescan_sem;
 
 	/* Used to reclaim the metadata space in the background.
	 */
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index c7f2d6b91a6f..d1ea76bccaaf 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -545,8 +545,8 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
 static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
 {
 	set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
-	wait_event(fs_info->replace_wait, !percpu_counter_sum(
-		&fs_info->bio_counter));
+	wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
+		&fs_info->dev_replace.bio_counter));
 }
 
 /*
@@ -555,7 +555,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
 static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
 {
 	clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
-	wake_up(&fs_info->replace_wait);
+	wake_up(&fs_info->dev_replace.replace_wait);
 }
 
 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
@@ -997,25 +997,25 @@ void btrfs_dev_replace_set_lock_blocking(
 
 void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
 {
-	percpu_counter_inc(&fs_info->bio_counter);
+	percpu_counter_inc(&fs_info->dev_replace.bio_counter);
 }
 
 void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
 {
-	percpu_counter_sub(&fs_info->bio_counter, amount);
-	cond_wake_up_nomb(&fs_info->replace_wait);
+	percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
+	cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
 }
 
 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
 {
 	while (1) {
-		percpu_counter_inc(&fs_info->bio_counter);
+		percpu_counter_inc(&fs_info->dev_replace.bio_counter);
 		if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
 				     &fs_info->fs_state)))
 			break;
 
 		btrfs_bio_counter_dec(fs_info);
-		wait_event(fs_info->replace_wait,
+		wait_event(fs_info->dev_replace.replace_wait,
 			   !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
 				     &fs_info->fs_state));
 	}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 79903243877b..27f6a3348f94 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2156,7 +2156,7 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
 	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
 	rwlock_init(&fs_info->dev_replace.lock);
 	atomic_set(&fs_info->dev_replace.blocking_readers, 0);
-	init_waitqueue_head(&fs_info->replace_wait);
+	init_waitqueue_head(&fs_info->dev_replace.replace_wait);
 	init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
 }
 
@@ -2646,7 +2646,8 @@ int open_ctree(struct super_block *sb,
 		goto fail_dirty_metadata_bytes;
 	}
 
-	ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
+	ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
+				  GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_delalloc_bytes;
 	}
@@ -3307,7 +3308,7 @@ int open_ctree(struct super_block *sb,
 	iput(fs_info->btree_inode);
 fail_bio_counter:
-	percpu_counter_destroy(&fs_info->bio_counter);
+	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
 fail_delalloc_bytes:
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
 fail_dirty_metadata_bytes:
@@ -4016,7 +4017,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
 
 	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
-	percpu_counter_destroy(&fs_info->bio_counter);
+	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 
 	btrfs_free_stripe_hash_table(fs_info);

From 9b142115ed3593480b813a9331593e9f199da340 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Thu, 5 Apr 2018 01:19:50 +0200
Subject: [PATCH 75/93] btrfs: dev-replace: remove pointless assert in write unlock

The value of blocking_readers is increased only when the lock is taken
for read; there is no way we can fail the condition with the write lock.

Reviewed-by: Anand Jain
Signed-off-by: David Sterba
---
 fs/btrfs/dev-replace.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index d1ea76bccaaf..2aa48aecc52b 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -982,7 +982,6 @@ void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace)
 
 void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace)
 {
-	ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
 	write_unlock(&dev_replace->lock);
 }

From 523983401644ebeb331c923c28c9591c07430a7d Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Wed, 22 Aug 2018 05:54:37 +0800
Subject: [PATCH 76/93] Btrfs: kill btrfs_clear_path_blocking

Btrfs's btree locking has two modes, spinning mode and blocking mode.
While searching the btree, locking is always acquired in spinning mode
and then converted to blocking mode if necessary, and in some hot paths
we may switch the locking back to spinning mode by
btrfs_clear_path_blocking().

When acquiring locks, both readers and writers need to wait for blocking
readers and writers to complete before doing read_lock()/write_lock().

The problem is that btrfs_clear_path_blocking() needs to switch nodes in
the path to blocking mode at first (by btrfs_set_path_blocking) to make
lockdep happy before doing its actual clearing blocking job.

Switching to blocking mode from spinning mode consists of step 1)
bumping up the blocking readers counter and step 2)
read_unlock()/write_unlock(). This has caused a serious ping-pong effect
when there are a great number of concurrent readers/writers, as waiters
will be woken up and go to sleep immediately.

1) Killing this kind of ping-pong results in a big improvement in my
1600k files creation script:

MNT=/mnt/btrfs
mkfs.btrfs -f /dev/sdf
mount /dev/sdf $MNT
time fsmark -D 10000 -S0 -n 100000 -s 0 -L 1 -l /tmp/fs_log.txt \
	-d $MNT/0 -d $MNT/1 \
	-d $MNT/2 -d $MNT/3 \
	-d $MNT/4 -d $MNT/5 \
	-d $MNT/6 -d $MNT/7 \
	-d $MNT/8 -d $MNT/9 \
	-d $MNT/10 -d $MNT/11 \
	-d $MNT/12 -d $MNT/13 \
	-d $MNT/14 -d $MNT/15

w/o patch:
real    2m27.307s
user    0m12.839s
sys     13m42.831s

w/ patch:
real    1m2.273s
user    0m15.802s
sys     8m16.495s

1.1) latency histogram from funclatency[1]

Overall with the patch, there are ~50% fewer write lock acquisitions,
and the 95% max latency of the write lock also reduces to ~100ms from
>500ms.
--------------------------------------------
w/o patch:
--------------------------------------------
Function = btrfs_tree_lock
     msecs           : count    distribution
         0 -> 1      : 2385222  |****************************************|
         2 -> 3      : 37147    |                                        |
         4 -> 7      : 20452    |                                        |
         8 -> 15     : 13131    |                                        |
        16 -> 31     : 3877     |                                        |
        32 -> 63     : 3900     |                                        |
        64 -> 127    : 2612     |                                        |
       128 -> 255    : 974      |                                        |
       256 -> 511    : 165      |                                        |
       512 -> 1023   : 13       |                                        |

Function = btrfs_tree_read_lock
     msecs           : count    distribution
         0 -> 1      : 6743860  |****************************************|
         2 -> 3      : 2146     |                                        |
         4 -> 7      : 190      |                                        |
         8 -> 15     : 38       |                                        |
        16 -> 31     : 4        |                                        |

--------------------------------------------
w/ patch:
--------------------------------------------
Function = btrfs_tree_lock
     msecs           : count    distribution
         0 -> 1      : 1318454  |****************************************|
         2 -> 3      : 6800     |                                        |
         4 -> 7      : 3664     |                                        |
         8 -> 15     : 2145     |                                        |
        16 -> 31     : 809      |                                        |
        32 -> 63     : 219      |                                        |
        64 -> 127    : 10       |                                        |

Function = btrfs_tree_read_lock
     msecs           : count    distribution
         0 -> 1      : 6854317  |****************************************|
         2 -> 3      : 2383     |                                        |
         4 -> 7      : 601      |                                        |
         8 -> 15     : 92       |                                        |

2) dbench also proves the improvement, dbench -t 120 -D /mnt/btrfs 16

w/o patch: Throughput 158.363 MB/sec
w/ patch:  Throughput 449.52 MB/sec

3) xfstests didn't show any additional failures.

One thing to note is that callers may set path->leave_spinning to have
all nodes in the path stay in spinning mode, which means callers are
ready to not sleep before releasing the path, but it won't cause
problems if they don't want to sleep in blocking mode.

[1]: https://github.com/iovisor/bcc/blob/master/tools/funclatency.py

Signed-off-by: Liu Bo
Signed-off-by: David Sterba
---
 fs/btrfs/ctree.c         | 57 +++-------------------------------------
 fs/btrfs/ctree.h         |  2 --
 fs/btrfs/delayed-inode.c |  3 ---
 3 files changed, 4 insertions(+), 58 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0a6c645fab0a..2ee43b6a4f09 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -52,42 +52,6 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
 	}
 }
 
-/*
- * reset all the locked nodes in the patch to spinning locks.
- *
- * held is used to keep lockdep happy, when lockdep is enabled
- * we set held to a blocking lock before we go around and
- * retake all the spinlocks in the path.  You can safely use NULL
- * for held
- */
-noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
-					struct extent_buffer *held, int held_rw)
-{
-	int i;
-
-	if (held) {
-		btrfs_set_lock_blocking_rw(held, held_rw);
-		if (held_rw == BTRFS_WRITE_LOCK)
-			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
-		else if (held_rw == BTRFS_READ_LOCK)
-			held_rw = BTRFS_READ_LOCK_BLOCKING;
-	}
-	btrfs_set_path_blocking(p);
-
-	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
-		if (p->nodes[i] && p->locks[i]) {
-			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
-			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
-				p->locks[i] = BTRFS_WRITE_LOCK;
-			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
-				p->locks[i] = BTRFS_READ_LOCK;
-		}
-	}
-
-	if (held)
-		btrfs_clear_lock_blocking_rw(held, held_rw);
-}
-
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
@@ -1306,7 +1270,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 		}
 	}
 
-	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
 	btrfs_tree_read_unlock_blocking(eb);
 	free_extent_buffer(eb);
 
@@ -2482,7 +2445,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 		btrfs_set_path_blocking(p);
 		reada_for_balance(fs_info, p, level);
 		sret = split_node(trans, root, p, level);
-		btrfs_clear_path_blocking(p, NULL, 0);
 
 		BUG_ON(sret > 0);
 		if (sret) {
@@ -2503,7 +2465,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 		btrfs_set_path_blocking(p);
 		reada_for_balance(fs_info, p, level);
 		sret = balance_level(trans, root, p, level);
-		btrfs_clear_path_blocking(p, NULL, 0);
 
 		if (sret) {
 			ret = sret;
@@ -2788,7 +2749,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
cow_done:
 		p->nodes[level] = b;
-		btrfs_clear_path_blocking(p, NULL, 0);
+		/*
+		 * Leave path with blocking locks to avoid massive
+		 * lock context switch, this is made on purpose.
+ */ /* * we have a lock on b and as long as we aren't changing @@ -2870,8 +2834,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (!err) { btrfs_set_path_blocking(p); btrfs_tree_lock(b); - btrfs_clear_path_blocking(p, b, - BTRFS_WRITE_LOCK); } p->locks[level] = BTRFS_WRITE_LOCK; } else { @@ -2879,8 +2841,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (!err) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); - btrfs_clear_path_blocking(p, b, - BTRFS_READ_LOCK); } p->locks[level] = BTRFS_READ_LOCK; } @@ -2899,7 +2859,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_path_blocking(p); err = split_leaf(trans, root, key, p, ins_len, ret == 0); - btrfs_clear_path_blocking(p, NULL, 0); BUG_ON(err > 0); if (err) { @@ -2970,7 +2929,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, while (b) { level = btrfs_header_level(b); p->nodes[level] = b; - btrfs_clear_path_blocking(p, NULL, 0); /* * we have a lock on b and as long as we aren't changing @@ -3016,8 +2974,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, if (!err) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); - btrfs_clear_path_blocking(p, b, - BTRFS_READ_LOCK); } b = tree_mod_log_rewind(fs_info, p, b, time_seq); if (!b) { @@ -5201,7 +5157,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, path->locks[level - 1] = BTRFS_READ_LOCK; path->nodes[level - 1] = cur; unlock_up(path, level, 1, 0, NULL); - btrfs_clear_path_blocking(path, NULL, 0); } out: path->keep_locks = keep_locks; @@ -5786,8 +5741,6 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, if (!ret) { btrfs_set_path_blocking(path); btrfs_tree_read_lock(next); - btrfs_clear_path_blocking(path, next, - BTRFS_READ_LOCK); } next_rw_lock = BTRFS_READ_LOCK; } @@ -5823,8 +5776,6 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, if (!ret) { btrfs_set_path_blocking(path); btrfs_tree_read_lock(next); - btrfs_clear_path_blocking(path, next, - BTRFS_READ_LOCK); } next_rw_lock = BTRFS_READ_LOCK; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e8244c0b0597..15c659f23411 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2868,8 +2868,6 @@ void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); void btrfs_set_path_blocking(struct btrfs_path *p); -void btrfs_clear_path_blocking(struct btrfs_path *p, - struct extent_buffer *held, int held_rw); void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 15dcbd6ef531..c669f250d4a0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -765,9 +765,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, i++; } - /* reset all the locked nodes in the patch to spinning locks. 
*/ - btrfs_clear_path_blocking(path, NULL, 0); - /* insert the keys of the items */ setup_items_for_insert(root, path, keys, data_size, total_data_size, total_size, nitems); From f45c752b65af46bf42963295c332865d95f97fff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 28 Sep 2018 07:17:48 -0400 Subject: [PATCH 77/93] btrfs: release metadata before running delayed refs We want to release the unused reservation we have since it refills the delayed refs reserve, which will make everything go smoother when running the delayed refs if we're short on our reservation. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Omar Sandoval Reviewed-by: Liu Bo Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index cadc747292d9..e7f618b17b07 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1932,6 +1932,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; } + btrfs_trans_release_metadata(trans); + trans->block_rsv = NULL; + /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ @@ -1941,9 +1944,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; } - btrfs_trans_release_metadata(trans); - trans->block_rsv = NULL; - cur_trans = trans->transaction; /* From 84de76a2fb217dc1b6bc2965cc397d1648aa1404 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 28 Sep 2018 07:17:49 -0400 Subject: [PATCH 78/93] btrfs: protect space cache inode alloc with GFP_NOFS If we're allocating a new space cache inode it's likely going to be under a transaction handle, so we need to use memalloc_nofs_save() in order to avoid deadlocks, and more importantly lockdep messages that make xfstests fail. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Omar Sandoval Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ed097ff023e8..a1f379e8ad13 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@ -47,6 +48,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header *header; struct extent_buffer *leaf; struct inode *inode = NULL; + unsigned nofs_flag; int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -68,7 +70,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, btrfs_disk_key_to_cpu(&location, &disk_key); btrfs_release_path(path); + /* + * We are often under a trans handle at this point, so we need to make + * sure NOFS is set to keep us from deadlocking. + */ + nofs_flag = memalloc_nofs_save(); inode = btrfs_iget(fs_info->sb, &location, root, NULL); + memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) return inode; From 553cceb49681d60975d00892877d4c871bf220f9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 28 Sep 2018 07:18:00 -0400 Subject: [PATCH 79/93] btrfs: reset max_extent_size on clear in a bitmap We need to clear the max_extent_size when we clear bits from a bitmap since it could have been from the range that contains the max_extent_size. 
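To make the invariant concrete, here is a minimal userspace sketch (not btrfs code; the struct and helper names are invented, only max_extent_size mirrors the real field): a cached largest-free-extent hint has to be dropped whenever bits are cleared, because the cleared range may be exactly the one that produced the hint.

#include <stdint.h>

struct toy_bitmap {
	uint64_t bits;			/* 1 = free, one bit per unit */
	uint64_t unit;			/* bytes represented by one bit */
	uint64_t max_extent_size;	/* cached hint, 0 = recompute */
};

static void toy_clear_bits(struct toy_bitmap *b, unsigned int start,
			   unsigned int count)
{
	while (count--)
		b->bits &= ~(1ULL << start++);
	/*
	 * The cleared bits may have belonged to the extent that produced
	 * the cached maximum, so invalidate the hint and let the next
	 * search recompute it, mirroring the fix above.
	 */
	if (b->max_extent_size > b->unit)
		b->max_extent_size = 0;
}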
CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index a1f379e8ad13..67441219d6c9 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1687,6 +1687,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, bitmap_clear(info->bitmap, start, count); info->bytes -= bytes; + if (info->max_extent_size > ctl->unit) + info->max_extent_size = 0; } static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, From 545e3366db823dc3342ca9d7fea803f829c9062f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 28 Sep 2018 07:18:02 -0400 Subject: [PATCH 80/93] btrfs: make sure we create all new block groups Allocating new chunks modifies both the extent and chunk tree, which can trigger new chunk allocations. So instead of doing list_for_each_safe, just do while (!list_empty()) so we make sure we don't exit with other pending bg's still on our list. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Omar Sandoval Reviewed-by: Liu Bo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d68ee91c00fc..c4a1e8f028d3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10128,7 +10128,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *block_group, *tmp; + struct btrfs_block_group_cache *block_group; struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_block_group_item item; struct btrfs_key key; @@ -10136,7 +10136,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) bool can_flush_pending_bgs = trans->can_flush_pending_bgs; trans->can_flush_pending_bgs = false; - list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { + while (!list_empty(&trans->new_bgs)) { + block_group = list_first_entry(&trans->new_bgs, + struct btrfs_block_group_cache, + bg_list); if (ret) goto next; From e187831e1875b2754a3ba3c683bd277d4a5f7be8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 28 Sep 2018 07:18:03 -0400 Subject: [PATCH 81/93] btrfs: assert on non-empty delayed iputs I ran into an issue where there was some reference being held on an inode that I couldn't track. This assert wasn't triggered, but it at least rules out we're doing something stupid. 
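The pattern itself fits in a few lines; a hedged, self-contained miniature follows (names invented; in btrfs the check uses ASSERT(), which compiles away unless CONFIG_BTRFS_ASSERT is set). The idea is to verify the invariant at the last moment it must hold, so a leaked reference trips at shutdown instead of surfacing later as an unexplained crash.

#include <assert.h>
#include <stdbool.h>

struct list_head { struct list_head *next, *prev; };

static bool list_is_empty(const struct list_head *head)
{
	return head->next == head;
}

struct toyfs_info {
	struct list_head delayed_iputs;
};

static void toyfs_close(struct toyfs_info *fs)
{
	/* A non-empty list here means an inode reference leaked. */
	assert(list_is_empty(&fs->delayed_iputs));
	/* ... rest of teardown ... */
}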
Reviewed-by: Omar Sandoval Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 27f6a3348f94..b0ab41da91d1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3976,6 +3976,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); + ASSERT(list_empty(&fs_info->delayed_iputs)); set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); btrfs_free_qgroup_config(fs_info); From ad80cf50c3f09a88eed918feeb95edaaf8d72b0a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 28 Sep 2018 07:18:19 -0400 Subject: [PATCH 82/93] btrfs: drop min_size from evict_refill_and_join We don't need it, rsv->size is set once and never changes throughout its lifetime, so just use that for the reserve size. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ae74c740f3e1..6c476dc81b8e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5292,8 +5292,7 @@ static void evict_inode_truncate_pages(struct inode *inode) } static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, - struct btrfs_block_rsv *rsv, - u64 min_size) + struct btrfs_block_rsv *rsv) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; @@ -5303,7 +5302,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, struct btrfs_trans_handle *trans; int ret; - ret = btrfs_block_rsv_refill(root, rsv, min_size, + ret = btrfs_block_rsv_refill(root, rsv, rsv->size, BTRFS_RESERVE_FLUSH_LIMIT); if (ret && ++failures > 2) { @@ -5321,7 +5320,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, * it. */ if (!btrfs_check_space_for_delayed_refs(trans, fs_info) && - !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, false)) + !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false)) return trans; /* If not, commit and try again. */ @@ -5337,7 +5336,6 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; - u64 min_size; int ret; trace_btrfs_inode_evict(inode); @@ -5347,8 +5345,6 @@ void btrfs_evict_inode(struct inode *inode) return; } - min_size = btrfs_calc_trunc_metadata_size(fs_info, 1); - evict_inode_truncate_pages(inode); if (inode->i_nlink && @@ -5378,13 +5374,13 @@ void btrfs_evict_inode(struct inode *inode) rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) goto no_delete; - rsv->size = min_size; + rsv->size = btrfs_calc_trunc_metadata_size(fs_info, 1); rsv->failfast = 1; btrfs_i_size_write(BTRFS_I(inode), 0); while (1) { - trans = evict_refill_and_join(root, rsv, min_size); + trans = evict_refill_and_join(root, rsv); if (IS_ERR(trans)) goto free_rsv; @@ -5409,7 +5405,7 @@ void btrfs_evict_inode(struct inode *inode) * If it turns out that we are dropping too many of these, we might want * to add a mechanism for retrying these after a commit. 
*/ - trans = evict_refill_and_join(root, rsv, min_size); + trans = evict_refill_and_join(root, rsv); if (!IS_ERR(trans)) { trans->block_rsv = rsv; btrfs_orphan_del(trans, BTRFS_I(inode)); From f2d72f42d5fa3bf33761d9e47201745f624fcff5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 8 Oct 2018 11:12:55 +0100 Subject: [PATCH 83/93] Btrfs: fix warning when replaying log after fsync of a tmpfile When replaying a log which contains a tmpfile (which necessarily has a link count of 0) we end up calling inc_nlink(), at fs/btrfs/tree-log.c:replay_one_buffer(), which produces a warning like the following: [195191.943673] WARNING: CPU: 0 PID: 6924 at fs/inode.c:342 inc_nlink+0x33/0x40 [195191.943723] CPU: 0 PID: 6924 Comm: mount Not tainted 4.19.0-rc6-btrfs-next-38 #1 [195191.943724] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [195191.943726] RIP: 0010:inc_nlink+0x33/0x40 [195191.943728] RSP: 0018:ffffb96e425e3870 EFLAGS: 00010246 [195191.943730] RAX: 0000000000000000 RBX: ffff8c0d1e6af4f0 RCX: 0000000000000006 [195191.943731] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8c0d1e6af4f0 [195191.943731] RBP: 0000000000000097 R08: 0000000000000001 R09: 0000000000000000 [195191.943732] R10: 0000000000000000 R11: 0000000000000000 R12: ffffb96e425e3a60 [195191.943733] R13: ffff8c0d10cff0c8 R14: ffff8c0d0d515348 R15: ffff8c0d78a1b3f8 [195191.943735] FS: 00007f570ee24480(0000) GS:ffff8c0dfb200000(0000) knlGS:0000000000000000 [195191.943736] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [195191.943737] CR2: 00005593286277c8 CR3: 00000000bb8f2006 CR4: 00000000003606f0 [195191.943739] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [195191.943740] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [195191.943741] Call Trace: [195191.943778] replay_one_buffer+0x797/0x7d0 [btrfs] [195191.943802] walk_up_log_tree+0x1c1/0x250 [btrfs] [195191.943809] ? rcu_read_lock_sched_held+0x3f/0x70 [195191.943825] walk_log_tree+0xae/0x1d0 [btrfs] [195191.943840] btrfs_recover_log_trees+0x1d7/0x4d0 [btrfs] [195191.943856] ? replay_dir_deletes+0x280/0x280 [btrfs] [195191.943870] open_ctree+0x1c3b/0x22a0 [btrfs] [195191.943887] btrfs_mount_root+0x6b4/0x800 [btrfs] [195191.943894] ? rcu_read_lock_sched_held+0x3f/0x70 [195191.943899] ? pcpu_alloc+0x55b/0x7c0 [195191.943906] ? mount_fs+0x3b/0x140 [195191.943908] mount_fs+0x3b/0x140 [195191.943912] ? __init_waitqueue_head+0x36/0x50 [195191.943916] vfs_kern_mount+0x62/0x160 [195191.943927] btrfs_mount+0x134/0x890 [btrfs] [195191.943936] ? rcu_read_lock_sched_held+0x3f/0x70 [195191.943938] ? pcpu_alloc+0x55b/0x7c0 [195191.943943] ? mount_fs+0x3b/0x140 [195191.943952] ? btrfs_remount+0x570/0x570 [btrfs] [195191.943954] mount_fs+0x3b/0x140 [195191.943956] ? __init_waitqueue_head+0x36/0x50 [195191.943960] vfs_kern_mount+0x62/0x160 [195191.943963] do_mount+0x1f9/0xd40 [195191.943967] ? 
memdup_user+0x4b/0x70 [195191.943971] ksys_mount+0x7e/0xd0 [195191.943974] __x64_sys_mount+0x21/0x30 [195191.943977] do_syscall_64+0x60/0x1b0 [195191.943980] entry_SYSCALL_64_after_hwframe+0x49/0xbe [195191.943983] RIP: 0033:0x7f570e4e524a [195191.943986] RSP: 002b:00007ffd83589478 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [195191.943989] RAX: ffffffffffffffda RBX: 0000563f335b2060 RCX: 00007f570e4e524a [195191.943990] RDX: 0000563f335b2240 RSI: 0000563f335b2280 RDI: 0000563f335b2260 [195191.943992] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000020 [195191.943993] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 0000563f335b2260 [195191.943994] R13: 0000563f335b2240 R14: 0000000000000000 R15: 00000000ffffffff [195191.944002] irq event stamp: 8688 [195191.944010] hardirqs last enabled at (8687): [] console_unlock+0x503/0x640 [195191.944012] hardirqs last disabled at (8688): [] trace_hardirqs_off_thunk+0x1a/0x1c [195191.944018] softirqs last enabled at (8638): [] __set_page_dirty_nobuffers+0x101/0x150 [195191.944020] softirqs last disabled at (8634): [] wb_wakeup_delayed+0x2e/0x60 [195191.944022] ---[ end trace 5d6e873a9a0b811a ]--- This happens because the inode does not have the flag I_LINKABLE set, which is a runtime only flag, not meant to be persisted, set when the inode is created through open(2) if the flag O_EXCL is not passed to it. Except for the warning, there are no other consequences (like corruptions or metadata inconsistencies). Since it's pointless to replay a tmpfile as it would be deleted in a later phase of the log replay procedure (it has a link count of 0), fix this by not logging tmpfiles and if a tmpfile is found in a log (created by a kernel without this change), skip the replay of the inode. A test case for fstests follows soon. Fixes: 471d557afed1 ("Btrfs: fix loss of prealloc extents past i_size after fsync log replay") CC: stable@vger.kernel.org # 4.18+ Reported-by: Martin Steigerwald Link: https://lore.kernel.org/linux-btrfs/3666619.NTnn27ZJZE@merkaba/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c7914f293cb6..a5e08a73653e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -255,6 +255,13 @@ struct walk_control { /* what stage of the replay code we're currently in */ int stage; + /* + * Ignore any items from the inode currently being processed. Needs + * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in + * the LOG_WALK_REPLAY_INODES stage. + */ + bool ignore_cur_inode; + /* the root we are currently replaying */ struct btrfs_root *replay_dest; @@ -2484,6 +2491,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + /* + * If we have a tmpfile (O_TMPFILE) that got fsync'ed + * and never got linked before the fsync, skip it, as + * replaying it is pointless since it would be deleted + * later. We skip logging tmpfiles, but it's always + * possible we are replaying a log created with a kernel + * that used to log tmpfiles. 
+ */ + if (btrfs_inode_nlink(eb, inode_item) == 0) { + wc->ignore_cur_inode = true; + continue; + } else { + wc->ignore_cur_inode = false; + } ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid); if (ret) @@ -2521,16 +2542,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, root->fs_info->sectorsize); ret = btrfs_drop_extents(wc->trans, root, inode, from, (u64)-1, 1); - /* - * If the nlink count is zero here, the iput - * will free the inode. We bump it to make - * sure it doesn't get freed until the link - * count fixup is done. - */ if (!ret) { - if (inode->i_nlink == 0) - inc_nlink(inode); - /* Update link count and nbytes. */ + /* Update the inode's nbytes. */ ret = btrfs_update_inode(wc->trans, root, inode); } @@ -2545,6 +2558,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, break; } + if (wc->ignore_cur_inode) + continue; + if (key.type == BTRFS_DIR_INDEX_KEY && wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { ret = replay_one_dir_item(wc->trans, root, path, @@ -5640,7 +5656,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - if (btrfs_inode_in_log(inode, trans->transid)) { + /* + * Skip already logged inodes or inodes corresponding to tmpfiles + * (since logging them is pointless, a link count of 0 means they + * will never be accessible). + */ + if (btrfs_inode_in_log(inode, trans->transid) || + inode->vfs_inode.i_nlink == 0) { ret = BTRFS_NO_LOG_SYNC; goto end_no_trans; } From 0f375eed92b5a407657532637ed9652611a682f5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 9 Oct 2018 15:05:29 +0100 Subject: [PATCH 84/93] Btrfs: fix wrong dentries after fsync of file that got its parent replaced In a scenario like the following: mkdir /mnt/A # inode 258 mkdir /mnt/B # inode 259 touch /mnt/B/bar # inode 260 sync mv /mnt/B/bar /mnt/A/bar mv -T /mnt/A /mnt/B fsync /mnt/B/bar After replaying the log we end up with file bar having 2 hard links, both with the name 'bar' and one in the directory with inode number 258 and the other in the directory with inode number 259. Also, we end up with the directory inode 259 still existing and with the directory inode 258 still named as 'A', instead of 'B'. In this scenario, file 'bar' should only have one hard link, located at directory inode 258, the directory inode 259 should not exist anymore and the name for directory inode 258 should be 'B'. This incorrect behaviour happens because when attempting to log the old parents of an inode, we skip any parents that no longer exist. Fix this by forcing a full commit if an old parent no longer exists. A test case for fstests follows soon. CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a5e08a73653e..0dba09334a16 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5580,9 +5580,33 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, dir_inode = btrfs_iget(fs_info->sb, &inode_key, root, NULL); - /* If parent inode was deleted, skip it. */ - if (IS_ERR(dir_inode)) - continue; + /* + * If the parent inode was deleted, return an error to + * fallback to a transaction commit. 
This is to prevent + * getting an inode that was moved from one parent A to + * a parent B, got its former parent A deleted and then + * it got fsync'ed, from existing at both parents after + * a log replay (and the old parent still existing). + * Example: + * + * mkdir /mnt/A + * mkdir /mnt/B + * touch /mnt/B/bar + * sync + * mv /mnt/B/bar /mnt/A/bar + * mv -T /mnt/A /mnt/B + * fsync /mnt/B/bar + * + * + * If we ignore the old parent B which got deleted, + * after a log replay we would have file bar linked + * at both parents and the old parent B would still + * exist. + */ + if (IS_ERR(dir_inode)) { + ret = PTR_ERR(dir_inode); + goto out; + } if (ctx) ctx->log_new_dentries = false; From 3628b4ca64f24a4ec55055597d0cb1c814729f8b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 9 Oct 2018 14:36:45 +0800 Subject: [PATCH 85/93] btrfs: qgroup: Avoid calling qgroup functions if qgroup is not enabled Some qgroup trace events like btrfs_qgroup_release_data() and btrfs_qgroup_free_delayed_ref() can still be triggered even if qgroup is not enabled. This is caused by the lack of qgroup status check before calling some qgroup functions. Thankfully the functions can handle quota disabled case well and just do nothing for qgroup disabled case. This patch will do earlier check before triggering related trace events. And for enabled <-> disabled race case: 1) For enabled->disabled case Disable will wipe out all qgroups data including reservation and excl/rfer. Even if we leak some reservation or numbers, it will still be cleared, so nothing will go wrong. 2) For disabled -> enabled case Current btrfs_qgroup_release_data() will use extent_io tree to ensure we won't underflow reservation. And for delayed_ref we use head->qgroup_reserved to record the reserved space, so in that case head->qgroup_reserved should be 0 and we won't underflow. CC: stable@vger.kernel.org # 4.14+ Reported-by: Chris Murphy Link: https://lore.kernel.org/linux-btrfs/CAJCQCtQau7DtuUUeycCkZ36qjbKuxNzsgqJ7+sJ6W0dK_NLE3w@mail.gmail.com/ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 4 ++++ fs/btrfs/qgroup.h | 2 ++ 2 files changed, 6 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ac9690f36a94..27f517315388 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3521,6 +3521,10 @@ static int __btrfs_qgroup_release_data(struct inode *inode, int trace_op = QGROUP_RELEASE; int ret; + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, + &BTRFS_I(inode)->root->fs_info->flags)) + return 0; + /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); if (free && reserved) diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 80ebeb3ab5ba..d8f78f5ab854 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -255,6 +255,8 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return; trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes, BTRFS_QGROUP_RSV_DATA); From 98ff7b94e4daca8d7ea4352a9a7905cced597a4c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 21 Sep 2018 15:20:29 +0800 Subject: [PATCH 86/93] btrfs: relocation: Cleanup while loop using rbtree_postorder_for_each_entry_safe And add one line comment explaining what we're doing for each loop. 
Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3e6e3d93caad..0c1bc7df06a2 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2991,7 +2991,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, struct backref_node *node; struct btrfs_path *path; struct tree_block *block; - struct rb_node *rb_node; + struct tree_block *next; int ret; int err = 0; @@ -3001,29 +3001,23 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, goto out_free_blocks; } - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); + /* Kick in readahead for tree blocks with missing keys */ + rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) readahead_tree_block(fs_info, block->bytenr); - rb_node = rb_next(rb_node); } - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); + /* Get first keys */ + rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) { err = get_tree_block_key(fs_info, block); if (err) goto out_free_path; } - rb_node = rb_next(rb_node); } - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - + /* Do tree relocation */ + rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { node = build_backref_tree(rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { @@ -3034,11 +3028,10 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, ret = relocate_tree_block(trans, rc, node, &block->key, path); if (ret < 0) { - if (ret != -EAGAIN || rb_node == rb_first(blocks)) + if (ret != -EAGAIN || &block->rb_node == rb_first(blocks)) err = ret; goto out; } - rb_node = rb_next(rb_node); } out: err = finish_pending_nodes(trans, rc, path, err); From 06bbf67244fc63c66340bd31f4b2b60d797e886e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 21 Sep 2018 15:20:30 +0800 Subject: [PATCH 87/93] btrfs: relocation: Remove redundant tree level check Commit 581c1760415c ("btrfs: Validate child tree block's level and first key") has made tree block level check mandatory. So if tree block level doesn't match, we won't get a valid extent buffer. The extra WARN_ON() check can be removed completely. Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0c1bc7df06a2..924116f654a1 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2915,7 +2915,6 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, free_extent_buffer(eb); return -EIO; } - WARN_ON(btrfs_header_level(eb) != block->level); if (block->level == 0) btrfs_item_key_to_cpu(eb, &block->key, 0); else From b90e22ba4804d31160f6cd625efa599ba9d286af Mon Sep 17 00:00:00 2001 From: Lu Fengqi Date: Thu, 11 Oct 2018 13:42:56 +0800 Subject: [PATCH 88/93] btrfs: qgroup: move the qgroup->members check out from (!qgroup)'s else branch There is no reason to put this check in (!qgroup)'s else branch because if qgroup is null, it will goto out directly. So move it out to reduce indentation level. No functional change. 
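The same refactor in miniature, shown as a before/after condensed from the hunk below: with every failure handled by an early goto, the remaining check needs no else branch and the success path reads straight down.

	/* before: the second check hides inside the else branch */
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	} else {
		if (!list_empty(&qgroup->members)) {
			ret = -EBUSY;
			goto out;
		}
	}

	/* after: two flat guard clauses, one indentation level less */
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}
	if (!list_empty(&qgroup->members)) {
		ret = -EBUSY;
		goto out;
	}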
Signed-off-by: Lu Fengqi Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 27f517315388..45868fd76209 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1416,13 +1416,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) if (!qgroup) { ret = -ENOENT; goto out; - } else { - /* check if there are no children of this qgroup */ - if (!list_empty(&qgroup->members)) { - ret = -EBUSY; - goto out; - } } + + /* Check if there are no children of this qgroup */ + if (!list_empty(&qgroup->members)) { + ret = -EBUSY; + goto out; + } + ret = del_qgroup_item(trans, qgroupid); if (ret && ret != -ENOENT) goto out; From 5637c74b01458d4bc392c2bb721bd102f316ad2d Mon Sep 17 00:00:00 2001 From: Lu Fengqi Date: Thu, 11 Oct 2018 13:40:33 +0800 Subject: [PATCH 89/93] btrfs: delayed-ref: pass delayed_refs directly to btrfs_select_ref_head Since trans is only used for referring to delayed_refs, there is no need to pass it instead of delayed_refs to btrfs_select_ref_head(). No functional change. Signed-off-by: Lu Fengqi Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 7 ++----- fs/btrfs/delayed-ref.h | 4 ++-- fs/btrfs/extent-tree.c | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 7f567c944fec..59a8c0565a24 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -353,16 +353,13 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) return ret; } -struct btrfs_delayed_ref_head * -btrfs_select_ref_head(struct btrfs_trans_handle *trans) +struct btrfs_delayed_ref_head *btrfs_select_ref_head( + struct btrfs_delayed_ref_root *delayed_refs) { - struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; u64 start; bool loop = false; - delayed_refs = &trans->transaction->delayed_refs; - again: start = delayed_refs->run_delayed_start; head = find_ref_head(delayed_refs, start, 1); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c3e3486a126c..d664c99ce716 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -263,8 +263,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) } -struct btrfs_delayed_ref_head * -btrfs_select_ref_head(struct btrfs_trans_handle *trans); +struct btrfs_delayed_ref_head *btrfs_select_ref_head( + struct btrfs_delayed_ref_root *delayed_refs); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c4a1e8f028d3..5141fe67680b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2511,7 +2511,7 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( int ret; spin_lock(&delayed_refs->lock); - head = btrfs_select_ref_head(trans); + head = btrfs_select_ref_head(delayed_refs); if (!head) { spin_unlock(&delayed_refs->lock); return head; From 9e920a6f03e40b1eb712f38b29ad5880153754e2 Mon Sep 17 00:00:00 2001 From: Lu Fengqi Date: Thu, 11 Oct 2018 13:40:34 +0800 Subject: [PATCH 90/93] btrfs: delayed-ref: pass delayed_refs directly to btrfs_delayed_ref_lock Since trans is only used for referring to delayed_refs, there is no need to pass it instead of delayed_refs to btrfs_delayed_ref_lock(). No functional change. 
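As a rough sketch of the shape these two delayed-ref cleanups move toward (all types invented, not the btrfs definitions): when a callee only ever reaches through the transaction handle for one member, taking that member directly makes the signature state the real dependency, and a caller that already holds it simply passes it on.

/* Illustrative types only; not the btrfs ones. */
struct delayed_ref_root { int num_heads; };
struct transaction { struct delayed_ref_root delayed_refs; };
struct trans_handle { struct transaction *transaction; };

/* The callee now names its one real dependency... */
static int count_ref_heads(struct delayed_ref_root *delayed_refs)
{
	return delayed_refs->num_heads;
}

/* ...and the caller, which already resolved it, passes it along. */
static int caller(struct trans_handle *trans)
{
	struct delayed_ref_root *delayed_refs =
			&trans->transaction->delayed_refs;

	return count_ref_heads(delayed_refs);
}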
Reviewed-by: Nikolay Borisov Signed-off-by: Lu Fengqi Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 5 +---- fs/btrfs/delayed-ref.h | 2 +- fs/btrfs/extent-tree.c | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 59a8c0565a24..5d8f6d505d5a 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -204,12 +204,9 @@ static struct btrfs_delayed_ref_head* find_ref_head( return NULL; } -int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, +int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - struct btrfs_delayed_ref_root *delayed_refs; - - delayed_refs = &trans->transaction->delayed_refs; lockdep_assert_held(&delayed_refs->lock); if (mutex_trylock(&head->mutex)) return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d664c99ce716..8e20c5cb5404 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -255,7 +255,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr); -int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, +int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5141fe67680b..3ee5d84cd817 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2521,7 +2521,7 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( * Grab the lock that says we are going to process all the refs for * this head */ - ret = btrfs_delayed_ref_lock(trans, head); + ret = btrfs_delayed_ref_lock(delayed_refs, head); spin_unlock(&delayed_refs->lock); /* From af9b8a0e2085fc90dca85acd85ee83ece7c05130 Mon Sep 17 00:00:00 2001 From: Lu Fengqi Date: Thu, 11 Oct 2018 13:40:35 +0800 Subject: [PATCH 91/93] btrfs: remove fs_info from btrfs_check_space_for_delayed_refs It can be referenced from the transaction handle. 
Signed-off-by: Lu Fengqi Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +-- fs/btrfs/extent-tree.c | 6 +++--- fs/btrfs/inode.c | 2 +- fs/btrfs/transaction.c | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 15c659f23411..4002c9fd924b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2600,8 +2600,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3ee5d84cd817..c7de1e720f56 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2789,9 +2789,9 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) return num_csums; } -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_rsv *global_rsv; u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; @@ -2842,7 +2842,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, if (val >= NSEC_PER_SEC / 2) return 2; - return btrfs_check_space_for_delayed_refs(trans, fs_info); + return btrfs_check_space_for_delayed_refs(trans); } struct async_delayed_refs { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6c476dc81b8e..6a5557e8909d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5319,7 +5319,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, * Try to steal from the global reserve if there is space for * it. */ - if (!btrfs_check_space_for_delayed_refs(trans, fs_info) && + if (!btrfs_check_space_for_delayed_refs(trans) && !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false)) return trans; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e7f618b17b07..c5015458c5c8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -760,7 +760,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - if (btrfs_check_space_for_delayed_refs(trans, fs_info)) + if (btrfs_check_space_for_delayed_refs(trans)) return 1; return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); From 7c8616278b19c42141ff4617573cbf950f4a456b Mon Sep 17 00:00:00 2001 From: Lu Fengqi Date: Thu, 11 Oct 2018 13:40:36 +0800 Subject: [PATCH 92/93] btrfs: remove fs_info from btrfs_should_throttle_delayed_refs The avg_delayed_ref_runtime can be referenced from the transaction handle. 
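This patch and the previous one are the same one-line move, sketched here with invented names: the transaction handle already records its fs_info, so the callee can derive it and drop the redundant parameter, which also removes any chance of the two arguments disagreeing.

struct fs_info { unsigned long avg_delayed_ref_runtime; };
struct trans_handle { struct fs_info *fs_info; };

/* One argument instead of two; the handle is the source of truth. */
static unsigned long avg_ref_runtime(struct trans_handle *trans)
{
	return trans->fs_info->avg_delayed_ref_runtime;
}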
Signed-off-by: Lu Fengqi Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +-- fs/btrfs/extent-tree.c | 5 ++--- fs/btrfs/inode.c | 5 ++--- fs/btrfs/transaction.c | 2 +- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4002c9fd924b..68ca41dbbef3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2598,8 +2598,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c7de1e720f56..a4cd0221bc8d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2826,8 +2826,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans) return ret; } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) { u64 num_entries = atomic_read(&trans->transaction->delayed_refs.num_entries); @@ -2835,7 +2834,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, u64 val; smp_mb(); - avg_runtime = fs_info->avg_delayed_ref_runtime; + avg_runtime = trans->fs_info->avg_delayed_ref_runtime; val = num_entries * avg_runtime; if (val >= NSEC_PER_SEC) return 1; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6a5557e8909d..f22f77172c5f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4713,7 +4713,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); break; } - if (btrfs_should_throttle_delayed_refs(trans, fs_info)) + if (btrfs_should_throttle_delayed_refs(trans)) btrfs_async_run_delayed_refs(fs_info, trans->delayed_ref_updates * 2, trans->transid, 0); @@ -4722,8 +4722,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, extent_num_bytes)) { should_end = true; } - if (btrfs_should_throttle_delayed_refs(trans, - fs_info)) + if (btrfs_should_throttle_delayed_refs(trans)) should_throttle = true; } } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c5015458c5c8..5686290a50e1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -835,7 +835,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, trans->delayed_ref_updates = 0; if (!trans->sync) { must_run_delayed_refs = - btrfs_should_throttle_delayed_refs(trans, info); + btrfs_should_throttle_delayed_refs(trans); cur = max_t(unsigned long, cur, 32); /* From d9352794dad9f28535439d85a815978878c141ab Mon Sep 17 00:00:00 2001 From: Lu Fengqi Date: Thu, 11 Oct 2018 13:40:38 +0800 Subject: [PATCH 93/93] btrfs: switch return_bigger to bool in find_ref_head Using bool is more suitable than int here, and add the comment about the return_bigger. 
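The readability argument in miniature (a hypothetical lookup with the same flag semantics as find_ref_head()): an int flag reads as a magic number at the call site, while bool makes the intent visible without jumping to the definition.

#include <stdbool.h>
#include <stddef.h>

struct ref_head;
struct ref_root;

static struct ref_head *toy_find_head(struct ref_root *root,
				      unsigned long long bytenr,
				      bool return_bigger)
{
	/* stub body; only the signature matters for this example */
	(void)root; (void)bytenr; (void)return_bigger;
	return NULL;
}

static void example(struct ref_root *root)
{
	/* before: toy_find_head(root, 4096, 1) - what does 1 mean? */
	toy_find_head(root, 4096, true);	/* self-documenting */
}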
Signed-off-by: Lu Fengqi Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 5d8f6d505d5a..5149165b49a4 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -168,11 +168,12 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, * find an head entry based on bytenr. This returns the delayed ref * head if it was able to find one, or NULL if nothing was in that spot. * If return_bigger is given, the next bigger entry is returned if no exact - * match is found. + * match is found. But if no bigger one is found then the first node of the + * ref head tree will be returned. */ static struct btrfs_delayed_ref_head* find_ref_head( struct btrfs_delayed_ref_root *dr, u64 bytenr, - int return_bigger) + bool return_bigger) { struct rb_root *root = &dr->href_root.rb_root; struct rb_node *n; @@ -359,12 +360,12 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( again: start = delayed_refs->run_delayed_start; - head = find_ref_head(delayed_refs, start, 1); + head = find_ref_head(delayed_refs, start, true); if (!head && !loop) { delayed_refs->run_delayed_start = 0; start = 0; loop = true; - head = find_ref_head(delayed_refs, start, 1); + head = find_ref_head(delayed_refs, start, true); if (!head) return NULL; } else if (!head && loop) { @@ -905,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) { - return find_ref_head(delayed_refs, bytenr, 0); + return find_ref_head(delayed_refs, bytenr, false); } void __cold btrfs_delayed_ref_exit(void)