diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ae750b1574a2..68ebe188446a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -112,11 +112,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb, } struct preftree { - struct rb_root root; + struct rb_root_cached root; unsigned int count; }; -#define PREFTREE_INIT { .root = RB_ROOT, .count = 0 } +#define PREFTREE_INIT { .root = RB_ROOT_CACHED, .count = 0 } struct preftrees { struct preftree direct; /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */ @@ -225,14 +225,15 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, struct prelim_ref *newref, struct share_check *sc) { - struct rb_root *root; + struct rb_root_cached *root; struct rb_node **p; struct rb_node *parent = NULL; struct prelim_ref *ref; int result; + bool leftmost = true; root = &preftree->root; - p = &root->rb_node; + p = &root->rb_root.rb_node; while (*p) { parent = *p; @@ -242,6 +243,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, p = &(*p)->rb_left; } else if (result > 0) { p = &(*p)->rb_right; + leftmost = false; } else { /* Identical refs, merge them and free @newref */ struct extent_inode_elem *eie = ref->inode_list; @@ -272,7 +274,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, preftree->count++; trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); rb_link_node(&newref->rbnode, parent, p); - rb_insert_color(&newref->rbnode, root); + rb_insert_color_cached(&newref->rbnode, root, leftmost); } /* @@ -283,11 +285,11 @@ static void prelim_release(struct preftree *preftree) { struct prelim_ref *ref, *next_ref; - rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root, - rbnode) + rbtree_postorder_for_each_entry_safe(ref, next_ref, + &preftree->root.rb_root, rbnode) free_pref(ref); - preftree->root = RB_ROOT; + preftree->root = RB_ROOT_CACHED; preftree->count = 0; } @@ -627,7 +629,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, * freeing the entire indirect tree when we're done. In some test * cases, the tree can grow quite large (~200k objects). */ - while ((rnode = rb_first(&preftrees->indirect.root))) { + while ((rnode = rb_first_cached(&preftrees->indirect.root))) { struct prelim_ref *ref; ref = rb_entry(rnode, struct prelim_ref, rbnode); @@ -637,7 +639,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, goto out; } - rb_erase(&ref->rbnode, &preftrees->indirect.root); + rb_erase_cached(&ref->rbnode, &preftrees->indirect.root); preftrees->indirect.count--; if (ref->count == 0) { @@ -717,9 +719,9 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, struct preftree *tree = &preftrees->indirect_missing_keys; struct rb_node *node; - while ((node = rb_first(&tree->root))) { + while ((node = rb_first_cached(&tree->root))) { ref = rb_entry(node, struct prelim_ref, rbnode); - rb_erase(node, &tree->root); + rb_erase_cached(node, &tree->root); BUG_ON(ref->parent); /* should not be a direct ref */ BUG_ON(ref->key_for_search.type); @@ -769,7 +771,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key); spin_lock(&head->lock); - for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) { + for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) { node = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); if (node->seq > seq) @@ -1229,14 +1231,14 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, if (ret) goto out; - WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root)); + WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, extent_item_pos, total_refs, sc, ignore_offset); if (ret) goto out; - WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root)); + WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root)); /* * This walks the tree of merged and resolved refs. Tree blocks are @@ -1245,7 +1247,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, * * We release the entire tree in one go before returning. */ - node = rb_first(&preftrees.direct.root); + node = rb_first_cached(&preftrees.direct.root); while (node) { ref = rb_entry(node, struct prelim_ref, rbnode); node = rb_next(&ref->rbnode); @@ -1468,7 +1470,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) struct seq_list elem = SEQ_LIST_INIT(elem); int ret = 0; struct share_check shared = { - .root_objectid = root->objectid, + .root_objectid = root->root_key.objectid, .inum = inum, .share_count = 0, }; @@ -2031,7 +2033,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, /* path must be released before calling iterate()! */ btrfs_debug(fs_root->fs_info, "following ref at offset %u for inode %llu in tree %llu", - cur, found_key.objectid, fs_root->objectid); + cur, found_key.objectid, + fs_root->root_key.objectid); ret = iterate(parent, name_len, (unsigned long)(iref + 1), eb, ctx); if (ret) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 1343ac57b438..97d91e55b70a 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -206,7 +206,7 @@ static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) static inline unsigned long btrfs_inode_hash(u64 objectid, const struct btrfs_root *root) { - u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME); + u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME); #if BITS_PER_LONG == 32 h = (h >> 32) ^ (h & 0xffffffff); @@ -339,15 +339,15 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, struct btrfs_root *root = inode->root; /* Output minus objectid, which is more meaningful */ - if (root->objectid >= BTRFS_LAST_FREE_OBJECTID) + if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) btrfs_warn_rl(root->fs_info, "csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d", - root->objectid, btrfs_ino(inode), + root->root_key.objectid, btrfs_ino(inode), logical_start, csum, csum_expected, mirror_num); else btrfs_warn_rl(root->fs_info, "csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d", - root->objectid, btrfs_ino(inode), + root->root_key.objectid, btrfs_ino(inode), logical_start, csum, csum_expected, mirror_num); } diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 833cf3c35b4d..2e43fba44035 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1594,6 +1594,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, { unsigned int num_pages; unsigned int i; + size_t size; u64 dev_bytenr; int ret; @@ -1608,9 +1609,8 @@ static int btrfsic_read_block(struct btrfsic_state *state, num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> PAGE_SHIFT; - block_ctx->mem_to_free = kcalloc(sizeof(*block_ctx->datav) + - sizeof(*block_ctx->pagev), - num_pages, GFP_NOFS); + size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev); + block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS); if (!block_ctx->mem_to_free) return -ENOMEM; block_ctx->datav = block_ctx->mem_to_free; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 9bfa66592aa7..8703ce68fe9d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -528,7 +528,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *tree; struct extent_map_tree *em_tree; struct compressed_bio *cb; unsigned long compressed_len; @@ -545,7 +544,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int faili = 0; u32 *sums; - tree = &BTRFS_I(inode)->io_tree; em_tree = &BTRFS_I(inode)->extent_tree; /* we need the actual starting offset of this extent in the file */ diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d436fb4c002e..2ee43b6a4f09 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -52,42 +52,6 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) } } -/* - * reset all the locked nodes in the patch to spinning locks. - * - * held is used to keep lockdep happy, when lockdep is enabled - * we set held to a blocking lock before we go around and - * retake all the spinlocks in the path. You can safely use NULL - * for held - */ -noinline void btrfs_clear_path_blocking(struct btrfs_path *p, - struct extent_buffer *held, int held_rw) -{ - int i; - - if (held) { - btrfs_set_lock_blocking_rw(held, held_rw); - if (held_rw == BTRFS_WRITE_LOCK) - held_rw = BTRFS_WRITE_LOCK_BLOCKING; - else if (held_rw == BTRFS_READ_LOCK) - held_rw = BTRFS_READ_LOCK_BLOCKING; - } - btrfs_set_path_blocking(p); - - for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { - if (p->nodes[i] && p->locks[i]) { - btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]); - if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING) - p->locks[i] = BTRFS_WRITE_LOCK; - else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING) - p->locks[i] = BTRFS_READ_LOCK; - } - } - - if (held) - btrfs_clear_lock_blocking_rw(held, held_rw); -} - /* this also releases the path */ void btrfs_free_path(struct btrfs_path *p) { @@ -207,7 +171,7 @@ static void add_root_to_dirty_list(struct btrfs_root *root) spin_lock(&fs_info->trans_lock); if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { /* Want the extent tree to be the last on the list */ - if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID) + if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID) list_move_tail(&root->dirty_list, &fs_info->dirty_cowonly_roots); else @@ -1306,7 +1270,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, } } - btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK); btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); @@ -1815,8 +1778,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, int orig_slot = path->slots[level]; u64 orig_ptr; - if (level == 0) - return 0; + ASSERT(level > 0); mid = path->nodes[level]; @@ -2483,7 +2445,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(p); reada_for_balance(fs_info, p, level); sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL, 0); BUG_ON(sret > 0); if (sret) { @@ -2504,7 +2465,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(p); reada_for_balance(fs_info, p, level); sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL, 0); if (sret) { ret = sret; @@ -2789,7 +2749,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, } cow_done: p->nodes[level] = b; - btrfs_clear_path_blocking(p, NULL, 0); + /* + * Leave path with blocking locks to avoid massive + * lock context switch, this is made on purpose. + */ /* * we have a lock on b and as long as we aren't changing @@ -2871,8 +2834,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (!err) { btrfs_set_path_blocking(p); btrfs_tree_lock(b); - btrfs_clear_path_blocking(p, b, - BTRFS_WRITE_LOCK); } p->locks[level] = BTRFS_WRITE_LOCK; } else { @@ -2880,8 +2841,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (!err) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); - btrfs_clear_path_blocking(p, b, - BTRFS_READ_LOCK); } p->locks[level] = BTRFS_READ_LOCK; } @@ -2900,7 +2859,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_path_blocking(p); err = split_leaf(trans, root, key, p, ins_len, ret == 0); - btrfs_clear_path_blocking(p, NULL, 0); BUG_ON(err > 0); if (err) { @@ -2910,7 +2868,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, } if (!p->search_for_split) unlock_up(p, level, lowest_unlock, - min_write_lock_level, &write_lock_level); + min_write_lock_level, NULL); goto done; } } @@ -2961,13 +2919,16 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, again: b = get_old_root(root, time_seq); + if (!b) { + ret = -EIO; + goto done; + } level = btrfs_header_level(b); p->locks[level] = BTRFS_READ_LOCK; while (b) { level = btrfs_header_level(b); p->nodes[level] = b; - btrfs_clear_path_blocking(p, NULL, 0); /* * we have a lock on b and as long as we aren't changing @@ -3013,8 +2974,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, if (!err) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); - btrfs_clear_path_blocking(p, b, - BTRFS_READ_LOCK); } b = tree_mod_log_rewind(fs_info, p, b, time_seq); if (!b) { @@ -5198,7 +5157,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, path->locks[level - 1] = BTRFS_READ_LOCK; path->nodes[level - 1] = cur; unlock_up(path, level, 1, 0, NULL); - btrfs_clear_path_blocking(path, NULL, 0); } out: path->keep_locks = keep_locks; @@ -5783,8 +5741,6 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, if (!ret) { btrfs_set_path_blocking(path); btrfs_tree_read_lock(next); - btrfs_clear_path_blocking(path, next, - BTRFS_READ_LOCK); } next_rw_lock = BTRFS_READ_LOCK; } @@ -5820,8 +5776,6 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, if (!ret) { btrfs_set_path_blocking(path); btrfs_tree_read_lock(next); - btrfs_clear_path_blocking(path, next, - BTRFS_READ_LOCK); } next_rw_lock = BTRFS_READ_LOCK; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2cddfe7806a4..68ca41dbbef3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -41,12 +41,6 @@ extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; struct btrfs_ordered_sum; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -#define STATIC noinline -#else -#define STATIC static noinline -#endif - #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ #define BTRFS_MAX_MIRRORS 3 @@ -367,11 +361,13 @@ struct btrfs_dev_replace { struct mutex lock_finishing_cancel_unmount; rwlock_t lock; - atomic_t read_locks; atomic_t blocking_readers; wait_queue_head_t read_lock_wq; struct btrfs_scrub_progress scrub_progress; + + struct percpu_counter bio_counter; + wait_queue_head_t replace_wait; }; /* For raid type sysfs entries */ @@ -1094,9 +1090,6 @@ struct btrfs_fs_info { /* device replace state */ struct btrfs_dev_replace dev_replace; - struct percpu_counter bio_counter; - wait_queue_head_t replace_wait; - struct semaphore uuid_tree_rescan_sem; /* Used to reclaim the metadata space in the background. */ @@ -1202,18 +1195,12 @@ struct btrfs_root { int last_log_commit; pid_t log_start_pid; - u64 objectid; u64 last_trans; u32 type; u64 highest_objectid; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */ - u64 alloc_bytenr; -#endif - u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; @@ -1286,6 +1273,10 @@ struct btrfs_root { spinlock_t qgroup_meta_rsv_lock; u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + u64 alloc_bytenr; +#endif }; struct btrfs_file_private { @@ -2607,10 +2598,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); @@ -2771,7 +2760,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes, - int update_size); + bool update_size); int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *dest, u64 num_bytes, int min_factor); @@ -2877,8 +2866,6 @@ void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); void btrfs_set_path_blocking(struct btrfs_path *p); -void btrfs_clear_path_blocking(struct btrfs_path *p, - struct extent_buffer *held, int held_rw); void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3021,8 +3008,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const char *name, int name_len); -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *name, +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, @@ -3180,8 +3166,8 @@ void __cold btrfs_destroy_cachep(void); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root, int *was_new); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 end, int create); + struct page *page, size_t pg_offset, + u64 start, u64 end, int create); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); @@ -3201,9 +3187,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); extern const struct dentry_operations btrfs_dentry_operations; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode); -#endif /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3716,18 +3699,19 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_inode_set_ops(struct inode *inode); void btrfs_test_destroy_inode(struct inode *inode); -#endif static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, - &fs_info->fs_state))) - return 1; -#endif + return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); +} +#else +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +{ return 0; } +#endif static inline void cond_wake_up(struct wait_queue_head *wq) { diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f51b509f2d9b..c669f250d4a0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -42,8 +42,8 @@ static inline void btrfs_init_delayed_node( delayed_node->root = root; delayed_node->inode_id = inode_id; refcount_set(&delayed_node->refs, 0); - delayed_node->ins_root = RB_ROOT; - delayed_node->del_root = RB_ROOT; + delayed_node->ins_root = RB_ROOT_CACHED; + delayed_node->del_root = RB_ROOT_CACHED; mutex_init(&delayed_node->mutex); INIT_LIST_HEAD(&delayed_node->n_list); INIT_LIST_HEAD(&delayed_node->p_list); @@ -390,7 +390,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( struct btrfs_delayed_node *delayed_node, struct btrfs_key *key) { - return __btrfs_lookup_delayed_item(&delayed_node->ins_root, key, + return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key, NULL, NULL); } @@ -400,9 +400,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, { struct rb_node **p, *node; struct rb_node *parent_node = NULL; - struct rb_root *root; + struct rb_root_cached *root; struct btrfs_delayed_item *item; int cmp; + bool leftmost = true; if (action == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; @@ -410,7 +411,7 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, root = &delayed_node->del_root; else BUG(); - p = &root->rb_node; + p = &root->rb_root.rb_node; node = &ins->rb_node; while (*p) { @@ -419,16 +420,18 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, rb_node); cmp = btrfs_comp_cpu_keys(&item->key, &ins->key); - if (cmp < 0) + if (cmp < 0) { p = &(*p)->rb_right; - else if (cmp > 0) + leftmost = false; + } else if (cmp > 0) { p = &(*p)->rb_left; - else + } else { return -EEXIST; + } } rb_link_node(node, parent_node, p); - rb_insert_color(node, root); + rb_insert_color_cached(node, root, leftmost); ins->delayed_node = delayed_node; ins->ins_or_del = action; @@ -468,7 +471,7 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root) static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { - struct rb_root *root; + struct rb_root_cached *root; struct btrfs_delayed_root *delayed_root; delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; @@ -482,7 +485,7 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) else root = &delayed_item->delayed_node->del_root; - rb_erase(&delayed_item->rb_node, root); + rb_erase_cached(&delayed_item->rb_node, root); delayed_item->delayed_node->count--; finish_one_item(delayed_root); @@ -503,7 +506,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( struct rb_node *p; struct btrfs_delayed_item *item = NULL; - p = rb_first(&delayed_node->ins_root); + p = rb_first_cached(&delayed_node->ins_root); if (p) item = rb_entry(p, struct btrfs_delayed_item, rb_node); @@ -516,7 +519,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( struct rb_node *p; struct btrfs_delayed_item *item = NULL; - p = rb_first(&delayed_node->del_root); + p = rb_first_cached(&delayed_node->del_root); if (p) item = rb_entry(p, struct btrfs_delayed_item, rb_node); @@ -559,7 +562,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, * reserved space when starting a transaction. So no need to reserve * qgroup space here. */ - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, @@ -647,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata( return ret; } - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_inode", btrfs_ino(inode), num_bytes, 1); @@ -762,9 +765,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, i++; } - /* reset all the locked nodes in the patch to spinning locks. */ - btrfs_clear_path_blocking(path, NULL, 0); - /* insert the keys of the items */ setup_items_for_insert(root, path, keys, data_size, total_data_size, total_size, nitems); @@ -1462,7 +1462,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - name_len, name, delayed_node->root->objectid, + name_len, name, delayed_node->root->root_key.objectid, delayed_node->inode_id, ret); BUG(); } @@ -1533,7 +1533,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - index, node->root->objectid, node->inode_id, ret); + index, node->root->root_key.objectid, + node->inode_id, ret); BUG(); } mutex_unlock(&node->mutex); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 33536cd681d4..74ae226ffaf0 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -50,8 +50,8 @@ struct btrfs_delayed_node { * is waiting to be dealt with by the async worker. */ struct list_head p_list; - struct rb_root ins_root; - struct rb_root del_root; + struct rb_root_cached ins_root; + struct rb_root_cached del_root; struct mutex mutex; struct btrfs_inode_item inode_item; refcount_t refs; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 62ff545ba1f7..5149165b49a4 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -101,14 +101,15 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, } /* insert a new ref to head ref rbtree */ -static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, +static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root, struct rb_node *node) { - struct rb_node **p = &root->rb_node; + struct rb_node **p = &root->rb_root.rb_node; struct rb_node *parent_node = NULL; struct btrfs_delayed_ref_head *entry; struct btrfs_delayed_ref_head *ins; u64 bytenr; + bool leftmost = true; ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); bytenr = ins->bytenr; @@ -117,26 +118,29 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, href_node); - if (bytenr < entry->bytenr) + if (bytenr < entry->bytenr) { p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) + } else if (bytenr > entry->bytenr) { p = &(*p)->rb_right; - else + leftmost = false; + } else { return entry; + } } rb_link_node(node, parent_node, p); - rb_insert_color(node, root); + rb_insert_color_cached(node, root, leftmost); return NULL; } -static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root, +static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { - struct rb_node **p = &root->rb_node; + struct rb_node **p = &root->rb_root.rb_node; struct rb_node *node = &ins->ref_node; struct rb_node *parent_node = NULL; struct btrfs_delayed_ref_node *entry; + bool leftmost = true; while (*p) { int comp; @@ -145,16 +149,18 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root, entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, ref_node); comp = comp_refs(ins, entry, true); - if (comp < 0) + if (comp < 0) { p = &(*p)->rb_left; - else if (comp > 0) + } else if (comp > 0) { p = &(*p)->rb_right; - else + leftmost = false; + } else { return entry; + } } rb_link_node(node, parent_node, p); - rb_insert_color(node, root); + rb_insert_color_cached(node, root, leftmost); return NULL; } @@ -162,12 +168,14 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root, * find an head entry based on bytenr. This returns the delayed ref * head if it was able to find one, or NULL if nothing was in that spot. * If return_bigger is given, the next bigger entry is returned if no exact - * match is found. + * match is found. But if no bigger one is found then the first node of the + * ref head tree will be returned. */ -static struct btrfs_delayed_ref_head * -find_ref_head(struct rb_root *root, u64 bytenr, - int return_bigger) +static struct btrfs_delayed_ref_head* find_ref_head( + struct btrfs_delayed_ref_root *dr, u64 bytenr, + bool return_bigger) { + struct rb_root *root = &dr->href_root.rb_root; struct rb_node *n; struct btrfs_delayed_ref_head *entry; @@ -187,7 +195,7 @@ find_ref_head(struct rb_root *root, u64 bytenr, if (bytenr > entry->bytenr) { n = rb_next(&entry->href_node); if (!n) - n = rb_first(root); + n = rb_first_cached(&dr->href_root); entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); return entry; @@ -197,12 +205,9 @@ find_ref_head(struct rb_root *root, u64 bytenr, return NULL; } -int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, +int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - struct btrfs_delayed_ref_root *delayed_refs; - - delayed_refs = &trans->transaction->delayed_refs; lockdep_assert_held(&delayed_refs->lock); if (mutex_trylock(&head->mutex)) return 0; @@ -227,7 +232,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref) { lockdep_assert_held(&head->lock); - rb_erase(&ref->ref_node, &head->ref_tree); + rb_erase_cached(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); @@ -296,7 +301,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, lockdep_assert_held(&head->lock); - if (RB_EMPTY_ROOT(&head->ref_tree)) + if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) return; /* We don't have too many refs to merge for data. */ @@ -314,7 +319,8 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, spin_unlock(&fs_info->tree_mod_seq_lock); again: - for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { + for (node = rb_first_cached(&head->ref_tree); node; + node = rb_next(node)) { ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; @@ -345,24 +351,21 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) return ret; } -struct btrfs_delayed_ref_head * -btrfs_select_ref_head(struct btrfs_trans_handle *trans) +struct btrfs_delayed_ref_head *btrfs_select_ref_head( + struct btrfs_delayed_ref_root *delayed_refs) { - struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; u64 start; bool loop = false; - delayed_refs = &trans->transaction->delayed_refs; - again: start = delayed_refs->run_delayed_start; - head = find_ref_head(&delayed_refs->href_root, start, 1); + head = find_ref_head(delayed_refs, start, true); if (!head && !loop) { delayed_refs->run_delayed_start = 0; start = 0; loop = true; - head = find_ref_head(&delayed_refs->href_root, start, 1); + head = find_ref_head(delayed_refs, start, true); if (!head) return NULL; } else if (!head && loop) { @@ -569,7 +572,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; head_ref->is_system = is_system; - head_ref->ref_tree = RB_ROOT; + head_ref->ref_tree = RB_ROOT_CACHED; INIT_LIST_HEAD(&head_ref->ref_add_list); RB_CLEAR_NODE(&head_ref->href_node); head_ref->processing = 0; @@ -903,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) { - return find_ref_head(&delayed_refs->href_root, bytenr, 0); + return find_ref_head(delayed_refs, bytenr, false); } void __cold btrfs_delayed_ref_exit(void) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d9f2a4ebd5db..8e20c5cb5404 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -79,7 +79,7 @@ struct btrfs_delayed_ref_head { struct mutex mutex; spinlock_t lock; - struct rb_root ref_tree; + struct rb_root_cached ref_tree; /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */ struct list_head ref_add_list; @@ -148,7 +148,7 @@ struct btrfs_delayed_data_ref { struct btrfs_delayed_ref_root { /* head ref rbtree */ - struct rb_root href_root; + struct rb_root_cached href_root; /* dirty extent records */ struct rb_root dirty_extent_root; @@ -255,7 +255,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr); -int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, +int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { @@ -263,8 +263,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) } -struct btrfs_delayed_ref_head * -btrfs_select_ref_head(struct btrfs_trans_handle *trans); +struct btrfs_delayed_ref_head *btrfs_select_ref_head( + struct btrfs_delayed_ref_root *delayed_refs); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index dec01970d8c5..2aa48aecc52b 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -382,14 +382,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, return ret; } -void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) -{ - struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - - dev_replace->committed_cursor_left = - dev_replace->cursor_left_last_write_of_item; -} - static char* btrfs_dev_name(struct btrfs_device *device) { if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) @@ -408,11 +400,12 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, int ret; struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; + bool need_unlock; - ret = btrfs_find_device_by_devspec(fs_info, srcdevid, - srcdev_name, &src_device); - if (ret) - return ret; + src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, + srcdev_name); + if (IS_ERR(src_device)) + return PTR_ERR(src_device); ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, src_device, &tgt_device); @@ -432,6 +425,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return PTR_ERR(trans); } + need_unlock = true; btrfs_dev_replace_write_lock(dev_replace); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: @@ -440,6 +434,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + ASSERT(0); ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; goto leave; } @@ -470,6 +465,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_write_errors, 0); atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); btrfs_dev_replace_write_unlock(dev_replace); + need_unlock = false; ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) @@ -481,7 +477,12 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + need_unlock = true; btrfs_dev_replace_write_lock(dev_replace); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; goto leave; } @@ -503,9 +504,8 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret; leave: - dev_replace->srcdev = NULL; - dev_replace->tgtdev = NULL; - btrfs_dev_replace_write_unlock(dev_replace); + if (need_unlock) + btrfs_dev_replace_write_unlock(dev_replace); btrfs_destroy_dev_replace_tgtdev(tgt_device); return ret; } @@ -545,8 +545,8 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) { set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); - wait_event(fs_info->replace_wait, !percpu_counter_sum( - &fs_info->bio_counter)); + wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum( + &fs_info->dev_replace.bio_counter)); } /* @@ -555,7 +555,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) { clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); - wake_up(&fs_info->replace_wait); + wake_up(&fs_info->dev_replace.replace_wait); } static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, @@ -961,13 +961,10 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace) { read_lock(&dev_replace->lock); - atomic_inc(&dev_replace->read_locks); } void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace) { - ASSERT(atomic_read(&dev_replace->read_locks) > 0); - atomic_dec(&dev_replace->read_locks); read_unlock(&dev_replace->lock); } @@ -985,7 +982,6 @@ void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace) void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace) { - ASSERT(atomic_read(&dev_replace->blocking_readers) == 0); write_unlock(&dev_replace->lock); } @@ -994,45 +990,31 @@ void btrfs_dev_replace_set_lock_blocking( struct btrfs_dev_replace *dev_replace) { /* only set blocking for read lock */ - ASSERT(atomic_read(&dev_replace->read_locks) > 0); atomic_inc(&dev_replace->blocking_readers); read_unlock(&dev_replace->lock); } -/* acquire read lock and dec blocking cnt */ -void btrfs_dev_replace_clear_lock_blocking( - struct btrfs_dev_replace *dev_replace) -{ - /* only set blocking for read lock */ - ASSERT(atomic_read(&dev_replace->read_locks) > 0); - ASSERT(atomic_read(&dev_replace->blocking_readers) > 0); - read_lock(&dev_replace->lock); - /* Barrier implied by atomic_dec_and_test */ - if (atomic_dec_and_test(&dev_replace->blocking_readers)) - cond_wake_up_nomb(&dev_replace->read_lock_wq); -} - void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) { - percpu_counter_inc(&fs_info->bio_counter); + percpu_counter_inc(&fs_info->dev_replace.bio_counter); } void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) { - percpu_counter_sub(&fs_info->bio_counter, amount); - cond_wake_up_nomb(&fs_info->replace_wait); + percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount); + cond_wake_up_nomb(&fs_info->dev_replace.replace_wait); } void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) { while (1) { - percpu_counter_inc(&fs_info->bio_counter); + percpu_counter_inc(&fs_info->dev_replace.bio_counter); if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state))) break; btrfs_bio_counter_dec(fs_info); - wait_event(fs_info->replace_wait, + wait_event(fs_info->dev_replace.replace_wait, !test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)); } diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index b6d4206188bb..795c551f5b5e 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -11,7 +11,6 @@ struct btrfs_ioctl_dev_replace_args; int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, @@ -28,12 +27,5 @@ void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace); void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace); void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace); void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_clear_lock_blocking( - struct btrfs_dev_replace *dev_replace); - -static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) -{ - atomic64_inc(stat_value); -} #endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index a678b07fcf01..8de74d835dba 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -105,13 +105,13 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, * to use for the second index (if one is created). * Will return 0 or -ENOMEM */ -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, const char *name, int name_len, - struct btrfs_inode *dir, struct btrfs_key *location, - u8 type, u64 index) +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, + int name_len, struct btrfs_inode *dir, + struct btrfs_key *location, u8 type, u64 index) { int ret = 0; int ret2 = 0; + struct btrfs_root *root = dir->root; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *leaf; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 05dc3c17cb62..b0ab41da91d1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -125,8 +125,8 @@ struct async_submit_bio { * Different roots are used for different purposes and may nest inside each * other and they require separate keysets. As lockdep keys should be * static, assign keysets according to the purpose of the root as indicated - * by btrfs_root->objectid. This ensures that all special purpose roots - * have separate keysets. + * by btrfs_root->root_key.objectid. This ensures that all special purpose + * roots have separate keysets. * * Lock-nesting across peer nodes is always done with the immediate parent * node locked thus preventing deadlock. As lockdep doesn't know this, use @@ -1148,7 +1148,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->state = 0; root->orphan_cleanup_state = 0; - root->objectid = objectid; root->last_trans = 0; root->highest_objectid = 0; root->nr_delalloc_inodes = 0; @@ -2156,9 +2155,8 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) { mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); rwlock_init(&fs_info->dev_replace.lock); - atomic_set(&fs_info->dev_replace.read_locks, 0); atomic_set(&fs_info->dev_replace.blocking_readers, 0); - init_waitqueue_head(&fs_info->replace_wait); + init_waitqueue_head(&fs_info->dev_replace.replace_wait); init_waitqueue_head(&fs_info->dev_replace.read_lock_wq); } @@ -2648,7 +2646,8 @@ int open_ctree(struct super_block *sb, goto fail_dirty_metadata_bytes; } - ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL); + ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, + GFP_KERNEL); if (ret) { err = ret; goto fail_delalloc_bytes; @@ -3309,7 +3308,7 @@ int open_ctree(struct super_block *sb, iput(fs_info->btree_inode); fail_bio_counter: - percpu_counter_destroy(&fs_info->bio_counter); + percpu_counter_destroy(&fs_info->dev_replace.bio_counter); fail_delalloc_bytes: percpu_counter_destroy(&fs_info->delalloc_bytes); fail_dirty_metadata_bytes: @@ -3977,6 +3976,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); + ASSERT(list_empty(&fs_info->delayed_iputs)); set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); btrfs_free_qgroup_config(fs_info); @@ -4018,7 +4018,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); - percpu_counter_destroy(&fs_info->bio_counter); + percpu_counter_destroy(&fs_info->dev_replace.bio_counter); cleanup_srcu_struct(&fs_info->subvol_srcu); btrfs_free_stripe_hash_table(fs_info); @@ -4204,7 +4204,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } - while ((node = rb_first(&delayed_refs->href_root)) != NULL) { + while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; struct rb_node *n; bool pin_bytes = false; @@ -4222,11 +4222,11 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } spin_lock(&head->lock); - while ((n = rb_first(&head->ref_tree)) != NULL) { + while ((n = rb_first_cached(&head->ref_tree)) != NULL) { ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); ref->in_tree = 0; - rb_erase(&ref->ref_node, &head->ref_tree); + rb_erase_cached(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); @@ -4240,7 +4240,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (head->processing == 0) delayed_refs->num_heads_ready--; atomic_dec(&delayed_refs->num_entries); - rb_erase(&head->href_node, &delayed_refs->href_root); + rb_erase_cached(&head->href_node, &delayed_refs->href_root); RB_CLEAR_NODE(&head->href_node); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1f3755b3a37a..ddf28ecf17f9 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -33,7 +33,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, type = FILEID_BTRFS_WITHOUT_PARENT; fid->objectid = btrfs_ino(BTRFS_I(inode)); - fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid; fid->gen = inode->i_generation; if (parent) { @@ -41,7 +41,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, fid->parent_objectid = BTRFS_I(parent)->location.objectid; fid->parent_gen = parent->i_generation; - parent_root_id = BTRFS_I(parent)->root->objectid; + parent_root_id = BTRFS_I(parent)->root->root_key.objectid; if (parent_root_id != fid->root_objectid) { fid->parent_root_objectid = parent_root_id; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2d9074295d7f..a4cd0221bc8d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2374,7 +2374,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; - if (RB_EMPTY_ROOT(&head->ref_tree)) + if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) return NULL; /* @@ -2387,7 +2387,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) return list_first_entry(&head->ref_add_list, struct btrfs_delayed_ref_node, add_list); - ref = rb_entry(rb_first(&head->ref_tree), + ref = rb_entry(rb_first_cached(&head->ref_tree), struct btrfs_delayed_ref_node, ref_node); ASSERT(list_empty(&ref->add_list)); return ref; @@ -2448,13 +2448,13 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&head->lock); spin_lock(&delayed_refs->lock); spin_lock(&head->lock); - if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) { + if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) { spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); return 1; } delayed_refs->num_heads--; - rb_erase(&head->href_node, &delayed_refs->href_root); + rb_erase_cached(&head->href_node, &delayed_refs->href_root); RB_CLEAR_NODE(&head->href_node); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); @@ -2502,102 +2502,66 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, return 0; } -/* - * Returns 0 on success or if called with an already aborted transaction. - * Returns -ENOMEM or -EIO on failure and will abort the transaction. - */ -static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long nr) +static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( + struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_delayed_ref_head *head = NULL; + int ret; + + spin_lock(&delayed_refs->lock); + head = btrfs_select_ref_head(delayed_refs); + if (!head) { + spin_unlock(&delayed_refs->lock); + return head; + } + + /* + * Grab the lock that says we are going to process all the refs for + * this head + */ + ret = btrfs_delayed_ref_lock(delayed_refs, head); + spin_unlock(&delayed_refs->lock); + + /* + * We may have dropped the spin lock to get the head mutex lock, and + * that might have given someone else time to free the head. If that's + * true, it has been removed from our list and we can move on. + */ + if (ret == -EAGAIN) + head = ERR_PTR(-EAGAIN); + + return head; +} + +static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *locked_ref, + unsigned long *run_refs) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head *locked_ref = NULL; struct btrfs_delayed_extent_op *extent_op; - ktime_t start = ktime_get(); - int ret; - unsigned long count = 0; - unsigned long actual_count = 0; + struct btrfs_delayed_ref_node *ref; int must_insert_reserved = 0; + int ret; delayed_refs = &trans->transaction->delayed_refs; - while (1) { - if (!locked_ref) { - if (count >= nr) - break; - spin_lock(&delayed_refs->lock); - locked_ref = btrfs_select_ref_head(trans); - if (!locked_ref) { - spin_unlock(&delayed_refs->lock); - break; - } + lockdep_assert_held(&locked_ref->mutex); + lockdep_assert_held(&locked_ref->lock); - /* grab the lock that says we are going to process - * all the refs for this head */ - ret = btrfs_delayed_ref_lock(trans, locked_ref); - spin_unlock(&delayed_refs->lock); - /* - * we may have dropped the spin lock to get the head - * mutex lock, and that might have given someone else - * time to free the head. If that's true, it has been - * removed from our list and we can move on. - */ - if (ret == -EAGAIN) { - locked_ref = NULL; - count++; - continue; - } - } - - /* - * We need to try and merge add/drops of the same ref since we - * can run into issues with relocate dropping the implicit ref - * and then it being added back again before the drop can - * finish. If we merged anything we need to re-loop so we can - * get a good ref. - * Or we can get node references of the same type that weren't - * merged when created due to bumps in the tree mod seq, and - * we need to merge them to prevent adding an inline extent - * backref before dropping it (triggering a BUG_ON at - * insert_inline_extent_backref()). - */ - spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); - - ref = select_delayed_ref(locked_ref); - - if (ref && ref->seq && + while ((ref = select_delayed_ref(locked_ref))) { + if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); unselect_delayed_ref_head(delayed_refs, locked_ref); - locked_ref = NULL; - cond_resched(); - count++; - continue; + return -EAGAIN; } - /* - * We're done processing refs in this ref_head, clean everything - * up and move on to the next ref_head. - */ - if (!ref) { - ret = cleanup_ref_head(trans, locked_ref); - if (ret > 0 ) { - /* We dropped our lock, we need to loop. */ - ret = 0; - continue; - } else if (ret) { - return ret; - } - locked_ref = NULL; - count++; - continue; - } - - actual_count++; + (*run_refs)++; ref->in_tree = 0; - rb_erase(&ref->ref_node, &locked_ref->ref_tree); + rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); @@ -2619,8 +2583,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, atomic_dec(&delayed_refs->num_entries); /* - * Record the must-insert_reserved flag before we drop the spin - * lock. + * Record the must_insert_reserved flag before we drop the + * spin lock. */ must_insert_reserved = locked_ref->must_insert_reserved; locked_ref->must_insert_reserved = 0; @@ -2642,10 +2606,90 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } btrfs_put_delayed_ref(ref); - count++; cond_resched(); + + spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); } + return 0; +} + +/* + * Returns 0 on success or if called with an already aborted transaction. + * Returns -ENOMEM or -EIO on failure and will abort the transaction. + */ +static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + unsigned long nr) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *locked_ref = NULL; + ktime_t start = ktime_get(); + int ret; + unsigned long count = 0; + unsigned long actual_count = 0; + + delayed_refs = &trans->transaction->delayed_refs; + do { + if (!locked_ref) { + locked_ref = btrfs_obtain_ref_head(trans); + if (IS_ERR_OR_NULL(locked_ref)) { + if (PTR_ERR(locked_ref) == -EAGAIN) { + continue; + } else { + break; + } + } + count++; + } + /* + * We need to try and merge add/drops of the same ref since we + * can run into issues with relocate dropping the implicit ref + * and then it being added back again before the drop can + * finish. If we merged anything we need to re-loop so we can + * get a good ref. + * Or we can get node references of the same type that weren't + * merged when created due to bumps in the tree mod seq, and + * we need to merge them to prevent adding an inline extent + * backref before dropping it (triggering a BUG_ON at + * insert_inline_extent_backref()). + */ + spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + + ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, + &actual_count); + if (ret < 0 && ret != -EAGAIN) { + /* + * Error, btrfs_run_delayed_refs_for_head already + * unlocked everything so just bail out + */ + return ret; + } else if (!ret) { + /* + * Success, perform the usual cleanup of a processed + * head + */ + ret = cleanup_ref_head(trans, locked_ref); + if (ret > 0 ) { + /* We dropped our lock, we need to loop. */ + ret = 0; + continue; + } else if (ret) { + return ret; + } + } + + /* + * Either success case or btrfs_run_delayed_refs_for_head + * returned -EAGAIN, meaning we need to select another head + */ + + locked_ref = NULL; + cond_resched(); + } while ((nr != -1 && count < nr) || locked_ref); + /* * We don't want to include ref heads since we can have empty ref heads * and those will drastically skew our runtime down since we just do @@ -2745,9 +2789,9 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) return num_csums; } -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_rsv *global_rsv; u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; @@ -2782,8 +2826,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, return ret; } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) { u64 num_entries = atomic_read(&trans->transaction->delayed_refs.num_entries); @@ -2791,14 +2834,14 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, u64 val; smp_mb(); - avg_runtime = fs_info->avg_delayed_ref_runtime; + avg_runtime = trans->fs_info->avg_delayed_ref_runtime; val = num_entries * avg_runtime; if (val >= NSEC_PER_SEC) return 1; if (val >= NSEC_PER_SEC / 2) return 2; - return btrfs_check_space_for_delayed_refs(trans, fs_info); + return btrfs_check_space_for_delayed_refs(trans); } struct async_delayed_refs { @@ -2940,7 +2983,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); - node = rb_first(&delayed_refs->href_root); + node = rb_first_cached(&delayed_refs->href_root); if (!node) { spin_unlock(&delayed_refs->lock); goto out; @@ -3040,7 +3083,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root, * XXX: We should replace this with a proper search function in the * future. */ - for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { + for (node = rb_first_cached(&head->ref_tree); node; + node = rb_next(node)) { ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { @@ -3139,7 +3183,6 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, { struct btrfs_path *path; int ret; - int ret2; path = btrfs_alloc_path(); if (!path) @@ -3151,17 +3194,9 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, if (ret && ret != -ENOENT) goto out; - ret2 = check_delayed_ref(root, path, objectid, - offset, bytenr); - } while (ret2 == -EAGAIN); + ret = check_delayed_ref(root, path, objectid, offset, bytenr); + } while (ret == -EAGAIN); - if (ret2 && ret2 != -ENOENT) { - ret = ret2; - goto out; - } - - if (ret != -ENOENT || ret2 != -ENOENT) - ret = 0; out: btrfs_free_path(path); if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) @@ -5284,7 +5319,7 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, } static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int update_size) + u64 num_bytes, bool update_size) { spin_lock(&block_rsv->lock); block_rsv->reserved += num_bytes; @@ -5316,7 +5351,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, global_rsv->full = 0; spin_unlock(&global_rsv->lock); - block_rsv_add_bytes(dest, num_bytes, 1); + block_rsv_add_bytes(dest, num_bytes, true); return 0; } @@ -5479,7 +5514,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, struct btrfs_block_rsv *dst, u64 num_bytes, - int update_size) + bool update_size) { int ret; @@ -5539,10 +5574,8 @@ int btrfs_block_rsv_add(struct btrfs_root *root, return 0; ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 1); - return 0; - } + if (!ret) + block_rsv_add_bytes(block_rsv, num_bytes, true); return ret; } @@ -5587,7 +5620,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 0); + block_rsv_add_bytes(block_rsv, num_bytes, false); return 0; } @@ -5629,7 +5662,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, return ret; ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 0); + block_rsv_add_bytes(block_rsv, num_bytes, false); trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), num_bytes, 1); @@ -5835,7 +5868,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, BTRFS_RESERVE_FLUSH_ALL); if (ret == -ENOSPC && use_global_rsv) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); if (ret && qgroup_num_bytes) btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); @@ -6399,10 +6432,6 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, } else { cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; - - trace_btrfs_space_reservation(cache->fs_info, - "space_info", space_info->flags, - ram_bytes, 0); space_info->bytes_may_use -= ram_bytes; if (delalloc) cache->delalloc_bytes += num_bytes; @@ -6424,11 +6453,10 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, * reserve set to 0 in order to clear the reservation. */ -static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int delalloc) +static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; - int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); @@ -6441,7 +6469,6 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); spin_unlock(&space_info->lock); - return ret; } void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) { @@ -6925,7 +6952,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out_delayed_unlock; spin_lock(&head->lock); - if (!RB_EMPTY_ROOT(&head->ref_tree)) + if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) goto out; if (head->extent_op) { @@ -6946,7 +6973,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, * at this point we have a head with no other entries. Go * ahead and process it. */ - rb_erase(&head->href_node, &delayed_refs->href_root); + rb_erase_cached(&head->href_node, &delayed_refs->href_root); RB_CLEAR_NODE(&head->href_node); atomic_dec(&delayed_refs->num_entries); @@ -8119,6 +8146,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (IS_ERR(buf)) return buf; + /* + * Extra safety check in case the extent tree is corrupted and extent + * allocator chooses to use a tree block which is already used and + * locked. + */ + if (buf->lock_owner == current->pid) { + btrfs_err_rl(fs_info, +"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", + buf->start, btrfs_header_owner(buf), current->pid); + free_extent_buffer(buf); + return ERR_PTR(-EUCLEAN); + } + btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); clean_tree_block(fs_info, buf); @@ -8215,7 +8255,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, static void unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u32 blocksize) { - block_rsv_add_bytes(block_rsv, blocksize, 0); + block_rsv_add_bytes(block_rsv, blocksize, false); block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL); } @@ -8642,7 +8682,13 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, parent = 0; } - if (need_account) { + /* + * Reloc tree doesn't contribute to qgroup numbers, and we have + * already accounted them at merge time (replace_path), + * thus we could skip expensive subtree trace here. + */ + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + need_account) { ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1); if (ret) { @@ -8763,15 +8809,14 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (eb == root->node) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = eb->start; - else - BUG_ON(root->root_key.objectid != - btrfs_header_owner(eb)); + else if (root->root_key.objectid != btrfs_header_owner(eb)) + goto owner_mismatch; } else { if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = path->nodes[level + 1]->start; - else - BUG_ON(root->root_key.objectid != - btrfs_header_owner(path->nodes[level + 1])); + else if (root->root_key.objectid != + btrfs_header_owner(path->nodes[level + 1])) + goto owner_mismatch; } btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); @@ -8779,6 +8824,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, wc->refs[level] = 0; wc->flags[level] = 0; return 0; + +owner_mismatch: + btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", + btrfs_header_owner(eb), root->root_key.objectid); + return -EUCLEAN; } static noinline int walk_down_tree(struct btrfs_trans_handle *trans, @@ -8832,6 +8882,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, ret = walk_up_proc(trans, root, path, wc); if (ret > 0) return 0; + if (ret < 0) + return ret; if (path->locks[level]) { btrfs_tree_unlock_rw(path->nodes[level], @@ -8875,7 +8927,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int level; bool root_dropped = false; - btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid); + btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); path = btrfs_alloc_path(); if (!path) { @@ -9613,6 +9665,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) block_group = btrfs_lookup_first_block_group(info, last); while (block_group) { + wait_block_group_cache_done(block_group); spin_lock(&block_group->lock); if (block_group->iref) break; @@ -10074,7 +10127,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *block_group, *tmp; + struct btrfs_block_group_cache *block_group; struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_block_group_item item; struct btrfs_key key; @@ -10082,7 +10135,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) bool can_flush_pending_bgs = trans->can_flush_pending_bgs; trans->can_flush_pending_bgs = false; - list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { + while (!list_empty(&trans->new_bgs)) { + block_group = list_first_entry(&trans->new_bgs, + struct btrfs_block_group_cache, + bg_list); if (ret) goto next; @@ -10753,14 +10809,16 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, * We don't want a transaction for this since the discard may take a * substantial amount of time. We don't require that a transaction be * running, but we do need to take a running transaction into account - * to ensure that we're not discarding chunks that were released in - * the current transaction. + * to ensure that we're not discarding chunks that were released or + * allocated in the current transaction. * * Holding the chunks lock will prevent other threads from allocating * or releasing chunks, but it won't prevent a running transaction * from committing and releasing the memory that the pending chunks * list head uses. For that, we need to take a reference to the - * transaction. + * transaction and hold the commit root sem. We only need to hold + * it while performing the free space search since we have already + * held back allocations. */ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 minlen, u64 *trimmed) @@ -10770,6 +10828,10 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, *trimmed = 0; + /* Discard not supported = nothing to do. */ + if (!blk_queue_discard(bdev_get_queue(device->bdev))) + return 0; + /* Not writeable = nothing to do. */ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return 0; @@ -10787,9 +10849,13 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, ret = mutex_lock_interruptible(&fs_info->chunk_mutex); if (ret) - return ret; + break; - down_read(&fs_info->commit_root_sem); + ret = down_read_killable(&fs_info->commit_root_sem); + if (ret) { + mutex_unlock(&fs_info->chunk_mutex); + break; + } spin_lock(&fs_info->trans_lock); trans = fs_info->running_transaction; @@ -10797,13 +10863,17 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); + if (!trans) + up_read(&fs_info->commit_root_sem); + ret = find_free_dev_extent_start(trans, device, minlen, start, &start, &len); - if (trans) + if (trans) { + up_read(&fs_info->commit_root_sem); btrfs_put_transaction(trans); + } if (ret) { - up_read(&fs_info->commit_root_sem); mutex_unlock(&fs_info->chunk_mutex); if (ret == -ENOSPC) ret = 0; @@ -10811,7 +10881,6 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, } ret = btrfs_issue_discard(device->bdev, start, len, &bytes); - up_read(&fs_info->commit_root_sem); mutex_unlock(&fs_info->chunk_mutex); if (ret) @@ -10831,6 +10900,15 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, return ret; } +/* + * Trim the whole filesystem by: + * 1) trimming the free space in each block group + * 2) trimming the unallocated space on each device + * + * This will also continue trimming even if a block group or device encounters + * an error. The return value will be the last error, or 0 if nothing bad + * happens. + */ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) { struct btrfs_block_group_cache *cache = NULL; @@ -10840,18 +10918,14 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) u64 start; u64 end; u64 trimmed = 0; - u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); + u64 bg_failed = 0; + u64 dev_failed = 0; + int bg_ret = 0; + int dev_ret = 0; int ret = 0; - /* - * try to trim all FS space, our block group may start from non-zero. - */ - if (range->len == total_bytes) - cache = btrfs_lookup_first_block_group(fs_info, range->start); - else - cache = btrfs_lookup_block_group(fs_info, range->start); - - while (cache) { + cache = btrfs_lookup_first_block_group(fs_info, range->start); + for (; cache; cache = next_block_group(fs_info, cache)) { if (cache->key.objectid >= (range->start + range->len)) { btrfs_put_block_group(cache); break; @@ -10865,13 +10939,15 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) if (!block_group_cache_done(cache)) { ret = cache_block_group(cache, 0); if (ret) { - btrfs_put_block_group(cache); - break; + bg_failed++; + bg_ret = ret; + continue; } ret = wait_block_group_cache_done(cache); if (ret) { - btrfs_put_block_group(cache); - break; + bg_failed++; + bg_ret = ret; + continue; } } ret = btrfs_trim_block_group(cache, @@ -10882,28 +10958,40 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) trimmed += group_trimmed; if (ret) { - btrfs_put_block_group(cache); - break; + bg_failed++; + bg_ret = ret; + continue; } } - - cache = next_block_group(fs_info, cache); } + if (bg_failed) + btrfs_warn(fs_info, + "failed to trim %llu block group(s), last error %d", + bg_failed, bg_ret); mutex_lock(&fs_info->fs_devices->device_list_mutex); - devices = &fs_info->fs_devices->alloc_list; - list_for_each_entry(device, devices, dev_alloc_list) { + devices = &fs_info->fs_devices->devices; + list_for_each_entry(device, devices, dev_list) { ret = btrfs_trim_free_extents(device, range->minlen, &group_trimmed); - if (ret) + if (ret) { + dev_failed++; + dev_ret = ret; break; + } trimmed += group_trimmed; } mutex_unlock(&fs_info->fs_devices->device_list_mutex); + if (dev_failed) + btrfs_warn(fs_info, + "failed to trim %llu device(s), last error %d", + dev_failed, dev_ret); range->len = trimmed; - return ret; + if (bg_ret) + return bg_ret; + return dev_ret; } /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dd6faab02bb..6877a74c7469 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1424,20 +1424,15 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, struct extent_state **cached_state) { struct extent_state *state; - struct rb_node *n; int ret = 1; spin_lock(&tree->lock); if (cached_state && *cached_state) { state = *cached_state; if (state->end == start - 1 && extent_state_in_tree(state)) { - n = rb_next(&state->rb_node); - while (n) { - state = rb_entry(n, struct extent_state, - rb_node); + while ((state = next_state(state)) != NULL) { if (state->state & bits) goto got_it; - n = rb_next(n); } free_extent_state(*cached_state); *cached_state = NULL; @@ -1568,7 +1563,7 @@ static noinline int lock_delalloc_pages(struct inode *inode, * * 1 is returned if we find something, 0 if nothing was in the tree */ -STATIC u64 find_lock_delalloc_range(struct inode *inode, +static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end, u64 max_bytes) @@ -1648,6 +1643,17 @@ STATIC u64 find_lock_delalloc_range(struct inode *inode, return found; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +u64 btrfs_find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end, u64 max_bytes) +{ + return find_lock_delalloc_range(inode, tree, locked_page, start, end, + max_bytes); +} +#endif + static int __process_pages_contig(struct address_space *mapping, struct page *locked_page, pgoff_t start_index, pgoff_t end_index, @@ -5165,11 +5171,11 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); } -int set_extent_buffer_dirty(struct extent_buffer *eb) +bool set_extent_buffer_dirty(struct extent_buffer *eb) { int i; int num_pages; - int was_dirty = 0; + bool was_dirty; check_buffer_tree_ref(eb); @@ -5179,8 +5185,15 @@ int set_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + if (!was_dirty) + for (i = 0; i < num_pages; i++) + set_page_dirty(eb->pages[i]); + +#ifdef CONFIG_BTRFS_DEBUG for (i = 0; i < num_pages; i++) - set_page_dirty(eb->pages[i]); + ASSERT(PageDirty(eb->pages[i])); +#endif + return was_dirty; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b4d03e677e1d..369daa5d4f73 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -479,7 +479,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); void clear_extent_buffer_dirty(struct extent_buffer *eb); -int set_extent_buffer_dirty(struct extent_buffer *eb); +bool set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_under_io(struct extent_buffer *eb); @@ -546,7 +546,7 @@ int free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -noinline u64 find_lock_delalloc_range(struct inode *inode, +u64 btrfs_find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end, u64 max_bytes); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6648d55e5339..7eea8b6e2cd3 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -34,7 +34,7 @@ void __cold extent_map_exit(void) */ void extent_map_tree_init(struct extent_map_tree *tree) { - tree->map = RB_ROOT; + tree->map = RB_ROOT_CACHED; INIT_LIST_HEAD(&tree->modified_extents); rwlock_init(&tree->lock); } @@ -90,24 +90,27 @@ static u64 range_end(u64 start, u64 len) return start + len; } -static int tree_insert(struct rb_root *root, struct extent_map *em) +static int tree_insert(struct rb_root_cached *root, struct extent_map *em) { - struct rb_node **p = &root->rb_node; + struct rb_node **p = &root->rb_root.rb_node; struct rb_node *parent = NULL; struct extent_map *entry = NULL; struct rb_node *orig_parent = NULL; u64 end = range_end(em->start, em->len); + bool leftmost = true; while (*p) { parent = *p; entry = rb_entry(parent, struct extent_map, rb_node); - if (em->start < entry->start) + if (em->start < entry->start) { p = &(*p)->rb_left; - else if (em->start >= extent_map_end(entry)) + } else if (em->start >= extent_map_end(entry)) { p = &(*p)->rb_right; - else + leftmost = false; + } else { return -EEXIST; + } } orig_parent = parent; @@ -130,7 +133,7 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) return -EEXIST; rb_link_node(&em->rb_node, orig_parent, p); - rb_insert_color(&em->rb_node, root); + rb_insert_color_cached(&em->rb_node, root, leftmost); return 0; } @@ -242,7 +245,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); - rb_erase(&merge->rb_node, &tree->map); + rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); free_extent_map(merge); } @@ -254,7 +257,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) if (rb && mergable_maps(em, merge)) { em->len += merge->len; em->block_len += merge->block_len; - rb_erase(&merge->rb_node, &tree->map); + rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); @@ -367,7 +370,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree, struct rb_node *next = NULL; u64 end = range_end(start, len); - rb_node = __tree_search(&tree->map, start, &prev, &next); + rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next); if (!rb_node) { if (prev) rb_node = prev; @@ -428,16 +431,13 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, * Removes @em from @tree. No reference counts are dropped, and no checks * are done to see if the range is in use */ -int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { - int ret = 0; - WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - rb_erase(&em->rb_node, &tree->map); + rb_erase_cached(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) list_del_init(&em->list); RB_CLEAR_NODE(&em->rb_node); - return ret; } void replace_extent_mapping(struct extent_map_tree *tree, @@ -449,7 +449,7 @@ void replace_extent_mapping(struct extent_map_tree *tree, ASSERT(extent_map_in_tree(cur)); if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) list_del_init(&cur->list); - rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map); + rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map); RB_CLEAR_NODE(&cur->rb_node); setup_extent_mapping(tree, new, modified); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 25d985e7532a..31977ffd6190 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -49,7 +49,7 @@ struct extent_map { }; struct extent_map_tree { - struct rb_root map; + struct rb_root_cached map; struct list_head modified_extents; rwlock_t lock; }; @@ -78,7 +78,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); -int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); +void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); void replace_extent_mapping(struct extent_map_tree *tree, struct extent_map *cur, struct extent_map *new, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2be00e873e92..15b925142793 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -531,6 +531,14 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, end_of_last_block = start_pos + num_bytes - 1; + /* + * The pages may have already been dirty, clear out old accounting so + * we can set things up properly + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached); + if (!btrfs_is_free_space_inode(BTRFS_I(inode))) { if (start_pos >= isize && !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) { @@ -1500,18 +1508,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } if (ordered) btrfs_put_ordered_extent(ordered); - clear_extent_bit(&inode->io_tree, start_pos, last_pos, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, cached_state); + *lockstart = start_pos; *lockend = last_pos; ret = 1; } + /* + * It's possible the pages are dirty right now, but we don't want + * to clean them yet because copy_from_user may catch a page fault + * and we might have to fall back to one page at a time. If that + * happens, we'll unlock these pages and we'd have a window where + * reclaim could sneak in and drop the once-dirty page on the floor + * without writing it. + * + * We have the pages locked and the extent range locked, so there's + * no way someone can start IO on any dirty pages in this range. + * + * We'll call btrfs_dirty_pages() later on, and that will flip around + * delalloc bits and dirty the pages as required. + */ for (i = 0; i < num_pages; i++) { - if (clear_page_dirty_for_io(pages[i])) - account_page_redirty(pages[i]); set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } @@ -2544,7 +2561,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, - min_size, 0); + min_size, false); BUG_ON(ret); trans->block_rsv = rsv; @@ -2594,7 +2611,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, 0); + rsv, min_size, false); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0adf38b00fa0..67441219d6c9 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@ -47,6 +48,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header *header; struct extent_buffer *leaf; struct inode *inode = NULL; + unsigned nofs_flag; int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -68,7 +70,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, btrfs_disk_key_to_cpu(&location, &disk_key); btrfs_release_path(path); + /* + * We are often under a trans handle at this point, so we need to make + * sure NOFS is set to keep us from deadlocking. + */ + nofs_flag = memalloc_nofs_save(); inode = btrfs_iget(fs_info->sb, &location, root, NULL); + memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) return inode; @@ -1679,6 +1687,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, bitmap_clear(info->bitmap, start, count); info->bytes -= bytes; + if (info->max_extent_size > ctl->unit) + info->max_extent_size = 0; } static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, @@ -2110,8 +2120,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, out: if (info) { - if (info->bitmap) - kfree(info->bitmap); + kfree(info->bitmap); kmem_cache_free(btrfs_free_space_cachep, info); } @@ -3601,8 +3610,7 @@ int test_add_free_space_entry(struct btrfs_block_group_cache *cache, if (info) kmem_cache_free(btrfs_free_space_cachep, info); - if (map) - kfree(map); + kfree(map); return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3ea5339603cf..f22f77172c5f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -64,7 +64,6 @@ static const struct inode_operations btrfs_dir_ro_inode_operations; static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; -static const struct address_space_operations btrfs_symlink_aops; static const struct file_operations btrfs_dir_file_operations; static const struct extent_io_ops btrfs_extent_io_ops; @@ -2750,12 +2749,9 @@ static void relink_file_extents(struct new_sa_defrag_extent *new) struct btrfs_path *path; struct sa_defrag_extent_backref *backref; struct sa_defrag_extent_backref *prev = NULL; - struct inode *inode; struct rb_node *node; int ret; - inode = new->inode; - path = btrfs_alloc_path(); if (!path) return; @@ -3471,8 +3467,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* this will do delete_inode and everything for us */ iput(inode); - if (ret) - goto out; } /* release the path since we're done with it */ btrfs_release_path(path); @@ -3738,7 +3732,7 @@ static int btrfs_read_locked_inode(struct inode *inode) case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->a_ops = &btrfs_aops; break; default: inode->i_op = &btrfs_special_inode_operations; @@ -3910,12 +3904,8 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, path->leave_spinning = 1; di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto err; - } - if (!di) { - ret = -ENOENT; + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; goto err; } leaf = path->nodes[0]; @@ -4075,10 +4065,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); if (IS_ERR_OR_NULL(di)) { - if (!di) - ret = -ENOENT; - else - ret = PTR_ERR(di); + ret = di ? PTR_ERR(di) : -ENOENT; goto out; } @@ -4270,18 +4257,17 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) * again is not run concurrently. */ spin_lock(&dest->root_item_lock); - root_flags = btrfs_root_flags(&dest->root_item); - if (dest->send_in_progress == 0) { - btrfs_set_root_flags(&dest->root_item, - root_flags | BTRFS_ROOT_SUBVOL_DEAD); - spin_unlock(&dest->root_item_lock); - } else { + if (dest->send_in_progress) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", dest->root_key.objectid); return -EPERM; } + root_flags = btrfs_root_flags(&dest->root_item); + btrfs_set_root_flags(&dest->root_item, + root_flags | BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); down_write(&fs_info->subvol_sem); @@ -4727,7 +4713,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); break; } - if (btrfs_should_throttle_delayed_refs(trans, fs_info)) + if (btrfs_should_throttle_delayed_refs(trans)) btrfs_async_run_delayed_refs(fs_info, trans->delayed_ref_updates * 2, trans->transid, 0); @@ -4736,8 +4722,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, extent_num_bytes)) { should_end = true; } - if (btrfs_should_throttle_delayed_refs(trans, - fs_info)) + if (btrfs_should_throttle_delayed_refs(trans)) should_throttle = true; } } @@ -5235,10 +5220,10 @@ static void evict_inode_truncate_pages(struct inode *inode) truncate_inode_pages_final(&inode->i_data); write_lock(&map_tree->lock); - while (!RB_EMPTY_ROOT(&map_tree->map)) { + while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) { struct extent_map *em; - node = rb_first(&map_tree->map); + node = rb_first_cached(&map_tree->map); em = rb_entry(node, struct extent_map, rb_node); clear_bit(EXTENT_FLAG_PINNED, &em->flags); clear_bit(EXTENT_FLAG_LOGGING, &em->flags); @@ -5306,8 +5291,7 @@ static void evict_inode_truncate_pages(struct inode *inode) } static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, - struct btrfs_block_rsv *rsv, - u64 min_size) + struct btrfs_block_rsv *rsv) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; @@ -5317,7 +5301,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, struct btrfs_trans_handle *trans; int ret; - ret = btrfs_block_rsv_refill(root, rsv, min_size, + ret = btrfs_block_rsv_refill(root, rsv, rsv->size, BTRFS_RESERVE_FLUSH_LIMIT); if (ret && ++failures > 2) { @@ -5334,8 +5318,8 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, * Try to steal from the global reserve if there is space for * it. */ - if (!btrfs_check_space_for_delayed_refs(trans, fs_info) && - !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0)) + if (!btrfs_check_space_for_delayed_refs(trans) && + !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false)) return trans; /* If not, commit and try again. */ @@ -5351,7 +5335,6 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; - u64 min_size; int ret; trace_btrfs_inode_evict(inode); @@ -5361,8 +5344,6 @@ void btrfs_evict_inode(struct inode *inode) return; } - min_size = btrfs_calc_trunc_metadata_size(fs_info, 1); - evict_inode_truncate_pages(inode); if (inode->i_nlink && @@ -5373,9 +5354,6 @@ void btrfs_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; - /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ - if (!special_file(inode->i_mode)) - btrfs_wait_ordered_range(inode, 0, (u64)-1); btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); @@ -5395,13 +5373,13 @@ void btrfs_evict_inode(struct inode *inode) rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) goto no_delete; - rsv->size = min_size; + rsv->size = btrfs_calc_trunc_metadata_size(fs_info, 1); rsv->failfast = 1; btrfs_i_size_write(BTRFS_I(inode), 0); while (1) { - trans = evict_refill_and_join(root, rsv, min_size); + trans = evict_refill_and_join(root, rsv); if (IS_ERR(trans)) goto free_rsv; @@ -5426,7 +5404,7 @@ void btrfs_evict_inode(struct inode *inode) * If it turns out that we are dropping too many of these, we might want * to add a mechanism for retrying these after a commit. */ - trans = evict_refill_and_join(root, rsv, min_size); + trans = evict_refill_and_join(root, rsv); if (!IS_ERR(trans)) { trans->block_rsv = rsv; btrfs_orphan_del(trans, BTRFS_I(inode)); @@ -5471,12 +5449,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), name, namelen, 0); - if (!di) { - ret = -ENOENT; - goto out; - } - if (IS_ERR(di)) { - ret = PTR_ERR(di); + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; goto out; } @@ -6390,8 +6364,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = btrfs_insert_dir_item(trans, root, name, name_len, - parent_inode, &key, + ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key, btrfs_inode_type(&inode->vfs_inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; @@ -6584,7 +6557,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ - if (root->objectid != BTRFS_I(inode)->root->objectid) + if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) return -EXDEV; if (inode->i_nlink >= BTRFS_LINK_MAX) @@ -6777,9 +6750,9 @@ static noinline int uncompress_inline(struct btrfs_path *path, * This also copies inline extents directly into the page. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) + struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret; @@ -6823,19 +6796,21 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, em->len = (u64)-1; em->block_len = (u64)-1; + path = btrfs_alloc_path(); if (!path) { - path = btrfs_alloc_path(); - if (!path) { - err = -ENOMEM; - goto out; - } - /* - * Chances are we'll be called again, so go ahead and do - * readahead - */ - path->reada = READA_FORWARD; + err = -ENOMEM; + goto out; } + /* Chances are we'll be called again, so go ahead and do readahead */ + path->reada = READA_FORWARD; + + /* + * Unless we're going to uncompress the inline extent, no sleep would + * happen. + */ + path->leave_spinning = 1; + ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { err = ret; @@ -6938,6 +6913,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, em->orig_block_len = em->len; em->orig_start = em->start; ptr = btrfs_file_extent_inline_start(item) + extent_offset; + + btrfs_set_path_blocking(path); if (!PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { @@ -6985,10 +6962,10 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); out: + btrfs_free_path(path); trace_btrfs_get_extent(root, inode, em); - btrfs_free_path(path); if (err) { free_extent_map(em); return ERR_PTR(err); @@ -9021,7 +8998,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) /* Migrate the slack space for the truncate to our reserve */ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, - min_size, 0); + min_size, false); BUG_ON(ret); /* @@ -9058,7 +9035,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) btrfs_block_rsv_release(fs_info, rsv, -1); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, 0); + rsv, min_size, false); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; } @@ -10191,7 +10168,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->a_ops = &btrfs_aops; inode_set_bytes(inode, name_len); btrfs_i_size_write(BTRFS_I(inode), name_len); err = btrfs_update_inode(trans, root, inode); @@ -10567,13 +10544,6 @@ static const struct address_space_operations btrfs_aops = { .error_remove_page = generic_error_remove_page, }; -static const struct address_space_operations btrfs_symlink_aops = { - .readpage = btrfs_readpage, - .writepage = btrfs_writepage, - .invalidatepage = btrfs_invalidatepage, - .releasepage = btrfs_releasepage, -}; - static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d60b6caf09e8..a990a9045139 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -491,7 +491,6 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; - u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -515,11 +514,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) return -EOPNOTSUPP; if (copy_from_user(&range, arg, sizeof(range))) return -EFAULT; - if (range.start > total_bytes || - range.len < fs_info->sb->s_blocksize) + + /* + * NOTE: Don't truncate the range using super->total_bytes. Bytenr of + * block group is in the logical address space, which can be any + * sectorsize aligned bytenr in the range [0, U64_MAX]. + */ + if (range.len < fs_info->sb->s_blocksize) return -EINVAL; - range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(fs_info, &range); if (ret < 0) @@ -686,8 +689,7 @@ static noinline int create_subvol(struct inode *dir, goto fail; } - ret = btrfs_insert_dir_item(trans, root, - name, namelen, BTRFS_I(dir), &key, + ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key, BTRFS_FT_DIR, index); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1324,7 +1326,7 @@ static int cluster_pages_for_defrag(struct inode *inode, if (i_done != page_cnt) { spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; + btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, @@ -4393,7 +4395,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } - if (!is_fstree(new_root->objectid)) { + if (!is_fstree(new_root->root_key.objectid)) { ret = -ENOENT; goto out; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d4917c0cddf5..45868fd76209 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1416,13 +1416,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) if (!qgroup) { ret = -ENOENT; goto out; - } else { - /* check if there are no children of this qgroup */ - if (!list_empty(&qgroup->members)) { - ret = -EBUSY; - goto out; - } } + + /* Check if there are no children of this qgroup */ + if (!list_empty(&qgroup->members)) { + ret = -EBUSY; + goto out; + } + ret = del_qgroup_item(trans, qgroupid); if (ret && ret != -ENOENT) goto out; @@ -1712,6 +1713,416 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level) return 0; } +/* + * Helper function to trace a subtree tree block swap. + * + * The swap will happen in highest tree block, but there may be a lot of + * tree blocks involved. + * + * For example: + * OO = Old tree blocks + * NN = New tree blocks allocated during balance + * + * File tree (257) Reloc tree for 257 + * L2 OO NN + * / \ / \ + * L1 OO OO (a) OO NN (a) + * / \ / \ / \ / \ + * L0 OO OO OO OO OO OO NN NN + * (b) (c) (b) (c) + * + * When calling qgroup_trace_extent_swap(), we will pass: + * @src_eb = OO(a) + * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ] + * @dst_level = 0 + * @root_level = 1 + * + * In that case, qgroup_trace_extent_swap() will search from OO(a) to + * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty. + * + * The main work of qgroup_trace_extent_swap() can be split into 3 parts: + * + * 1) Tree search from @src_eb + * It should acts as a simplified btrfs_search_slot(). + * The key for search can be extracted from @dst_path->nodes[dst_level] + * (first key). + * + * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty + * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty. + * They should be marked during preivous (@dst_level = 1) iteration. + * + * 3) Mark file extents in leaves dirty + * We don't have good way to pick out new file extents only. + * So we still follow the old method by scanning all file extents in + * the leave. + * + * This function can free us from keeping two pathes, thus later we only need + * to care about how to iterate all new tree blocks in reloc tree. + */ +static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, + struct extent_buffer *src_eb, + struct btrfs_path *dst_path, + int dst_level, int root_level, + bool trace_leaf) +{ + struct btrfs_key key; + struct btrfs_path *src_path; + struct btrfs_fs_info *fs_info = trans->fs_info; + u32 nodesize = fs_info->nodesize; + int cur_level = root_level; + int ret; + + BUG_ON(dst_level > root_level); + /* Level mismatch */ + if (btrfs_header_level(src_eb) != root_level) + return -EINVAL; + + src_path = btrfs_alloc_path(); + if (!src_path) { + ret = -ENOMEM; + goto out; + } + + if (dst_level) + btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0); + else + btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); + + /* For src_path */ + extent_buffer_get(src_eb); + src_path->nodes[root_level] = src_eb; + src_path->slots[root_level] = dst_path->slots[root_level]; + src_path->locks[root_level] = 0; + + /* A simplified version of btrfs_search_slot() */ + while (cur_level >= dst_level) { + struct btrfs_key src_key; + struct btrfs_key dst_key; + + if (src_path->nodes[cur_level] == NULL) { + struct btrfs_key first_key; + struct extent_buffer *eb; + int parent_slot; + u64 child_gen; + u64 child_bytenr; + + eb = src_path->nodes[cur_level + 1]; + parent_slot = src_path->slots[cur_level + 1]; + child_bytenr = btrfs_node_blockptr(eb, parent_slot); + child_gen = btrfs_node_ptr_generation(eb, parent_slot); + btrfs_node_key_to_cpu(eb, &first_key, parent_slot); + + eb = read_tree_block(fs_info, child_bytenr, child_gen, + cur_level, &first_key); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + ret = -EIO; + goto out; + } + + src_path->nodes[cur_level] = eb; + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; + } + + src_path->slots[cur_level] = dst_path->slots[cur_level]; + if (cur_level) { + btrfs_node_key_to_cpu(dst_path->nodes[cur_level], + &dst_key, dst_path->slots[cur_level]); + btrfs_node_key_to_cpu(src_path->nodes[cur_level], + &src_key, src_path->slots[cur_level]); + } else { + btrfs_item_key_to_cpu(dst_path->nodes[cur_level], + &dst_key, dst_path->slots[cur_level]); + btrfs_item_key_to_cpu(src_path->nodes[cur_level], + &src_key, src_path->slots[cur_level]); + } + /* Content mismatch, something went wrong */ + if (btrfs_comp_cpu_keys(&dst_key, &src_key)) { + ret = -ENOENT; + goto out; + } + cur_level--; + } + + /* + * Now both @dst_path and @src_path have been populated, record the tree + * blocks for qgroup accounting. + */ + ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, + nodesize, GFP_NOFS); + if (ret < 0) + goto out; + ret = btrfs_qgroup_trace_extent(trans, + dst_path->nodes[dst_level]->start, + nodesize, GFP_NOFS); + if (ret < 0) + goto out; + + /* Record leaf file extents */ + if (dst_level == 0 && trace_leaf) { + ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); + if (ret < 0) + goto out; + ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]); + } +out: + btrfs_free_path(src_path); + return ret; +} + +/* + * Helper function to do recursive generation-aware depth-first search, to + * locate all new tree blocks in a subtree of reloc tree. + * + * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot) + * reloc tree + * L2 NN (a) + * / \ + * L1 OO NN (b) + * / \ / \ + * L0 OO OO OO NN + * (c) (d) + * If we pass: + * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ], + * @cur_level = 1 + * @root_level = 1 + * + * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace + * above tree blocks along with their counter parts in file tree. + * While during search, old tree blocsk OO(c) will be skiped as tree block swap + * won't affect OO(c). + */ +static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, + struct extent_buffer *src_eb, + struct btrfs_path *dst_path, + int cur_level, int root_level, + u64 last_snapshot, bool trace_leaf) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct extent_buffer *eb; + bool need_cleanup = false; + int ret = 0; + int i; + + /* Level sanity check */ + if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL || + root_level < 0 || root_level >= BTRFS_MAX_LEVEL || + root_level < cur_level) { + btrfs_err_rl(fs_info, + "%s: bad levels, cur_level=%d root_level=%d", + __func__, cur_level, root_level); + return -EUCLEAN; + } + + /* Read the tree block if needed */ + if (dst_path->nodes[cur_level] == NULL) { + struct btrfs_key first_key; + int parent_slot; + u64 child_gen; + u64 child_bytenr; + + /* + * dst_path->nodes[root_level] must be initialized before + * calling this function. + */ + if (cur_level == root_level) { + btrfs_err_rl(fs_info, + "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", + __func__, root_level, root_level, cur_level); + return -EUCLEAN; + } + + /* + * We need to get child blockptr/gen from parent before we can + * read it. + */ + eb = dst_path->nodes[cur_level + 1]; + parent_slot = dst_path->slots[cur_level + 1]; + child_bytenr = btrfs_node_blockptr(eb, parent_slot); + child_gen = btrfs_node_ptr_generation(eb, parent_slot); + btrfs_node_key_to_cpu(eb, &first_key, parent_slot); + + /* This node is old, no need to trace */ + if (child_gen < last_snapshot) + goto out; + + eb = read_tree_block(fs_info, child_bytenr, child_gen, + cur_level, &first_key); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + ret = -EIO; + goto out; + } + + dst_path->nodes[cur_level] = eb; + dst_path->slots[cur_level] = 0; + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; + need_cleanup = true; + } + + /* Now record this tree block and its counter part for qgroups */ + ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level, + root_level, trace_leaf); + if (ret < 0) + goto cleanup; + + eb = dst_path->nodes[cur_level]; + + if (cur_level > 0) { + /* Iterate all child tree blocks */ + for (i = 0; i < btrfs_header_nritems(eb); i++) { + /* Skip old tree blocks as they won't be swapped */ + if (btrfs_node_ptr_generation(eb, i) < last_snapshot) + continue; + dst_path->slots[cur_level] = i; + + /* Recursive call (at most 7 times) */ + ret = qgroup_trace_new_subtree_blocks(trans, src_eb, + dst_path, cur_level - 1, root_level, + last_snapshot, trace_leaf); + if (ret < 0) + goto cleanup; + } + } + +cleanup: + if (need_cleanup) { + /* Clean up */ + btrfs_tree_unlock_rw(dst_path->nodes[cur_level], + dst_path->locks[cur_level]); + free_extent_buffer(dst_path->nodes[cur_level]); + dst_path->nodes[cur_level] = NULL; + dst_path->slots[cur_level] = 0; + dst_path->locks[cur_level] = 0; + } +out: + return ret; +} + +/* + * Inform qgroup to trace subtree swap used in balance. + * + * Unlike btrfs_qgroup_trace_subtree(), this function will only trace + * new tree blocks whose generation is equal to (or larger than) @last_snapshot. + * + * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and + * @dst_slot), and find any tree blocks whose generation is at @last_snapshot, + * and then go down @src_eb (pointed by @src_parent and @src_slot) to find + * the conterpart of the tree block, then mark both tree blocks as qgroup dirty, + * and skip all tree blocks whose generation is smaller than last_snapshot. + * + * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(), + * which could be the cause of very slow balance if the file tree is large. + * + * @src_parent, @src_slot: pointer to src (file tree) eb. + * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb. + */ +int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *bg_cache, + struct extent_buffer *src_parent, int src_slot, + struct extent_buffer *dst_parent, int dst_slot, + u64 last_snapshot) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_path *dst_path = NULL; + struct btrfs_key first_key; + struct extent_buffer *src_eb = NULL; + struct extent_buffer *dst_eb = NULL; + bool trace_leaf = false; + u64 child_gen; + u64 child_bytenr; + int level; + int ret; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + + /* Check parameter order */ + if (btrfs_node_ptr_generation(src_parent, src_slot) > + btrfs_node_ptr_generation(dst_parent, dst_slot)) { + btrfs_err_rl(fs_info, + "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, + btrfs_node_ptr_generation(src_parent, src_slot), + btrfs_node_ptr_generation(dst_parent, dst_slot)); + return -EUCLEAN; + } + + /* + * Only trace leaf if we're relocating data block groups, this could + * reduce tons of data extents tracing for meta/sys bg relocation. + */ + if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA) + trace_leaf = true; + /* Read out real @src_eb, pointed by @src_parent and @src_slot */ + child_bytenr = btrfs_node_blockptr(src_parent, src_slot); + child_gen = btrfs_node_ptr_generation(src_parent, src_slot); + btrfs_node_key_to_cpu(src_parent, &first_key, src_slot); + + src_eb = read_tree_block(fs_info, child_bytenr, child_gen, + btrfs_header_level(src_parent) - 1, &first_key); + if (IS_ERR(src_eb)) { + ret = PTR_ERR(src_eb); + goto out; + } + + /* Read out real @dst_eb, pointed by @src_parent and @src_slot */ + child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot); + child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot); + btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot); + + dst_eb = read_tree_block(fs_info, child_bytenr, child_gen, + btrfs_header_level(dst_parent) - 1, &first_key); + if (IS_ERR(dst_eb)) { + ret = PTR_ERR(dst_eb); + goto out; + } + + if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { + ret = -EINVAL; + goto out; + } + + level = btrfs_header_level(dst_eb); + dst_path = btrfs_alloc_path(); + if (!dst_path) { + ret = -ENOMEM; + goto out; + } + + /* For dst_path */ + extent_buffer_get(dst_eb); + dst_path->nodes[level] = dst_eb; + dst_path->slots[level] = 0; + dst_path->locks[level] = 0; + + /* Do the generation-aware breadth-first search */ + ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, + level, last_snapshot, trace_leaf); + if (ret < 0) + goto out; + ret = 0; + +out: + free_extent_buffer(src_eb); + free_extent_buffer(dst_eb); + btrfs_free_path(dst_path); + if (ret < 0) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + return ret; +} + int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level) @@ -2132,6 +2543,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) struct btrfs_delayed_ref_root *delayed_refs; struct ulist *new_roots = NULL; struct rb_node *node; + u64 num_dirty_extents = 0; u64 qgroup_to_skip; int ret = 0; @@ -2141,6 +2553,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) record = rb_entry(node, struct btrfs_qgroup_extent_record, node); + num_dirty_extents++; trace_btrfs_qgroup_account_extents(fs_info, record); if (!ret) { @@ -2186,6 +2599,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) kfree(record); } + trace_qgroup_num_dirty_extents(fs_info, trans->transid, + num_dirty_extents); return ret; } @@ -2897,6 +3312,7 @@ qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) qgroup->rfer_cmpr = 0; qgroup->excl = 0; qgroup->excl_cmpr = 0; + qgroup_dirty(fs_info, qgroup); } spin_unlock(&fs_info->qgroup_lock); } @@ -3004,7 +3420,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || - !is_fstree(root->objectid) || len == 0) + !is_fstree(root->root_key.objectid) || len == 0) return 0; /* @reserved parameter is mandatory for qgroup */ @@ -3090,7 +3506,7 @@ static int qgroup_free_reserved_data(struct inode *inode, goto out; freed += changeset.bytes_changed; } - btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed, + btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, BTRFS_QGROUP_RSV_DATA); ret = freed; out: @@ -3106,6 +3522,10 @@ static int __btrfs_qgroup_release_data(struct inode *inode, int trace_op = QGROUP_RELEASE; int ret; + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, + &BTRFS_I(inode)->root->fs_info->flags)) + return 0; + /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); if (free && reserved) @@ -3122,7 +3542,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, changeset.bytes_changed, trace_op); if (free) btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->objectid, + BTRFS_I(inode)->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); ret = changeset.bytes_changed; out: @@ -3215,7 +3635,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid) || num_bytes == 0) + !is_fstree(root->root_key.objectid) || num_bytes == 0) return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); @@ -3240,13 +3660,13 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid)) + !is_fstree(root->root_key.objectid)) return; /* TODO: Update trace point to handle such free */ trace_qgroup_meta_free_all_pertrans(root); /* Special value -1 means to free all reserved space */ - btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1, + btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS); } @@ -3256,7 +3676,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, struct btrfs_fs_info *fs_info = root->fs_info; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid)) + !is_fstree(root->root_key.objectid)) return; /* @@ -3267,7 +3687,8 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); trace_qgroup_meta_reserve(root, type, -(s64)num_bytes); - btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type); + btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, + num_bytes, type); } static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, @@ -3321,13 +3742,13 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) struct btrfs_fs_info *fs_info = root->fs_info; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid)) + !is_fstree(root->root_key.objectid)) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); trace_qgroup_meta_convert(root, num_bytes); - qgroup_convert_meta(fs_info, root->objectid, num_bytes); + qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); } /* @@ -3354,7 +3775,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) inode->i_ino, unode->val, unode->aux); } btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->objectid, + BTRFS_I(inode)->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 54b8bb282c0e..d8f78f5ab854 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -236,6 +236,12 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level); + +int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *bg_cache, + struct extent_buffer *src_parent, int src_slot, + struct extent_buffer *dst_parent, int dst_slot, + u64 last_snapshot); int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct ulist *old_roots, struct ulist *new_roots); @@ -249,6 +255,8 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return; trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes, BTRFS_QGROUP_RSV_DATA); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index e5b9e596bb92..d69fbfb30aa9 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -732,7 +732,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, INIT_LIST_HEAD(&ra->list); ra->action = action; - ra->root = root->objectid; + ra->root = root->root_key.objectid; /* * This is an allocation, preallocate the block_entry in case we haven't @@ -787,8 +787,8 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * one we want to lookup below when we modify the * re->num_refs. */ - ref_root = root->objectid; - re->root_objectid = root->objectid; + ref_root = root->root_key.objectid; + re->root_objectid = root->root_key.objectid; re->num_refs = 0; } @@ -862,7 +862,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * didn't thik of some other corner case. */ btrfs_err(fs_info, "failed to find root %llu for %llu", - root->objectid, be->bytenr); + root->root_key.objectid, be->bytenr); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); kfree(ra); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8783a1776540..924116f654a1 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -648,8 +648,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, int level, u64 bytenr) { struct backref_cache *cache = &rc->backref_cache; - struct btrfs_path *path1; - struct btrfs_path *path2; + struct btrfs_path *path1; /* For searching extent root */ + struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */ struct extent_buffer *eb; struct btrfs_root *root; struct backref_node *cur; @@ -662,7 +662,7 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, struct btrfs_key key; unsigned long end; unsigned long ptr; - LIST_HEAD(list); + LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */ LIST_HEAD(useless); int cowonly; int ret; @@ -778,6 +778,10 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, key.type != BTRFS_SHARED_BLOCK_REF_KEY); } + /* + * Parent node found and matches current inline ref, no need to + * rebuild this node for this inline ref. + */ if (exist && ((key.type == BTRFS_TREE_BLOCK_REF_KEY && exist->owner == key.offset) || @@ -787,11 +791,12 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, goto next; } + /* SHARED_BLOCK_REF means key.offset is the parent bytenr */ if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { if (key.objectid == key.offset) { /* - * only root blocks of reloc trees use - * backref of this type. + * Only root blocks of reloc trees use backref + * pointing to itself. */ root = find_reloc_root(rc, cur->bytenr); ASSERT(root); @@ -840,7 +845,11 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, goto next; } - /* key.type == BTRFS_TREE_BLOCK_REF_KEY */ + /* + * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset + * means the root objectid. We need to search the tree to get + * its parent bytenr. + */ root = read_fs_root(rc->extent_root->fs_info, key.offset); if (IS_ERR(root)) { err = PTR_ERR(root); @@ -863,10 +872,7 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, level = cur->level + 1; - /* - * searching the tree to find upper level blocks - * reference the block. - */ + /* Search the tree to find parent blocks referring the block. */ path2->search_commit_root = 1; path2->skip_locking = 1; path2->lowest_level = level; @@ -884,7 +890,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, cur->bytenr) { btrfs_err(root->fs_info, "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", - cur->bytenr, level - 1, root->objectid, + cur->bytenr, level - 1, + root->root_key.objectid, node_key->objectid, node_key->type, node_key->offset); err = -ENOENT; @@ -892,6 +899,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, } lower = cur; need_check = true; + + /* Add all nodes and edges in the path */ for (; level < BTRFS_MAX_LEVEL; level++) { if (!path2->nodes[level]) { ASSERT(btrfs_root_bytenr(&root->root_item) == @@ -1281,7 +1290,7 @@ static void __del_reloc_root(struct btrfs_root *root) struct mapping_node *node = NULL; struct reloc_control *rc = fs_info->reloc_ctl; - if (rc) { + if (rc && root->node) { spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, root->node->start); @@ -1735,7 +1744,7 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot, * errors, a negative error number is returned. */ static noinline_for_stack -int replace_path(struct btrfs_trans_handle *trans, +int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_root *dest, struct btrfs_root *src, struct btrfs_path *path, struct btrfs_key *next_key, int lowest_level, int max_level) @@ -1879,14 +1888,9 @@ int replace_path(struct btrfs_trans_handle *trans, * and tree block numbers, if current trans doesn't free * data reloc tree inode. */ - ret = btrfs_qgroup_trace_subtree(trans, parent, - btrfs_header_generation(parent), - btrfs_header_level(parent)); - if (ret < 0) - break; - ret = btrfs_qgroup_trace_subtree(trans, path->nodes[level], - btrfs_header_generation(path->nodes[level]), - btrfs_header_level(path->nodes[level])); + ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group, + parent, slot, path->nodes[level], + path->slots[level], last_snapshot); if (ret < 0) break; @@ -2205,7 +2209,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_comp_cpu_keys(&next_key, &key) >= 0) { ret = 0; } else { - ret = replace_path(trans, root, reloc_root, path, + ret = replace_path(trans, rc, root, reloc_root, path, &next_key, level, max_level); } if (ret < 0) { @@ -2911,7 +2915,6 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, free_extent_buffer(eb); return -EIO; } - WARN_ON(btrfs_header_level(eb) != block->level); if (block->level == 0) btrfs_item_key_to_cpu(eb, &block->key, 0); else @@ -2987,7 +2990,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, struct backref_node *node; struct btrfs_path *path; struct tree_block *block; - struct rb_node *rb_node; + struct tree_block *next; int ret; int err = 0; @@ -2997,29 +3000,23 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, goto out_free_blocks; } - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); + /* Kick in readahead for tree blocks with missing keys */ + rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) readahead_tree_block(fs_info, block->bytenr); - rb_node = rb_next(rb_node); } - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); + /* Get first keys */ + rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) { err = get_tree_block_key(fs_info, block); if (err) goto out_free_path; } - rb_node = rb_next(rb_node); } - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - + /* Do tree relocation */ + rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { node = build_backref_tree(rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { @@ -3030,11 +3027,10 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, ret = relocate_tree_block(trans, rc, node, &block->key, path); if (ret < 0) { - if (ret != -EAGAIN || rb_node == rb_first(blocks)) + if (ret != -EAGAIN || &block->rb_node == rb_first(blocks)) err = ret; goto out; } - rb_node = rb_next(rb_node); } out: err = finish_pending_nodes(trans, rc, path, err); @@ -4669,7 +4665,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, if (rc->merge_reloc_tree) { ret = btrfs_block_rsv_migrate(&pending->block_rsv, rc->block_rsv, - rc->nodes_relocated, 1); + rc->nodes_relocated, true); if (ret) return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3be1456b5116..902819d3cf41 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1124,7 +1124,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (scrub_write_page_to_dev_replace(sblock_other, page_num) != 0) { - btrfs_dev_replace_stats_inc( + atomic64_inc( &fs_info->dev_replace.num_write_errors); success = 0; } @@ -1564,8 +1564,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, if (btrfsic_submit_bio_wait(bio)) { btrfs_dev_stat_inc_and_print(page_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); - btrfs_dev_replace_stats_inc( - &fs_info->dev_replace.num_write_errors); + atomic64_inc(&fs_info->dev_replace.num_write_errors); bio_put(bio); return -EIO; } @@ -1592,8 +1591,7 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) ret = scrub_write_page_to_dev_replace(sblock, page_num); if (ret) - btrfs_dev_replace_stats_inc( - &fs_info->dev_replace.num_write_errors); + atomic64_inc(&fs_info->dev_replace.num_write_errors); } } @@ -1726,8 +1724,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) struct scrub_page *spage = sbio->pagev[i]; spage->io_error = 1; - btrfs_dev_replace_stats_inc(&dev_replace-> - num_write_errors); + atomic64_inc(&dev_replace->num_write_errors); } } @@ -3022,8 +3019,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, - int num, u64 base, u64 length, - int is_dev_replace) + int num, u64 base, u64 length) { struct btrfs_path *path, *ppath; struct btrfs_fs_info *fs_info = sctx->fs_info; @@ -3299,7 +3295,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, extent_physical = extent_logical - logical + physical; extent_dev = scrub_dev; extent_mirror_num = mirror_num; - if (is_dev_replace) + if (sctx->is_dev_replace) scrub_remap_extent(fs_info, extent_logical, extent_len, &extent_physical, &extent_dev, @@ -3397,8 +3393,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 chunk_offset, u64 length, u64 dev_offset, - struct btrfs_block_group_cache *cache, - int is_dev_replace) + struct btrfs_block_group_cache *cache) { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; @@ -3435,8 +3430,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { ret = scrub_stripe(sctx, map, scrub_dev, i, - chunk_offset, length, - is_dev_replace); + chunk_offset, length); if (ret) goto out; } @@ -3449,8 +3443,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, static noinline_for_stack int scrub_enumerate_chunks(struct scrub_ctx *sctx, - struct btrfs_device *scrub_dev, u64 start, u64 end, - int is_dev_replace) + struct btrfs_device *scrub_dev, u64 start, u64 end) { struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; @@ -3544,7 +3537,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ scrub_pause_on(fs_info); ret = btrfs_inc_block_group_ro(cache); - if (!ret && is_dev_replace) { + if (!ret && sctx->is_dev_replace) { /* * If we are doing a device replace wait for any tasks * that started dellaloc right before we set the block @@ -3609,7 +3602,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_replace->item_needs_writeback = 1; btrfs_dev_replace_write_unlock(&fs_info->dev_replace); ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, - found_key.offset, cache, is_dev_replace); + found_key.offset, cache); /* * flush, submit all pending read and write bios, afterwards @@ -3670,7 +3663,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, btrfs_put_block_group(cache); if (ret) break; - if (is_dev_replace && + if (sctx->is_dev_replace && atomic64_read(&dev_replace->num_write_errors) > 0) { ret = -EIO; break; @@ -3893,8 +3886,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } if (!ret) - ret = scrub_enumerate_chunks(sctx, dev, start, end, - is_dev_replace); + ret = scrub_enumerate_chunks(sctx, dev, start, end); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_dec(&fs_info->scrubs_running); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ba8950bfd9c7..094cc1444a90 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1186,9 +1186,9 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt) u64 root = (u64)(uintptr_t)key; struct clone_root *cr = (struct clone_root *)elt; - if (root < cr->root->objectid) + if (root < cr->root->root_key.objectid) return -1; - if (root > cr->root->objectid) + if (root > cr->root->root_key.objectid) return 1; return 0; } @@ -1198,9 +1198,9 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) struct clone_root *cr1 = (struct clone_root *)e1; struct clone_root *cr2 = (struct clone_root *)e2; - if (cr1->root->objectid < cr2->root->objectid) + if (cr1->root->root_key.objectid < cr2->root->root_key.objectid) return -1; - if (cr1->root->objectid > cr2->root->objectid) + if (cr1->root->root_key.objectid > cr2->root->root_key.objectid) return 1; return 0; } @@ -1693,12 +1693,8 @@ static int lookup_dir_item_inode(struct btrfs_root *root, di = btrfs_lookup_dir_item(NULL, root, path, dir, name, name_len, 0); - if (!di) { - ret = -ENOENT; - goto out; - } - if (IS_ERR(di)) { - ret = PTR_ERR(di); + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; goto out; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); @@ -2346,7 +2342,7 @@ static int send_subvol_begin(struct send_ctx *sctx) return -ENOMEM; } - key.objectid = send_root->objectid; + key.objectid = send_root->root_key.objectid; key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = 0; @@ -2362,7 +2358,7 @@ static int send_subvol_begin(struct send_ctx *sctx) leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type != BTRFS_ROOT_BACKREF_KEY || - key.objectid != send_root->objectid) { + key.objectid != send_root->root_key.objectid) { ret = -ENOENT; goto out; } @@ -4907,8 +4903,8 @@ static int send_clone(struct send_ctx *sctx, btrfs_debug(sctx->send_root->fs_info, "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu", - offset, len, clone_root->root->objectid, clone_root->ino, - clone_root->offset); + offset, len, clone_root->root->root_key.objectid, + clone_root->ino, clone_root->offset); p = fs_path_alloc(); if (!p) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6601c9aa5e35..b362b45dd757 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2177,8 +2177,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); /* Mask in the root object ID too, to disambiguate subvols */ - buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32; - buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid; + buf->f_fsid.val[0] ^= + BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32; + buf->f_fsid.val[1] ^= + BTRFS_I(d_inode(dentry))->root->root_key.objectid; return 0; } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index d9269a531a4d..9e0f4a01be14 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -106,7 +106,7 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { test_err("should have found at least one delalloc"); @@ -137,7 +137,7 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { test_err("couldn't find delalloc in our range"); @@ -171,7 +171,7 @@ static int test_find_delalloc(u32 sectorsize) } start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (found) { test_err("found range when we shouldn't have"); @@ -192,7 +192,7 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { test_err("didn't find our range"); @@ -233,7 +233,7 @@ static int test_find_delalloc(u32 sectorsize) * this changes at any point in the future we will need to fix this * tests expected behavior. */ - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { test_err("didn't find our range"); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 385a5316e4bf..bf15d3a7f20e 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -12,8 +12,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) struct extent_map *em; struct rb_node *node; - while (!RB_EMPTY_ROOT(&em_tree->map)) { - node = rb_first(&em_tree->map); + while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) { + node = rb_first_cached(&em_tree->map); em = rb_entry(node, struct extent_map, rb_node); remove_extent_mapping(em_tree, em); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3b84f5015029..5686290a50e1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -44,7 +44,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) WARN_ON(refcount_read(&transaction->use_count) == 0); if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); - WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); + WARN_ON(!RB_EMPTY_ROOT( + &transaction->delayed_refs.href_root.rb_root)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, "pending csums is %llu", @@ -118,7 +119,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans) list_del_init(&root->dirty_list); free_extent_buffer(root->commit_root); root->commit_root = btrfs_root_node(root); - if (is_fstree(root->objectid)) + if (is_fstree(root->root_key.objectid)) btrfs_unpin_free_ino(root); clear_btree_io_tree(&root->dirty_log_pages); } @@ -245,7 +246,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); - cur_trans->delayed_refs.href_root = RB_ROOT; + cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); @@ -759,7 +760,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - if (btrfs_check_space_for_delayed_refs(trans, fs_info)) + if (btrfs_check_space_for_delayed_refs(trans)) return 1; return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); @@ -834,7 +835,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, trans->delayed_ref_updates = 0; if (!trans->sync) { must_run_delayed_refs = - btrfs_should_throttle_delayed_refs(trans, info); + btrfs_should_throttle_delayed_refs(trans); cur = max_t(unsigned long, cur, 32); /* @@ -1197,7 +1198,10 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) list_add_tail(&fs_info->extent_root->dirty_list, &trans->transaction->switch_commits); - btrfs_after_dev_replace_commit(fs_info); + + /* Update dev-replace pointer once everything is committed */ + fs_info->dev_replace.committed_cursor_left = + fs_info->dev_replace.cursor_left_last_write_of_item; return 0; } @@ -1613,10 +1617,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret < 0) goto fail; - ret = btrfs_insert_dir_item(trans, parent_root, - dentry->d_name.name, dentry->d_name.len, - BTRFS_I(parent_inode), &key, - BTRFS_FT_DIR, index); + ret = btrfs_insert_dir_item(trans, dentry->d_name.name, + dentry->d_name.len, BTRFS_I(parent_inode), + &key, BTRFS_FT_DIR, index); /* We have check then name at the beginning, so it is impossible. */ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { @@ -1929,6 +1932,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; } + btrfs_trans_release_metadata(trans); + trans->block_rsv = NULL; + /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ @@ -1938,9 +1944,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; } - btrfs_trans_release_metadata(trans); - trans->block_rsv = NULL; - cur_trans = trans->transaction; /* @@ -2330,7 +2333,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); - btrfs_debug(fs_info, "cleaner removing %llu", root->objectid); + btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid); btrfs_kill_all_delayed_nodes(root); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index db835635372f..cab0b1f1f741 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -487,6 +487,13 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, u32 nritems = btrfs_header_nritems(leaf); int slot; + if (btrfs_header_level(leaf) != 0) { + generic_err(fs_info, leaf, 0, + "invalid level for leaf, have %d expect 0", + btrfs_header_level(leaf)); + return -EUCLEAN; + } + /* * Extent buffers from a relocation tree have a owner field that * corresponds to the subvolume tree they are based on. So just from an @@ -645,9 +652,16 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node) unsigned long nr = btrfs_header_nritems(node); struct btrfs_key key, next_key; int slot; + int level = btrfs_header_level(node); u64 bytenr; int ret = 0; + if (level <= 0 || level >= BTRFS_MAX_LEVEL) { + generic_err(fs_info, node, 0, + "invalid level for node, have %d expect [1, %d]", + level, BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) { btrfs_crit(fs_info, "corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]", diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3c2ae0e4f25a..0dba09334a16 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -205,14 +205,11 @@ static int join_running_log_trans(struct btrfs_root *root) * until you call btrfs_end_log_trans() or it makes any future * log transactions wait until you call btrfs_end_log_trans() */ -int btrfs_pin_log_trans(struct btrfs_root *root) +void btrfs_pin_log_trans(struct btrfs_root *root) { - int ret = -ENOENT; - mutex_lock(&root->log_mutex); atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); - return ret; } /* @@ -258,6 +255,13 @@ struct walk_control { /* what stage of the replay code we're currently in */ int stage; + /* + * Ignore any items from the inode currently being processed. Needs + * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in + * the LOG_WALK_REPLAY_INODES stage. + */ + bool ignore_cur_inode; + /* the root we are currently replaying */ struct btrfs_root *replay_dest; @@ -2487,6 +2491,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + /* + * If we have a tmpfile (O_TMPFILE) that got fsync'ed + * and never got linked before the fsync, skip it, as + * replaying it is pointless since it would be deleted + * later. We skip logging tmpfiles, but it's always + * possible we are replaying a log created with a kernel + * that used to log tmpfiles. + */ + if (btrfs_inode_nlink(eb, inode_item) == 0) { + wc->ignore_cur_inode = true; + continue; + } else { + wc->ignore_cur_inode = false; + } ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid); if (ret) @@ -2524,16 +2542,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, root->fs_info->sectorsize); ret = btrfs_drop_extents(wc->trans, root, inode, from, (u64)-1, 1); - /* - * If the nlink count is zero here, the iput - * will free the inode. We bump it to make - * sure it doesn't get freed until the link - * count fixup is done. - */ if (!ret) { - if (inode->i_nlink == 0) - inc_nlink(inode); - /* Update link count and nbytes. */ + /* Update the inode's nbytes. */ ret = btrfs_update_inode(wc->trans, root, inode); } @@ -2548,6 +2558,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, break; } + if (wc->ignore_cur_inode) + continue; + if (key.type == BTRFS_DIR_INDEX_KEY && wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { ret = replay_one_dir_item(wc->trans, root, path, @@ -3196,9 +3209,12 @@ static void free_log_tree(struct btrfs_trans_handle *trans, }; ret = walk_log_tree(trans, log, &wc); - /* I don't think this can happen but just in case */ - if (ret) - btrfs_abort_transaction(trans, ret); + if (ret) { + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(log->fs_info, ret, NULL); + } while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, @@ -5564,9 +5580,33 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, dir_inode = btrfs_iget(fs_info->sb, &inode_key, root, NULL); - /* If parent inode was deleted, skip it. */ - if (IS_ERR(dir_inode)) - continue; + /* + * If the parent inode was deleted, return an error to + * fallback to a transaction commit. This is to prevent + * getting an inode that was moved from one parent A to + * a parent B, got its former parent A deleted and then + * it got fsync'ed, from existing at both parents after + * a log replay (and the old parent still existing). + * Example: + * + * mkdir /mnt/A + * mkdir /mnt/B + * touch /mnt/B/bar + * sync + * mv /mnt/B/bar /mnt/A/bar + * mv -T /mnt/A /mnt/B + * fsync /mnt/B/bar + * + * + * If we ignore the old parent B which got deleted, + * after a log replay we would have file bar linked + * at both parents and the old parent B would still + * exist. + */ + if (IS_ERR(dir_inode)) { + ret = PTR_ERR(dir_inode); + goto out; + } if (ctx) ctx->log_new_dentries = false; @@ -5640,7 +5680,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - if (btrfs_inode_in_log(inode, trans->transid)) { + /* + * Skip already logged inodes or inodes corresponding to tmpfiles + * (since logging them is pointless, a link count of 0 means they + * will never be accessible). + */ + if (btrfs_inode_in_log(inode, trans->transid) || + inode->vfs_inode.i_nlink == 0) { ret = BTRFS_NO_LOG_SYNC; goto end_no_trans; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 7ab9bb88a639..767765031e59 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -65,7 +65,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); -int btrfs_pin_log_trans(struct btrfs_root *root); +void btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, int for_rename); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4405e430da6..f435d397019e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1613,7 +1613,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info) em_tree = &fs_info->mapping_tree.map_tree; read_lock(&em_tree->lock); - n = rb_last(&em_tree->map); + n = rb_last(&em_tree->map.rb_root); if (n) { em = rb_entry(n, struct extent_map, rb_node); ret = em->start + em->len; @@ -1854,6 +1854,24 @@ void btrfs_assign_next_active_device(struct btrfs_device *device, fs_info->fs_devices->latest_bdev = next_device->bdev; } +/* + * Return btrfs_fs_devices::num_devices excluding the device that's being + * currently replaced. + */ +static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) +{ + u64 num_devices = fs_info->fs_devices->num_devices; + + btrfs_dev_replace_read_lock(&fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { + ASSERT(num_devices > 1); + num_devices--; + } + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + + return num_devices; +} + int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 devid) { @@ -1865,22 +1883,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, mutex_lock(&uuid_mutex); - num_devices = fs_devices->num_devices; - btrfs_dev_replace_read_lock(&fs_info->dev_replace); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { - WARN_ON(num_devices < 1); - num_devices--; - } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + num_devices = btrfs_num_devices(fs_info); ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) goto out; - ret = btrfs_find_device_by_devspec(fs_info, devid, device_path, - &device); - if (ret) + device = btrfs_find_device_by_devspec(fs_info, devid, device_path); + + if (IS_ERR(device)) { + if (PTR_ERR(device) == -ENOENT && + strcmp(device_path, "missing") == 0) + ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; + else + ret = PTR_ERR(device); goto out; + } if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; @@ -2096,9 +2114,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) call_rcu(&tgtdev->rcu, free_device_rcu); } -static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device **device) +static struct btrfs_device *btrfs_find_device_by_path( + struct btrfs_fs_info *fs_info, const char *device_path) { int ret = 0; struct btrfs_super_block *disk_super; @@ -2106,28 +2123,27 @@ static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, u8 *dev_uuid; struct block_device *bdev; struct buffer_head *bh; + struct btrfs_device *device; - *device = NULL; ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, fs_info->bdev_holder, 0, &bdev, &bh); if (ret) - return ret; + return ERR_PTR(ret); disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); + device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); brelse(bh); - if (!*device) - ret = -ENOENT; + if (!device) + device = ERR_PTR(-ENOENT); blkdev_put(bdev, FMODE_READ); - return ret; + return device; } -int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device **device) +static struct btrfs_device *btrfs_find_device_missing_or_by_path( + struct btrfs_fs_info *fs_info, const char *device_path) { - *device = NULL; + struct btrfs_device *device = NULL; if (strcmp(device_path, "missing") == 0) { struct list_head *devices; struct btrfs_device *tmp; @@ -2136,42 +2152,38 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, list_for_each_entry(tmp, devices, dev_list) { if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &tmp->dev_state) && !tmp->bdev) { - *device = tmp; + device = tmp; break; } } - if (!*device) - return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; - - return 0; + if (!device) + return ERR_PTR(-ENOENT); } else { - return btrfs_find_device_by_path(fs_info, device_path, device); + device = btrfs_find_device_by_path(fs_info, device_path); } + + return device; } /* * Lookup a device given by device id, or the path if the id is 0. */ -int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, - const char *devpath, - struct btrfs_device **device) +struct btrfs_device *btrfs_find_device_by_devspec( + struct btrfs_fs_info *fs_info, u64 devid, const char *devpath) { - int ret; + struct btrfs_device *device; if (devid) { - ret = 0; - *device = btrfs_find_device(fs_info, devid, NULL, NULL); - if (!*device) - ret = -ENOENT; + device = btrfs_find_device(fs_info, devid, NULL, NULL); + if (!device) + return ERR_PTR(-ENOENT); } else { if (!devpath || !devpath[0]) - return -EINVAL; - - ret = btrfs_find_device_missing_or_by_path(fs_info, devpath, - device); + return ERR_PTR(-EINVAL); + device = btrfs_find_device_missing_or_by_path(fs_info, devpath); } - return ret; + return device; } /* @@ -3679,7 +3691,7 @@ static int alloc_profile_is_valid(u64 flags, int extended) return !extended; /* "0" is valid for usual profiles */ /* true if exactly one bit set */ - return (flags & (flags - 1)) == 0; + return is_power_of_2(flags); } static inline int balance_need_close(struct btrfs_fs_info *fs_info) @@ -3740,13 +3752,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, } } - num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_read_lock(&fs_info->dev_replace); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { - BUG_ON(num_devices < 1); - num_devices--; - } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + num_devices = btrfs_num_devices(fs_info); + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); @@ -5897,7 +5904,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } out: if (dev_replace_is_ongoing) { - btrfs_dev_replace_clear_lock_blocking(dev_replace); + ASSERT(atomic_read(&dev_replace->blocking_readers) > 0); + btrfs_dev_replace_read_lock(dev_replace); + /* Barrier implied by atomic_dec_and_test */ + if (atomic_dec_and_test(&dev_replace->blocking_readers)) + cond_wake_up_nomb(&dev_replace->read_lock_wq); btrfs_dev_replace_read_unlock(dev_replace); } free_extent_map(em); @@ -7438,7 +7449,7 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) int ret = 0; read_lock(&em_tree->lock); - for (node = rb_first(&em_tree->map); node; node = rb_next(node)) { + for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { em = rb_entry(node, struct extent_map, rb_node); if (em->map_lookup->num_stripes != em->map_lookup->verified_stripes) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 23e9285d88de..aefce895e994 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -410,12 +410,9 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev); -int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device **device); -int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, - const char *devpath, - struct btrfs_device **device); +struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, + u64 devid, + const char *devpath); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index b401c4e36394..8568946f491d 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -316,7 +316,7 @@ DECLARE_EVENT_CLASS(btrfs__file_extent_item_regular, ), TP_fast_assign_btrfs(bi->root->fs_info, - __entry->root_obj = bi->root->objectid; + __entry->root_obj = bi->root->root_key.objectid; __entry->ino = btrfs_ino(bi); __entry->isize = bi->vfs_inode.i_size; __entry->disk_isize = bi->disk_i_size; @@ -367,7 +367,7 @@ DECLARE_EVENT_CLASS( TP_fast_assign_btrfs( bi->root->fs_info, - __entry->root_obj = bi->root->objectid; + __entry->root_obj = bi->root->root_key.objectid; __entry->ino = btrfs_ino(bi); __entry->isize = bi->vfs_inode.i_size; __entry->disk_isize = bi->disk_i_size; @@ -1477,7 +1477,8 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data, ), TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), - __entry->rootid = BTRFS_I(inode)->root->objectid; + __entry->rootid = + BTRFS_I(inode)->root->root_key.objectid; __entry->ino = btrfs_ino(BTRFS_I(inode)); __entry->start = start; __entry->len = len; @@ -1575,6 +1576,27 @@ DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent, TP_ARGS(fs_info, rec) ); +TRACE_EVENT(qgroup_num_dirty_extents, + + TP_PROTO(const struct btrfs_fs_info *fs_info, u64 transid, + u64 num_dirty_extents), + + TP_ARGS(fs_info, transid, num_dirty_extents), + + TP_STRUCT__entry_btrfs( + __field( u64, transid ) + __field( u64, num_dirty_extents ) + ), + + TP_fast_assign_btrfs(fs_info, + __entry->transid = transid; + __entry->num_dirty_extents = num_dirty_extents; + ), + + TP_printk_btrfs("transid=%llu num_dirty_extents=%llu", + __entry->transid, __entry->num_dirty_extents) +); + TRACE_EVENT(btrfs_qgroup_account_extent, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 transid, u64 bytenr, @@ -1675,7 +1697,7 @@ TRACE_EVENT(qgroup_meta_reserve, ), TP_fast_assign_btrfs(root->fs_info, - __entry->refroot = root->objectid; + __entry->refroot = root->root_key.objectid; __entry->diff = diff; ), @@ -1697,7 +1719,7 @@ TRACE_EVENT(qgroup_meta_convert, ), TP_fast_assign_btrfs(root->fs_info, - __entry->refroot = root->objectid; + __entry->refroot = root->root_key.objectid; __entry->diff = diff; ), @@ -1721,7 +1743,7 @@ TRACE_EVENT(qgroup_meta_free_all_pertrans, ), TP_fast_assign_btrfs(root->fs_info, - __entry->refroot = root->objectid; + __entry->refroot = root->root_key.objectid; spin_lock(&root->qgroup_meta_rsv_lock); __entry->diff = -(s64)root->qgroup_meta_rsv_pertrans; spin_unlock(&root->qgroup_meta_rsv_lock); @@ -1802,7 +1824,7 @@ TRACE_EVENT(btrfs_inode_mod_outstanding_extents, ), TP_fast_assign_btrfs(root->fs_info, - __entry->root_objectid = root->objectid; + __entry->root_objectid = root->root_key.objectid; __entry->ino = ino; __entry->mod = mod; ),