for-5.7-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAl6CDIMACgkQxWXV+ddt
 WDuJ9g/+NTVt+OXAX3G4VLAIR6EjugREAmiHPlojM7scKsmkBuH9BN35+2EPj+yS
 rSmdL01nOH3gyqe+RzAc1EEiujH/9uDpkNf4zE1tGtj9m5Useqj8ZNmiG/BN0PmR
 OJZkVb8DXUHEXIFscHjQJPP60kFZoqIovS7qZbDh4992+p98lTiUUEI6SPanVYeR
 QysXxmafty03hQMFW93ohFZemwAELVVI44nHxxcmOHT5BbIIopXrkInkkchB9I6b
 l+tIJx1gjL6k0D3v/TTqRuD+wGCE8InJgtiuEOf0WkHp2YXUlSDaKAnF/j9Le4oe
 eOgc50LtA3YNGmZ2m5vTeRjBeU9qUPWjJWJ2urp87oIrxX5x7B5Hsjxdnn28P0yZ
 dl/dt9HxeCKFgaRrMZYETYq9VBt0IMxiOIG9w5fukB9qnC6Dd05dXyQB0slg0+l1
 chn5p0FtMS74cvXB32jW7N0fwxWNt6KI4zBvomabJGYZQd6+dyDO8l8Od86vvve/
 w7KgRy7CFBjc9JOCyLTvS8eEhu/qAVc07phSblpdNnyzPFjWWTdZySON/qQYvUCf
 cGDiq+5+1d1+kWuEjtYNzvxon2AaAfg7UBZm5FrjN735ojTQXqm2vi3rrurcU5AZ
 ItmiU6DMre5EGZ+hfWgSPXDkeqx/JYbtDuUwWbNg6svTXaKKnmI=
 =1m9l
 -----END PGP SIGNATURE-----

Merge tag 'for-5.7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "A number of core changes that make things work better in general, code
  is simpler and cleaner.

  Core changes:

   - per-inode file extent tree, for in memory tracking of contiguous
     extent ranges to make sure i_size adjustments are accurate

   - tree root structures are protected by reference counts, replacing
     SRCU that did not cover some cases

   - leak detector for tree root structures

   - per-transaction pinned extent tracking

   - buffer heads are replaced by bios for super block access

   - speedup of extent back reference resolution, on an example test
     scenario the runtime of send went down from a hour to minutes

   - factor out locking scheme used for subvolume writer and NOCOW
     exclusion, abstracted as DREW lock, double reader-writer exclusion
     (allow either readers or writers)

   - cleanup and abstract extent allocation policies, preparation for
     zoned device support

   - make reflink/clone_range work on inline extents

   - add more cancellation point for relocation, improves long response
     from 'balance cancel'

   - add page migration callback for data pages

   - switch to guid for uuids, with additional cleanups of the interface

   - make ranged full fsyncs more efficient

   - removal of obsolete ioctl flag BTRFS_SUBVOL_CREATE_ASYNC

   - remove b-tree readahead from delayed refs paths, avoiding seek and
     read unnecessary blocks

  Features:

   - v2 of ioctl to delete subvolumes, allowing to delete by id and more
     future extensions

  Fixes:

   - fix qgroup rescan worker that could block umount

   - fix crash during unmount due to race with delayed inode workers

   - fix dellaloc flushing logic that could create unnecessary chunks
     under heavy load

   - fix missing file extent item for hole after ranged fsync

   - several fixes in relocation error handling

  Other:

   - more documentation of relocation, device replace, space
     reservations

   - many random cleanups"

* tag 'for-5.7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (210 commits)
  btrfs: fix missing semaphore unlock in btrfs_sync_file
  btrfs: use nofs allocations for running delayed items
  btrfs: sysfs: Use scnprintf() instead of snprintf()
  btrfs: do not resolve backrefs for roots that are being deleted
  btrfs: track reloc roots based on their commit root bytenr
  btrfs: restart relocate_tree_blocks properly
  btrfs: reloc: reorder reservation before root selection
  btrfs: do not readahead in build_backref_tree
  btrfs: do not use readahead for running delayed refs
  btrfs: Remove async_transid from btrfs_mksubvol/create_subvol/create_snapshot
  btrfs: Remove transid argument from btrfs_ioctl_snap_create_transid
  btrfs: Remove BTRFS_SUBVOL_CREATE_ASYNC support
  btrfs: kill the subvol_srcu
  btrfs: make btrfs_cleanup_fs_roots use the radix tree lock
  btrfs: don't take an extra root ref at allocation time
  btrfs: hold a ref on the root on the dead roots list
  btrfs: make inodes hold a ref on their roots
  btrfs: move the root freeing stuff into btrfs_put_root
  btrfs: move ino_cache_inode dropping out of btrfs_free_fs_root
  btrfs: make the extent buffer leak check per fs info
  ...
This commit is contained in:
Linus Torvalds 2020-03-31 13:00:16 -07:00
commit 15c981d16d
65 changed files with 4470 additions and 3544 deletions

View File

@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o

View File

@ -395,3 +395,11 @@ void btrfs_set_work_high_priority(struct btrfs_work *work)
{
set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
}
void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
{
if (wq->high)
flush_workqueue(wq->high->normal_wq);
flush_workqueue(wq->normal->normal_wq);
}

View File

@ -44,5 +44,6 @@ void btrfs_set_work_high_priority(struct btrfs_work *work);
struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work);
struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
void btrfs_flush_workqueue(struct btrfs_workqueue *wq);
#endif

View File

@ -347,33 +347,10 @@ static int add_prelim_ref(const struct btrfs_fs_info *fs_info,
return -ENOMEM;
ref->root_id = root_id;
if (key) {
if (key)
ref->key_for_search = *key;
/*
* We can often find data backrefs with an offset that is too
* large (>= LLONG_MAX, maximum allowed file offset) due to
* underflows when subtracting a file's offset with the data
* offset of its corresponding extent data item. This can
* happen for example in the clone ioctl.
* So if we detect such case we set the search key's offset to
* zero to make sure we will find the matching file extent item
* at add_all_parents(), otherwise we will miss it because the
* offset taken form the backref is much larger then the offset
* of the file extent item. This can make us scan a very large
* number of file extent items, but at least it will not make
* us miss any.
* This is an ugly workaround for a behaviour that should have
* never existed, but it does and a fix for the clone ioctl
* would touch a lot of places, cause backwards incompatibility
* and would not fix the problem for extents cloned with older
* kernels.
*/
if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY &&
ref->key_for_search.offset >= LLONG_MAX)
ref->key_for_search.offset = 0;
} else {
else
memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
}
ref->inode_list = NULL;
ref->level = level;
@ -409,10 +386,36 @@ static int add_indirect_ref(const struct btrfs_fs_info *fs_info,
wanted_disk_byte, count, sc, gfp_mask);
}
static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr)
{
struct rb_node **p = &preftrees->direct.root.rb_root.rb_node;
struct rb_node *parent = NULL;
struct prelim_ref *ref = NULL;
struct prelim_ref target = {0};
int result;
target.parent = bytenr;
while (*p) {
parent = *p;
ref = rb_entry(parent, struct prelim_ref, rbnode);
result = prelim_ref_compare(ref, &target);
if (result < 0)
p = &(*p)->rb_left;
else if (result > 0)
p = &(*p)->rb_right;
else
return 1;
}
return 0;
}
static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
struct ulist *parents, struct prelim_ref *ref,
struct ulist *parents,
struct preftrees *preftrees, struct prelim_ref *ref,
int level, u64 time_seq, const u64 *extent_item_pos,
u64 total_refs, bool ignore_offset)
bool ignore_offset)
{
int ret = 0;
int slot;
@ -424,6 +427,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
u64 disk_byte;
u64 wanted_disk_byte = ref->wanted_disk_byte;
u64 count = 0;
u64 data_offset;
if (level != 0) {
eb = path->nodes[level];
@ -434,18 +438,26 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
}
/*
* We normally enter this function with the path already pointing to
* the first item to check. But sometimes, we may enter it with
* slot==nritems. In that case, go to the next leaf before we continue.
* 1. We normally enter this function with the path already pointing to
* the first item to check. But sometimes, we may enter it with
* slot == nritems.
* 2. We are searching for normal backref but bytenr of this leaf
* matches shared data backref
* 3. The leaf owner is not equal to the root we are searching
*
* For these cases, go to the next leaf before we continue.
*/
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
eb = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(eb) ||
is_shared_data_backref(preftrees, eb->start) ||
ref->root_id != btrfs_header_owner(eb)) {
if (time_seq == SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
}
while (!ret && count < total_refs) {
while (!ret && count < ref->count) {
eb = path->nodes[0];
slot = path->slots[0];
@ -455,13 +467,31 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
key.type != BTRFS_EXTENT_DATA_KEY)
break;
/*
* We are searching for normal backref but bytenr of this leaf
* matches shared data backref, OR
* the leaf owner is not equal to the root we are searching for
*/
if (slot == 0 &&
(is_shared_data_backref(preftrees, eb->start) ||
ref->root_id != btrfs_header_owner(eb))) {
if (time_seq == SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
continue;
}
fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
data_offset = btrfs_file_extent_offset(eb, fi);
if (disk_byte == wanted_disk_byte) {
eie = NULL;
old = NULL;
count++;
if (ref->key_for_search.offset == key.offset - data_offset)
count++;
else
goto next;
if (extent_item_pos) {
ret = check_extent_in_eb(&key, eb, fi,
*extent_item_pos,
@ -502,9 +532,9 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
*/
static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct preftrees *preftrees,
struct prelim_ref *ref, struct ulist *parents,
const u64 *extent_item_pos, u64 total_refs,
bool ignore_offset)
const u64 *extent_item_pos, bool ignore_offset)
{
struct btrfs_root *root;
struct btrfs_key root_key;
@ -512,23 +542,25 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
int ret = 0;
int root_level;
int level = ref->level;
int index;
struct btrfs_key search_key = ref->key_for_search;
root_key.objectid = ref->root_id;
root_key.type = BTRFS_ROOT_ITEM_KEY;
root_key.offset = (u64)-1;
index = srcu_read_lock(&fs_info->subvol_srcu);
root = btrfs_get_fs_root(fs_info, &root_key, false);
if (IS_ERR(root)) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(root);
goto out_free;
}
if (!path->search_commit_root &&
test_bit(BTRFS_ROOT_DELETING, &root->state)) {
ret = -ENOENT;
goto out;
}
if (btrfs_is_testing(fs_info)) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -ENOENT;
goto out;
}
@ -540,21 +572,36 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
else
root_level = btrfs_old_root_level(root, time_seq);
if (root_level + 1 == level) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
if (root_level + 1 == level)
goto out;
}
/*
* We can often find data backrefs with an offset that is too large
* (>= LLONG_MAX, maximum allowed file offset) due to underflows when
* subtracting a file's offset with the data offset of its
* corresponding extent data item. This can happen for example in the
* clone ioctl.
*
* So if we detect such case we set the search key's offset to zero to
* make sure we will find the matching file extent item at
* add_all_parents(), otherwise we will miss it because the offset
* taken form the backref is much larger then the offset of the file
* extent item. This can make us scan a very large number of file
* extent items, but at least it will not make us miss any.
*
* This is an ugly workaround for a behaviour that should have never
* existed, but it does and a fix for the clone ioctl would touch a lot
* of places, cause backwards incompatibility and would not fix the
* problem for extents cloned with older kernels.
*/
if (search_key.type == BTRFS_EXTENT_DATA_KEY &&
search_key.offset >= LLONG_MAX)
search_key.offset = 0;
path->lowest_level = level;
if (time_seq == SEQ_LAST)
ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path,
0, 0);
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
else
ret = btrfs_search_old_slot(root, &ref->key_for_search, path,
time_seq);
/* root node has been locked, we can release @subvol_srcu safely here */
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = btrfs_search_old_slot(root, &search_key, path, time_seq);
btrfs_debug(fs_info,
"search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)",
@ -574,9 +621,11 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
eb = path->nodes[level];
}
ret = add_all_parents(root, path, parents, ref, level, time_seq,
extent_item_pos, total_refs, ignore_offset);
ret = add_all_parents(root, path, parents, preftrees, ref, level,
time_seq, extent_item_pos, ignore_offset);
out:
btrfs_put_root(root);
out_free:
path->lowest_level = 0;
btrfs_release_path(path);
return ret;
@ -609,7 +658,7 @@ unode_aux_to_inode_list(struct ulist_node *node)
static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct preftrees *preftrees,
const u64 *extent_item_pos, u64 total_refs,
const u64 *extent_item_pos,
struct share_check *sc, bool ignore_offset)
{
int err;
@ -653,9 +702,9 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
ret = BACKREF_FOUND_SHARED;
goto out;
}
err = resolve_indirect_ref(fs_info, path, time_seq, ref,
parents, extent_item_pos,
total_refs, ignore_offset);
err = resolve_indirect_ref(fs_info, path, time_seq, preftrees,
ref, parents, extent_item_pos,
ignore_offset);
/*
* we can only tolerate ENOENT,otherwise,we should catch error
* and return directly.
@ -758,8 +807,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
*/
static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head, u64 seq,
struct preftrees *preftrees, u64 *total_refs,
struct share_check *sc)
struct preftrees *preftrees, struct share_check *sc)
{
struct btrfs_delayed_ref_node *node;
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
@ -793,7 +841,6 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
default:
BUG();
}
*total_refs += count;
switch (node->type) {
case BTRFS_TREE_BLOCK_REF_KEY: {
/* NORMAL INDIRECT METADATA backref */
@ -876,7 +923,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
static int add_inline_refs(const struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
int *info_level, struct preftrees *preftrees,
u64 *total_refs, struct share_check *sc)
struct share_check *sc)
{
int ret = 0;
int slot;
@ -900,7 +947,6 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
*total_refs += btrfs_extent_refs(leaf, ei);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
ptr = (unsigned long)(ei + 1);
@ -1125,8 +1171,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct prelim_ref *ref;
struct rb_node *node;
struct extent_inode_elem *eie = NULL;
/* total of both direct AND indirect refs! */
u64 total_refs = 0;
struct preftrees preftrees = {
.direct = PREFTREE_INIT,
.indirect = PREFTREE_INIT,
@ -1195,7 +1239,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
}
spin_unlock(&delayed_refs->lock);
ret = add_delayed_refs(fs_info, head, time_seq,
&preftrees, &total_refs, sc);
&preftrees, sc);
mutex_unlock(&head->mutex);
if (ret)
goto out;
@ -1216,8 +1260,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
(key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY)) {
ret = add_inline_refs(fs_info, path, bytenr,
&info_level, &preftrees,
&total_refs, sc);
&info_level, &preftrees, sc);
if (ret)
goto out;
ret = add_keyed_refs(fs_info, path, bytenr, info_level,
@ -1236,7 +1279,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));
ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
extent_item_pos, total_refs, sc, ignore_offset);
extent_item_pos, sc, ignore_offset);
if (ret)
goto out;
@ -1362,10 +1405,10 @@ static void free_leaf_list(struct ulist *blocks)
*
* returns 0 on success, <0 on error
*/
static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **leafs,
const u64 *extent_item_pos, bool ignore_offset)
int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **leafs,
const u64 *extent_item_pos, bool ignore_offset)
{
int ret;

View File

@ -40,6 +40,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **leafs,
const u64 *extent_item_pos, bool ignore_offset);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **roots, bool ignore_offset);

View File

@ -460,7 +460,7 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
int ret;
while (start < end) {
ret = find_first_extent_bit(info->pinned_extents, start,
ret = find_first_extent_bit(&info->excluded_extents, start,
&extent_start, &extent_end,
EXTENT_DIRTY | EXTENT_UPTODATE,
NULL);
@ -1248,6 +1248,55 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
return ret;
}
static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
struct btrfs_transaction *prev_trans = NULL;
const u64 start = bg->start;
const u64 end = start + bg->length - 1;
int ret;
spin_lock(&fs_info->trans_lock);
if (trans->transaction->list.prev != &fs_info->trans_list) {
prev_trans = list_last_entry(&trans->transaction->list,
struct btrfs_transaction, list);
refcount_inc(&prev_trans->use_count);
}
spin_unlock(&fs_info->trans_lock);
/*
* Hold the unused_bg_unpin_mutex lock to avoid racing with
* btrfs_finish_extent_commit(). If we are at transaction N, another
* task might be running finish_extent_commit() for the previous
* transaction N - 1, and have seen a range belonging to the block
* group in pinned_extents before we were able to clear the whole block
* group range from pinned_extents. This means that task can lookup for
* the block group after we unpinned it from pinned_extents and removed
* it, leading to a BUG_ON() at unpin_extent_range().
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans) {
ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
EXTENT_DIRTY);
if (ret)
goto err;
}
ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
EXTENT_DIRTY);
if (ret)
goto err;
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
return true;
err:
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
btrfs_dec_block_group_ro(bg);
return false;
}
/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
@ -1265,7 +1314,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
u64 start, end;
int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
@ -1344,35 +1392,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* We could have pending pinned extents for this block group,
* just delete them, we don't care about them anymore.
*/
start = block_group->start;
end = start + block_group->length - 1;
/*
* Hold the unused_bg_unpin_mutex lock to avoid racing with
* btrfs_finish_extent_commit(). If we are at transaction N,
* another task might be running finish_extent_commit() for the
* previous transaction N - 1, and have seen a range belonging
* to the block group in freed_extents[] before we were able to
* clear the whole block group range from freed_extents[]. This
* means that task can lookup for the block group after we
* unpinned it from freed_extents[] and removed it, leading to
* a BUG_ON() at btrfs_unpin_extent_range().
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
EXTENT_DIRTY);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
btrfs_dec_block_group_ro(block_group);
if (!clean_pinned_extents(trans, block_group))
goto end_trans;
}
ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
EXTENT_DIRTY);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
btrfs_dec_block_group_ro(block_group);
goto end_trans;
}
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
/*
* At this point, the block_group is read only and should fail
@ -1987,6 +2008,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
btrfs_release_path(path);
}
rcu_read_lock();
list_for_each_entry_rcu(space_info, &info->space_info, list) {
if (!(btrfs_get_alloc_profile(info, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
@ -2007,6 +2029,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
list)
inc_block_group_ro(cache, 1);
}
rcu_read_unlock();
btrfs_init_global_block_rsv(info);
ret = check_chunk_block_group_mappings(info);
@ -2345,7 +2368,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
return 0;
}
if (trans->aborted)
if (TRANS_ABORTED(trans))
return 0;
again:
inode = lookup_free_space_inode(block_group, path);
@ -2881,7 +2904,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
&cache->space_info->total_bytes_pinned,
num_bytes,
BTRFS_TOTAL_BYTES_PINNED_BATCH);
set_extent_dirty(info->pinned_extents,
set_extent_dirty(&trans->transaction->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
}

View File

@ -6,6 +6,98 @@
#include "space-info.h"
#include "transaction.h"
/*
* HOW DO BLOCK RESERVES WORK
*
* Think of block_rsv's as buckets for logically grouped metadata
* reservations. Each block_rsv has a ->size and a ->reserved. ->size is
* how large we want our block rsv to be, ->reserved is how much space is
* currently reserved for this block reserve.
*
* ->failfast exists for the truncate case, and is described below.
*
* NORMAL OPERATION
*
* -> Reserve
* Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill
*
* We call into btrfs_reserve_metadata_bytes() with our bytes, which is
* accounted for in space_info->bytes_may_use, and then add the bytes to
* ->reserved, and ->size in the case of btrfs_block_rsv_add.
*
* ->size is an over-estimation of how much we may use for a particular
* operation.
*
* -> Use
* Entrance: btrfs_use_block_rsv
*
* When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv()
* to determine the appropriate block_rsv to use, and then verify that
* ->reserved has enough space for our tree block allocation. Once
* successful we subtract fs_info->nodesize from ->reserved.
*
* -> Finish
* Entrance: btrfs_block_rsv_release
*
* We are finished with our operation, subtract our individual reservation
* from ->size, and then subtract ->size from ->reserved and free up the
* excess if there is any.
*
* There is some logic here to refill the delayed refs rsv or the global rsv
* as needed, otherwise the excess is subtracted from
* space_info->bytes_may_use.
*
* TYPES OF BLOCK RESERVES
*
* BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK
* These behave normally, as described above, just within the confines of the
* lifetime of their particular operation (transaction for the whole trans
* handle lifetime, for example).
*
* BLOCK_RSV_GLOBAL
* It is impossible to properly account for all the space that may be required
* to make our extent tree updates. This block reserve acts as an overflow
* buffer in case our delayed refs reserve does not reserve enough space to
* update the extent tree.
*
* We can steal from this in some cases as well, notably on evict() or
* truncate() in order to help users recover from ENOSPC conditions.
*
* BLOCK_RSV_DELALLOC
* The individual item sizes are determined by the per-inode size
* calculations, which are described with the delalloc code. This is pretty
* straightforward, it's just the calculation of ->size encodes a lot of
* different items, and thus it gets used when updating inodes, inserting file
* extents, and inserting checksums.
*
* BLOCK_RSV_DELREFS
* We keep a running tally of how many delayed refs we have on the system.
* We assume each one of these delayed refs are going to use a full
* reservation. We use the transaction items and pre-reserve space for every
* operation, and use this reservation to refill any gap between ->size and
* ->reserved that may exist.
*
* From there it's straightforward, removing a delayed ref means we remove its
* count from ->size and free up reservations as necessary. Since this is
* the most dynamic block reserve in the system, we will try to refill this
* block reserve first with any excess returned by any other block reserve.
*
* BLOCK_RSV_EMPTY
* This is the fallback block reserve to make us try to reserve space if we
* don't have a specific bucket for this allocation. It is mostly used for
* updating the device tree and such, since that is a separate pool we're
* content to just reserve space from the space_info on demand.
*
* BLOCK_RSV_TEMP
* This is used by things like truncate and iput. We will temporarily
* allocate a block reserve, set it to some size, and then truncate bytes
* until we have no space left. With ->failfast set we'll simply return
* ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try
* to make a new reservation. This is because these operations are
* unbounded, so we want to do as much work as we can, and then back off and
* re-reserve.
*/
static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes,
@ -111,7 +203,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
{
if (!rsv)
return;
btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
kfree(rsv);
}
@ -178,9 +270,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes, u64 *qgroup_to_release)
u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
u64 *qgroup_to_release)
{
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
@ -297,9 +389,9 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
if (block_rsv->reserved < block_rsv->size) {
num_bytes = block_rsv->size - block_rsv->reserved;
block_rsv->reserved += num_bytes;
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
num_bytes);
block_rsv->reserved = block_rsv->size;
} else if (block_rsv->reserved > block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
@ -344,7 +436,8 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1);
btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1,
NULL);
WARN_ON(fs_info->trans_block_rsv.size > 0);
WARN_ON(fs_info->trans_block_rsv.reserved > 0);
WARN_ON(fs_info->chunk_block_rsv.size > 0);

View File

@ -73,7 +73,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
int min_factor);
void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes, bool update_size);
u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes, u64 *qgroup_to_release);
void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info);
@ -82,20 +82,12 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info);
struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u32 blocksize);
static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes)
{
__btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
}
static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u32 blocksize)
{
btrfs_block_rsv_add_bytes(block_rsv, blocksize, false);
btrfs_block_rsv_release(fs_info, block_rsv, 0);
btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL);
}
#endif /* BTRFS_BLOCK_RSV_H */

View File

@ -60,6 +60,12 @@ struct btrfs_inode {
*/
struct extent_io_tree io_failure_tree;
/*
* Keep track of where the inode has extent items mapped in order to
* make sure the i_size adjustments are accurate
*/
struct extent_io_tree file_extent_tree;
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;

View File

@ -77,7 +77,6 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/mutex.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>
@ -152,11 +151,8 @@ struct btrfsic_block {
struct list_head ref_to_list; /* list */
struct list_head ref_from_list; /* list */
struct btrfsic_block *next_in_same_bio;
void *orig_bio_bh_private;
union {
bio_end_io_t *bio;
bh_end_io_t *bh;
} orig_bio_bh_end_io;
void *orig_bio_private;
bio_end_io_t *orig_bio_end_io;
int submit_bio_bh_rw;
u64 flush_gen; /* only valid if !never_written */
};
@ -325,14 +321,12 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
u64 dev_bytenr, char **mapped_datav,
unsigned int num_pages,
struct bio *bio, int *bio_is_patched,
struct buffer_head *bh,
int submit_bio_bh_rw);
static int btrfsic_process_written_superblock(
struct btrfsic_state *state,
struct btrfsic_block *const block,
struct btrfs_super_block *const super_hdr);
static void btrfsic_bio_end_io(struct bio *bp);
static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
const struct btrfsic_block *block,
int recursion_level);
@ -399,8 +393,8 @@ static void btrfsic_block_init(struct btrfsic_block *b)
b->never_written = 0;
b->mirror_num = 0;
b->next_in_same_bio = NULL;
b->orig_bio_bh_private = NULL;
b->orig_bio_bh_end_io.bio = NULL;
b->orig_bio_private = NULL;
b->orig_bio_end_io = NULL;
INIT_LIST_HEAD(&b->collision_resolving_node);
INIT_LIST_HEAD(&b->all_blocks_node);
INIT_LIST_HEAD(&b->ref_to_list);
@ -767,29 +761,31 @@ static int btrfsic_process_superblock_dev_mirror(
struct btrfs_fs_info *fs_info = state->fs_info;
struct btrfs_super_block *super_tmp;
u64 dev_bytenr;
struct buffer_head *bh;
struct btrfsic_block *superblock_tmp;
int pass;
struct block_device *const superblock_bdev = device->bdev;
struct page *page;
struct address_space *mapping = superblock_bdev->bd_inode->i_mapping;
int ret = 0;
/* super block bytenr is always the unmapped device bytenr */
dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
return -1;
bh = __bread(superblock_bdev, dev_bytenr / BTRFS_BDEV_BLOCKSIZE,
BTRFS_SUPER_INFO_SIZE);
if (NULL == bh)
page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS);
if (IS_ERR(page))
return -1;
super_tmp = (struct btrfs_super_block *)
(bh->b_data + (dev_bytenr & (BTRFS_BDEV_BLOCKSIZE - 1)));
super_tmp = page_address(page);
if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
btrfs_super_nodesize(super_tmp) != state->metablock_size ||
btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
brelse(bh);
return 0;
ret = 0;
goto out;
}
superblock_tmp =
@ -800,8 +796,8 @@ static int btrfsic_process_superblock_dev_mirror(
superblock_tmp = btrfsic_block_alloc();
if (NULL == superblock_tmp) {
pr_info("btrfsic: error, kmalloc failed!\n");
brelse(bh);
return -1;
ret = -1;
goto out;
}
/* for superblock, only the dev_bytenr makes sense */
superblock_tmp->dev_bytenr = dev_bytenr;
@ -885,8 +881,8 @@ static int btrfsic_process_superblock_dev_mirror(
mirror_num)) {
pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n",
next_bytenr, mirror_num);
brelse(bh);
return -1;
ret = -1;
goto out;
}
next_block = btrfsic_block_lookup_or_add(
@ -895,8 +891,8 @@ static int btrfsic_process_superblock_dev_mirror(
mirror_num, NULL);
if (NULL == next_block) {
btrfsic_release_block_ctx(&tmp_next_block_ctx);
brelse(bh);
return -1;
ret = -1;
goto out;
}
next_block->disk_key = tmp_disk_key;
@ -907,16 +903,17 @@ static int btrfsic_process_superblock_dev_mirror(
BTRFSIC_GENERATION_UNKNOWN);
btrfsic_release_block_ctx(&tmp_next_block_ctx);
if (NULL == l) {
brelse(bh);
return -1;
ret = -1;
goto out;
}
}
}
if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
btrfsic_dump_tree_sub(state, superblock_tmp, 0);
brelse(bh);
return 0;
out:
put_page(page);
return ret;
}
static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
@ -1743,7 +1740,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
u64 dev_bytenr, char **mapped_datav,
unsigned int num_pages,
struct bio *bio, int *bio_is_patched,
struct buffer_head *bh,
int submit_bio_bh_rw)
{
int is_metadata;
@ -1902,9 +1898,9 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
block->is_iodone = 0;
BUG_ON(NULL == bio_is_patched);
if (!*bio_is_patched) {
block->orig_bio_bh_private =
block->orig_bio_private =
bio->bi_private;
block->orig_bio_bh_end_io.bio =
block->orig_bio_end_io =
bio->bi_end_io;
block->next_in_same_bio = NULL;
bio->bi_private = block;
@ -1916,25 +1912,17 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
bio->bi_private;
BUG_ON(NULL == chained_block);
block->orig_bio_bh_private =
chained_block->orig_bio_bh_private;
block->orig_bio_bh_end_io.bio =
chained_block->orig_bio_bh_end_io.
bio;
block->orig_bio_private =
chained_block->orig_bio_private;
block->orig_bio_end_io =
chained_block->orig_bio_end_io;
block->next_in_same_bio = chained_block;
bio->bi_private = block;
}
} else if (NULL != bh) {
block->is_iodone = 0;
block->orig_bio_bh_private = bh->b_private;
block->orig_bio_bh_end_io.bh = bh->b_end_io;
block->next_in_same_bio = NULL;
bh->b_private = block;
bh->b_end_io = btrfsic_bh_end_io;
} else {
block->is_iodone = 1;
block->orig_bio_bh_private = NULL;
block->orig_bio_bh_end_io.bio = NULL;
block->orig_bio_private = NULL;
block->orig_bio_end_io = NULL;
block->next_in_same_bio = NULL;
}
}
@ -2042,8 +2030,8 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
block->is_iodone = 0;
BUG_ON(NULL == bio_is_patched);
if (!*bio_is_patched) {
block->orig_bio_bh_private = bio->bi_private;
block->orig_bio_bh_end_io.bio = bio->bi_end_io;
block->orig_bio_private = bio->bi_private;
block->orig_bio_end_io = bio->bi_end_io;
block->next_in_same_bio = NULL;
bio->bi_private = block;
bio->bi_end_io = btrfsic_bio_end_io;
@ -2054,24 +2042,17 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
bio->bi_private;
BUG_ON(NULL == chained_block);
block->orig_bio_bh_private =
chained_block->orig_bio_bh_private;
block->orig_bio_bh_end_io.bio =
chained_block->orig_bio_bh_end_io.bio;
block->orig_bio_private =
chained_block->orig_bio_private;
block->orig_bio_end_io =
chained_block->orig_bio_end_io;
block->next_in_same_bio = chained_block;
bio->bi_private = block;
}
} else if (NULL != bh) {
block->is_iodone = 0;
block->orig_bio_bh_private = bh->b_private;
block->orig_bio_bh_end_io.bh = bh->b_end_io;
block->next_in_same_bio = NULL;
bh->b_private = block;
bh->b_end_io = btrfsic_bh_end_io;
} else {
block->is_iodone = 1;
block->orig_bio_bh_private = NULL;
block->orig_bio_bh_end_io.bio = NULL;
block->orig_bio_private = NULL;
block->orig_bio_end_io = NULL;
block->next_in_same_bio = NULL;
}
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
@ -2112,8 +2093,8 @@ static void btrfsic_bio_end_io(struct bio *bp)
iodone_w_error = 1;
BUG_ON(NULL == block);
bp->bi_private = block->orig_bio_bh_private;
bp->bi_end_io = block->orig_bio_bh_end_io.bio;
bp->bi_private = block->orig_bio_private;
bp->bi_end_io = block->orig_bio_end_io;
do {
struct btrfsic_block *next_block;
@ -2146,38 +2127,6 @@ static void btrfsic_bio_end_io(struct bio *bp)
bp->bi_end_io(bp);
}
static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
{
struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
int iodone_w_error = !uptodate;
struct btrfsic_dev_state *dev_state;
BUG_ON(NULL == block);
dev_state = block->dev_state;
if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
pr_info("bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
iodone_w_error,
btrfsic_get_block_type(dev_state->state, block),
block->logical_bytenr, block->dev_state->name,
block->dev_bytenr, block->mirror_num);
block->iodone_w_error = iodone_w_error;
if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
dev_state->last_flush_gen++;
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
pr_info("bh_end_io() new %s flush_gen=%llu\n",
dev_state->name, dev_state->last_flush_gen);
}
if (block->submit_bio_bh_rw & REQ_FUA)
block->flush_gen = 0; /* FUA completed means block is on disk */
bh->b_private = block->orig_bio_bh_private;
bh->b_end_io = block->orig_bio_bh_end_io.bh;
block->is_iodone = 1; /* for FLUSH, this releases the block */
bh->b_end_io(bh, uptodate);
}
static int btrfsic_process_written_superblock(
struct btrfsic_state *state,
struct btrfsic_block *const superblock,
@ -2730,63 +2679,6 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)
&btrfsic_dev_state_hashtable);
}
int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh)
{
struct btrfsic_dev_state *dev_state;
if (!btrfsic_is_initialized)
return submit_bh(op, op_flags, bh);
mutex_lock(&btrfsic_mutex);
/* since btrfsic_submit_bh() might also be called before
* btrfsic_mount(), this might return NULL */
dev_state = btrfsic_dev_state_lookup(bh->b_bdev->bd_dev);
/* Only called to write the superblock (incl. FLUSH/FUA) */
if (NULL != dev_state &&
(op == REQ_OP_WRITE) && bh->b_size > 0) {
u64 dev_bytenr;
dev_bytenr = BTRFS_BDEV_BLOCKSIZE * bh->b_blocknr;
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
pr_info("submit_bh(op=0x%x,0x%x, blocknr=%llu (bytenr %llu), size=%zu, data=%p, bdev=%p)\n",
op, op_flags, (unsigned long long)bh->b_blocknr,
dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev);
btrfsic_process_written_block(dev_state, dev_bytenr,
&bh->b_data, 1, NULL,
NULL, bh, op_flags);
} else if (NULL != dev_state && (op_flags & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
pr_info("submit_bh(op=0x%x,0x%x FLUSH, bdev=%p)\n",
op, op_flags, bh->b_bdev);
if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
if ((dev_state->state->print_mask &
(BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
BTRFSIC_PRINT_MASK_VERBOSE)))
pr_info("btrfsic_submit_bh(%s) with FLUSH but dummy block already in use (ignored)!\n",
dev_state->name);
} else {
struct btrfsic_block *const block =
&dev_state->dummy_block_for_bio_bh_flush;
block->is_iodone = 0;
block->never_written = 0;
block->iodone_w_error = 0;
block->flush_gen = dev_state->last_flush_gen + 1;
block->submit_bio_bh_rw = op_flags;
block->orig_bio_bh_private = bh->b_private;
block->orig_bio_bh_end_io.bh = bh->b_end_io;
block->next_in_same_bio = NULL;
bh->b_private = block;
bh->b_end_io = btrfsic_bh_end_io;
}
}
mutex_unlock(&btrfsic_mutex);
return submit_bh(op, op_flags, bh);
}
static void __btrfsic_submit_bio(struct bio *bio)
{
struct btrfsic_dev_state *dev_state;
@ -2838,7 +2730,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
btrfsic_process_written_block(dev_state, dev_bytenr,
mapped_datav, segs,
bio, &bio_is_patched,
NULL, bio->bi_opf);
bio->bi_opf);
bio_for_each_segment(bvec, bio, iter)
kunmap(bvec.bv_page);
kfree(mapped_datav);
@ -2862,8 +2754,8 @@ static void __btrfsic_submit_bio(struct bio *bio)
block->iodone_w_error = 0;
block->flush_gen = dev_state->last_flush_gen + 1;
block->submit_bio_bh_rw = bio->bi_opf;
block->orig_bio_bh_private = bio->bi_private;
block->orig_bio_bh_end_io.bio = bio->bi_end_io;
block->orig_bio_private = bio->bi_private;
block->orig_bio_end_io = bio->bi_end_io;
block->next_in_same_bio = NULL;
bio->bi_private = block;
bio->bi_end_io = btrfsic_bio_end_io;

View File

@ -7,11 +7,9 @@
#define BTRFS_CHECK_INTEGRITY_H
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh);
void btrfsic_submit_bio(struct bio *bio);
int btrfsic_submit_bio_wait(struct bio *bio);
#else
#define btrfsic_submit_bh submit_bh
#define btrfsic_submit_bio submit_bio
#define btrfsic_submit_bio_wait submit_bio_wait
#endif

View File

@ -31,8 +31,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
static const struct btrfs_csums {
u16 size;
const char *name;
const char *driver;
const char name[10];
const char driver[12];
} btrfs_csums[] = {
[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
@ -63,7 +63,8 @@ const char *btrfs_super_csum_name(u16 csum_type)
const char *btrfs_super_csum_driver(u16 csum_type)
{
/* csum type is validated at mount time */
return btrfs_csums[csum_type].driver ?:
return btrfs_csums[csum_type].driver[0] ?
btrfs_csums[csum_type].driver :
btrfs_csums[csum_type].name;
}
@ -143,44 +144,6 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
return eb;
}
/* loop around taking references on and locking the root node of the
* tree until you end up with a lock on the root. A locked buffer
* is returned, with a reference held.
*/
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
btrfs_tree_lock(eb);
if (eb == root->node)
break;
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
}
return eb;
}
/* loop around taking references on and locking the root node of the
* tree until you end up with a lock on the root. A locked buffer
* is returned, with a reference held.
*/
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
btrfs_tree_read_lock(eb);
if (eb == root->node)
break;
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
}
return eb;
}
/* cowonly root (everything not a reference counted cow subvolume), just get
* put onto a simple dirty list. transaction.c walks this to make sure they
* get properly updated on disk.
@ -341,7 +304,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct rb_root *tm_root;
struct rb_node *node;
struct rb_node *next;
struct seq_list *cur_elem;
struct tree_mod_elem *tm;
u64 min_seq = (u64)-1;
u64 seq_putting = elem->seq;
@ -353,18 +315,20 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
list_del(&elem->list);
elem->seq = 0;
list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
if (cur_elem->seq < min_seq) {
if (seq_putting > cur_elem->seq) {
/*
* blocker with lower sequence number exists, we
* cannot remove anything from the log
*/
write_unlock(&fs_info->tree_mod_log_lock);
return;
}
min_seq = cur_elem->seq;
if (!list_empty(&fs_info->tree_mod_seq_list)) {
struct seq_list *first;
first = list_first_entry(&fs_info->tree_mod_seq_list,
struct seq_list, list);
if (seq_putting > first->seq) {
/*
* Blocker with lower sequence number exists, we
* cannot remove anything from the log.
*/
write_unlock(&fs_info->tree_mod_log_lock);
return;
}
min_seq = first->seq;
}
/*
@ -962,9 +926,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (new_flags != 0) {
int level = btrfs_header_level(buf);
ret = btrfs_set_disk_extent_flags(trans,
buf->start,
buf->len,
ret = btrfs_set_disk_extent_flags(trans, buf,
new_flags, level, 0);
if (ret)
return ret;

View File

@ -33,6 +33,7 @@
#include "extent_map.h"
#include "async-thread.h"
#include "block-rsv.h"
#include "locking.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
@ -596,8 +597,8 @@ struct btrfs_fs_info {
/* keep track of unallocated space */
atomic64_t free_chunk_space;
struct extent_io_tree freed_extents[2];
struct extent_io_tree *pinned_extents;
/* Track ranges which are used by log trees blocks/logged data extents */
struct extent_io_tree excluded_extents;
/* logical->physical extent mapping */
struct extent_map_tree mapping_tree;
@ -696,7 +697,6 @@ struct btrfs_fs_info {
struct rw_semaphore cleanup_work_sem;
struct rw_semaphore subvol_sem;
struct srcu_struct subvol_srcu;
spinlock_t trans_lock;
/*
@ -947,6 +947,10 @@ struct btrfs_fs_info {
#ifdef CONFIG_BTRFS_DEBUG
struct kobject *debug_kobj;
struct kobject *discard_debug_kobj;
struct list_head allocated_roots;
spinlock_t eb_leak_lock;
struct list_head allocated_ebs;
#endif
};
@ -955,11 +959,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
return sb->s_fs_info;
}
struct btrfs_subvolume_writers {
struct percpu_counter counter;
wait_queue_head_t wait;
};
/*
* The state of btrfs root
*/
@ -1131,8 +1130,9 @@ struct btrfs_root {
* root_item_lock.
*/
int dedupe_in_progress;
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshotted;
/* For exclusion of snapshot creation and nocow writes */
struct btrfs_drew_lock snapshot_lock;
atomic_t snapshot_force_cow;
/* For qgroup metadata reserved space */
@ -1149,6 +1149,10 @@ struct btrfs_root {
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
#endif
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
#endif
};
struct btrfs_clone_extent_info {
@ -1971,16 +1975,6 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
btrfs_set_header_flags(eb, flags);
}
static inline unsigned long btrfs_header_fsid(void)
{
return offsetof(struct btrfs_header, fsid);
}
static inline unsigned long btrfs_header_chunk_tree_uuid(const struct extent_buffer *eb)
{
return offsetof(struct btrfs_header, chunk_tree_uuid);
}
static inline int btrfs_is_leaf(const struct extent_buffer *eb)
{
return btrfs_header_level(eb) == 0;
@ -2458,9 +2452,9 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags);
int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num, int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
@ -2490,13 +2484,13 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 flags,
struct extent_buffer *eb, u64 flags,
int level, int is_data);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start,
int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
u64 len);
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
@ -2665,9 +2659,8 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
return btrfs_next_old_item(root, p, 0);
}
int btrfs_leaf_free_space(struct extent_buffer *leaf);
int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
int update_ref, int for_reloc);
int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
@ -2695,23 +2688,6 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info);
}
static inline void free_fs_info(struct btrfs_fs_info *fs_info)
{
kfree(fs_info->balance_ctl);
kfree(fs_info->delayed_root);
kfree(fs_info->extent_root);
kfree(fs_info->tree_root);
kfree(fs_info->chunk_root);
kfree(fs_info->dev_root);
kfree(fs_info->csum_root);
kfree(fs_info->quota_root);
kfree(fs_info->uuid_root);
kfree(fs_info->free_space_root);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
kvfree(fs_info);
}
/* tree mod log functions from ctree.c */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem);
@ -2750,9 +2726,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
u64));
int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info);
/* dir-item.c */
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
@ -2859,6 +2833,12 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
struct btrfs_file_extent_item *fi,
const bool new_inline,
struct extent_map *em);
int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len);
int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len);
void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size);
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
@ -2996,9 +2976,6 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@ -3008,6 +2985,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
unsigned long new_flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
u64 subvol_objectid);
static inline __printf(2, 3) __cold
void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
@ -3401,6 +3380,7 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
/* scrub.c */
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,

View File

@ -9,6 +9,108 @@
#include "qgroup.h"
#include "block-group.h"
/*
* HOW DOES THIS WORK
*
* There are two stages to data reservations, one for data and one for metadata
* to handle the new extents and checksums generated by writing data.
*
*
* DATA RESERVATION
* The general flow of the data reservation is as follows
*
* -> Reserve
* We call into btrfs_reserve_data_bytes() for the user request bytes that
* they wish to write. We make this reservation and add it to
* space_info->bytes_may_use. We set EXTENT_DELALLOC on the inode io_tree
* for the range and carry on if this is buffered, or follow up trying to
* make a real allocation if we are pre-allocating or doing O_DIRECT.
*
* -> Use
* At writepages()/prealloc/O_DIRECT time we will call into
* btrfs_reserve_extent() for some part or all of this range of bytes. We
* will make the allocation and subtract space_info->bytes_may_use by the
* original requested length and increase the space_info->bytes_reserved by
* the allocated length. This distinction is important because compression
* may allocate a smaller on disk extent than we previously reserved.
*
* -> Allocation
* finish_ordered_io() will insert the new file extent item for this range,
* and then add a delayed ref update for the extent tree. Once that delayed
* ref is written the extent size is subtracted from
* space_info->bytes_reserved and added to space_info->bytes_used.
*
* Error handling
*
* -> By the reservation maker
* This is the simplest case, we haven't completed our operation and we know
* how much we reserved, we can simply call
* btrfs_free_reserved_data_space*() and it will be removed from
* space_info->bytes_may_use.
*
* -> After the reservation has been made, but before cow_file_range()
* This is specifically for the delalloc case. You must clear
* EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will
* be subtracted from space_info->bytes_may_use.
*
* METADATA RESERVATION
* The general metadata reservation lifetimes are discussed elsewhere, this
* will just focus on how it is used for delalloc space.
*
* We keep track of two things on a per inode bases
*
* ->outstanding_extents
* This is the number of file extent items we'll need to handle all of the
* outstanding DELALLOC space we have in this inode. We limit the maximum
* size of an extent, so a large contiguous dirty area may require more than
* one outstanding_extent, which is why count_max_extents() is used to
* determine how many outstanding_extents get added.
*
* ->csum_bytes
* This is essentially how many dirty bytes we have for this inode, so we
* can calculate the number of checksum items we would have to add in order
* to checksum our outstanding data.
*
* We keep a per-inode block_rsv in order to make it easier to keep track of
* our reservation. We use btrfs_calculate_inode_block_rsv_size() to
* calculate the current theoretical maximum reservation we would need for the
* metadata for this inode. We call this and then adjust our reservation as
* necessary, either by attempting to reserve more space, or freeing up excess
* space.
*
* OUTSTANDING_EXTENTS HANDLING
*
* ->outstanding_extents is used for keeping track of how many extents we will
* need to use for this inode, and it will fluctuate depending on where you are
* in the life cycle of the dirty data. Consider the following normal case for
* a completely clean inode, with a num_bytes < our maximum allowed extent size
*
* -> reserve
* ->outstanding_extents += 1 (current value is 1)
*
* -> set_delalloc
* ->outstanding_extents += 1 (currrent value is 2)
*
* -> btrfs_delalloc_release_extents()
* ->outstanding_extents -= 1 (current value is 1)
*
* We must call this once we are done, as we hold our reservation for the
* duration of our operation, and then assume set_delalloc will update the
* counter appropriately.
*
* -> add ordered extent
* ->outstanding_extents += 1 (current value is 2)
*
* -> btrfs_clear_delalloc_extent
* ->outstanding_extents -= 1 (current value is 1)
*
* -> finish_ordered_io/btrfs_remove_ordered_extent
* ->outstanding_extents -= 1 (current value is 0)
*
* Each stage is responsible for their own accounting of the extent, thus
* making error handling and cleanup easier.
*/
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
@ -228,8 +330,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
* are releasing 0 bytes, and then we'll just get the reservation over
* the size free'd.
*/
released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
&qgroup_to_release);
released = btrfs_block_rsv_release(fs_info, block_rsv, 0,
&qgroup_to_release);
if (released > 0)
trace_btrfs_space_reservation(fs_info, "delalloc",
btrfs_ino(inode), released, 0);

View File

@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <linux/iversion.h>
#include <linux/sched/mm.h>
#include "misc.h"
#include "delayed-inode.h"
#include "disk-io.h"
@ -595,8 +596,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
trace_btrfs_space_reservation(fs_info, "delayed_item",
item->key.objectid, item->bytes_reserved,
0);
btrfs_block_rsv_release(fs_info, rsv,
item->bytes_reserved);
btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
}
static int btrfs_delayed_inode_reserve_metadata(
@ -677,8 +677,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
rsv = &fs_info->delayed_block_rsv;
trace_btrfs_space_reservation(fs_info, "delayed_inode",
node->inode_id, node->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, rsv,
node->bytes_reserved);
btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL);
if (qgroup_free)
btrfs_qgroup_free_meta_prealloc(node->root,
node->bytes_reserved);
@ -805,11 +804,14 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_delayed_item *delayed_item)
{
struct extent_buffer *leaf;
unsigned int nofs_flag;
char *ptr;
int ret;
nofs_flag = memalloc_nofs_save();
ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
delayed_item->data_len);
memalloc_nofs_restore(nofs_flag);
if (ret < 0 && ret != -EEXIST)
return ret;
@ -937,6 +939,7 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_delayed_node *node)
{
struct btrfs_delayed_item *curr, *prev;
unsigned int nofs_flag;
int ret = 0;
do_again:
@ -945,7 +948,9 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
if (!curr)
goto delete_fail;
nofs_flag = memalloc_nofs_save();
ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
memalloc_nofs_restore(nofs_flag);
if (ret < 0)
goto delete_fail;
else if (ret > 0) {
@ -1012,6 +1017,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
unsigned int nofs_flag;
int mod;
int ret;
@ -1024,7 +1030,9 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
else
mod = 1;
nofs_flag = memalloc_nofs_save();
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
memalloc_nofs_restore(nofs_flag);
if (ret > 0) {
btrfs_release_path(path);
return -ENOENT;
@ -1075,7 +1083,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = -1;
nofs_flag = memalloc_nofs_save();
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
memalloc_nofs_restore(nofs_flag);
if (ret < 0)
goto err_out;
ASSERT(ret);
@ -1139,7 +1150,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
int ret = 0;
bool count = (nr > 0);
if (trans->aborted)
if (TRANS_ABORTED(trans))
return -EIO;
path = btrfs_alloc_path();
@ -1760,6 +1771,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
int btrfs_fill_inode(struct inode *inode, u32 *rdev)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct btrfs_delayed_node *delayed_node;
struct btrfs_inode_item *inode_item;
@ -1779,6 +1791,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
round_up(i_size_read(inode), fs_info->sectorsize));
inode->i_mode = btrfs_stack_inode_mode(inode_item);
set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));

View File

@ -70,7 +70,7 @@ struct btrfs_delayed_item {
refcount_t refs;
int ins_or_del;
u32 data_len;
char data[0];
char data[];
};
static inline void btrfs_init_delayed_root(

View File

@ -82,8 +82,7 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
u64 released = 0;
released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes,
NULL);
released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
if (released)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
0, released, 0);

View File

@ -22,6 +22,46 @@
#include "dev-replace.h"
#include "sysfs.h"
/*
* Device replace overview
*
* [Objective]
* To copy all extents (both new and on-disk) from source device to target
* device, while still keeping the filesystem read-write.
*
* [Method]
* There are two main methods involved:
*
* - Write duplication
*
* All new writes will be written to both target and source devices, so even
* if replace gets canceled, sources device still contans up-to-date data.
*
* Location: handle_ops_on_dev_replace() from __btrfs_map_block()
* Start: btrfs_dev_replace_start()
* End: btrfs_dev_replace_finishing()
* Content: Latest data/metadata
*
* - Copy existing extents
*
* This happens by re-using scrub facility, as scrub also iterates through
* existing extents from commit root.
*
* Location: scrub_write_block_to_dev_replace() from
* scrub_block_complete()
* Content: Data/meta from commit root.
*
* Due to the content difference, we need to avoid nocow write when dev-replace
* is happening. This is done by marking the block group read-only and waiting
* for NOCOW writes.
*
* After replace is done, the finishing part is done by swapping the target and
* source devices.
*
* Location: btrfs_dev_replace_update_device_in_mapping_tree() from
* btrfs_dev_replace_finishing()
*/
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret);
static void btrfs_dev_replace_update_device_in_mapping_tree(
@ -472,7 +512,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
up_write(&dev_replace->rwsem);
ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device);
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
@ -703,7 +743,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/* replace the sysfs entry */
btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device);
btrfs_sysfs_update_devid(tgt_device);
btrfs_rm_dev_replace_free_srcdev(src_device);

File diff suppressed because it is too large Load Diff

View File

@ -39,6 +39,8 @@ static inline u64 btrfs_sb_offset(int mirror)
struct btrfs_device;
struct btrfs_fs_devices;
void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_verify_level_key(struct extent_buffer *eb, int level,
struct btrfs_key *first_key, u64 parent_transid);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
@ -54,15 +56,12 @@ int __cold open_ctree(struct super_block *sb,
char *options);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
struct buffer_head **bh_ret);
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
int copy_num);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
struct btrfs_key *location);
int btrfs_init_fs_root(struct btrfs_root *root);
struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
u64 root_id);
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
struct btrfs_key *key);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
@ -70,19 +69,13 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_key *key,
bool check_ref);
static inline struct btrfs_root *
btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
struct btrfs_key *location)
{
return btrfs_get_fs_root(fs_info, location, true);
}
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
void btrfs_free_fs_root(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
@ -95,19 +88,16 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
* If you want to ensure the whole tree is safe, you should use
* fs_info->subvol_srcu
*/
static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
{
if (!root)
return NULL;
if (refcount_inc_not_zero(&root->refs))
return root;
return NULL;
}
static inline void btrfs_put_fs_root(struct btrfs_root *root)
{
if (refcount_dec_and_test(&root->refs))
kfree(root);
}
void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);

View File

@ -57,16 +57,14 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
return type;
}
static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
u64 root_objectid, u32 generation,
int check_generation)
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
u64 root_objectid, u32 generation,
int check_generation)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
struct inode *inode;
struct btrfs_key key;
int index;
int err = 0;
if (objectid < BTRFS_FIRST_FREE_OBJECTID)
return ERR_PTR(-ESTALE);
@ -75,25 +73,18 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
index = srcu_read_lock(&fs_info->subvol_srcu);
root = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(root)) {
err = PTR_ERR(root);
goto fail;
}
root = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(root))
return ERR_CAST(root);
key.objectid = objectid;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
inode = btrfs_iget(sb, &key, root);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail;
}
srcu_read_unlock(&fs_info->subvol_srcu, index);
btrfs_put_root(root);
if (IS_ERR(inode))
return ERR_CAST(inode);
if (check_generation && generation != inode->i_generation) {
iput(inode);
@ -101,9 +92,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
}
return d_obtain_alias(inode);
fail:
srcu_read_unlock(&fs_info->subvol_srcu, index);
return ERR_PTR(err);
}
static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@ -152,7 +140,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
}
static struct dentry *btrfs_get_parent(struct dentry *child)
struct dentry *btrfs_get_parent(struct dentry *child)
{
struct inode *dir = d_inode(child);
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);

View File

@ -18,4 +18,9 @@ struct btrfs_fid {
u64 parent_root_objectid;
} __attribute__ ((packed));
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
u64 root_objectid, u32 generation,
int check_generation);
struct dentry *btrfs_get_parent(struct dentry *child);
#endif

View File

@ -36,13 +36,14 @@ struct io_failure_record;
#define CHUNK_TRIMMED EXTENT_DEFRAG
enum {
IO_TREE_FS_INFO_FREED_EXTENTS0,
IO_TREE_FS_INFO_FREED_EXTENTS1,
IO_TREE_FS_PINNED_EXTENTS,
IO_TREE_FS_EXCLUDED_EXTENTS,
IO_TREE_INODE_IO,
IO_TREE_INODE_IO_FAILURE,
IO_TREE_RELOC_BLOCKS,
IO_TREE_TRANS_DIRTY_PAGES,
IO_TREE_ROOT_DIRTY_LOG_PAGES,
IO_TREE_INODE_FILE_EXTENT,
IO_TREE_SELFTEST,
};
@ -222,6 +223,8 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
struct extent_state **cached_state);
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, unsigned bits);
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, unsigned bits);
int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,

View File

@ -64,10 +64,8 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes)
{
u64 end = start + num_bytes - 1;
set_extent_bits(&fs_info->freed_extents[0],
start, end, EXTENT_UPTODATE);
set_extent_bits(&fs_info->freed_extents[1],
start, end, EXTENT_UPTODATE);
set_extent_bits(&fs_info->excluded_extents, start, end,
EXTENT_UPTODATE);
return 0;
}
@ -79,10 +77,8 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
start = cache->start;
end = start + cache->length - 1;
clear_extent_bits(&fs_info->freed_extents[0],
start, end, EXTENT_UPTODATE);
clear_extent_bits(&fs_info->freed_extents[1],
start, end, EXTENT_UPTODATE);
clear_extent_bits(&fs_info->excluded_extents, start, end,
EXTENT_UPTODATE);
}
static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
@ -1193,24 +1189,6 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
return ret;
}
static int insert_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 bytenr, u64 parent, u64 root_objectid,
u64 owner, u64 offset, int refs_to_add)
{
int ret;
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
BUG_ON(refs_to_add != 1);
ret = insert_tree_block_ref(trans, path, bytenr, parent,
root_objectid);
} else {
ret = insert_extent_data_ref(trans, path, bytenr, parent,
root_objectid, owner, offset,
refs_to_add);
}
return ret;
}
static int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
@ -1469,7 +1447,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
@ -1494,11 +1471,17 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* now insert the actual backref */
ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
owner, offset, refs_to_add);
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
BUG_ON(refs_to_add != 1);
ret = insert_tree_block_ref(trans, path, bytenr, parent,
root_objectid);
} else {
ret = insert_extent_data_ref(trans, path, bytenr, parent,
root_objectid, owner, offset,
refs_to_add);
}
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@ -1583,7 +1566,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
int err = 0;
int metadata = !extent_op->is_data;
if (trans->aborted)
if (TRANS_ABORTED(trans))
return 0;
if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
@ -1604,7 +1587,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
}
again:
path->reada = READA_FORWARD;
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
if (ret < 0) {
@ -1703,10 +1685,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
{
int ret = 0;
if (trans->aborted) {
if (TRANS_ABORTED(trans)) {
if (insert_reserved)
btrfs_pin_extent(trans->fs_info, node->bytenr,
node->num_bytes, 1);
btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
return 0;
}
@ -1721,8 +1702,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
else
BUG();
if (ret && insert_reserved)
btrfs_pin_extent(trans->fs_info, node->bytenr,
node->num_bytes, 1);
btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
return ret;
}
@ -1867,8 +1847,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
if (head->must_insert_reserved) {
btrfs_pin_extent(fs_info, head->bytenr,
head->num_bytes, 1);
btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
if (head->is_data) {
ret = btrfs_del_csums(trans, fs_info->csum_root,
head->bytenr, head->num_bytes);
@ -2191,7 +2170,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
int run_all = count == (unsigned long)-1;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
if (TRANS_ABORTED(trans))
return 0;
if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
@ -2238,7 +2217,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 flags,
struct extent_buffer *eb, u64 flags,
int level, int is_data)
{
struct btrfs_delayed_extent_op *extent_op;
@ -2254,7 +2233,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->is_data = is_data ? true : false;
extent_op->level = level;
ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
if (ret)
btrfs_free_delayed_extent_op(extent_op);
return ret;
@ -2588,7 +2567,8 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
return bytenr;
}
static int pin_down_extent(struct btrfs_block_group *cache,
static int pin_down_extent(struct btrfs_trans_handle *trans,
struct btrfs_block_group *cache,
u64 bytenr, u64 num_bytes, int reserved)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
@ -2607,22 +2587,20 @@ static int pin_down_extent(struct btrfs_block_group *cache,
percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
set_extent_dirty(fs_info->pinned_extents, bytenr,
set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
return 0;
}
int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
int btrfs_pin_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, int reserved)
{
struct btrfs_block_group *cache;
ASSERT(fs_info->running_transaction);
cache = btrfs_lookup_block_group(fs_info, bytenr);
cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
BUG_ON(!cache); /* Logic error */
pin_down_extent(cache, bytenr, num_bytes, reserved);
pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
btrfs_put_block_group(cache);
return 0;
@ -2631,13 +2609,15 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
/*
* this function must be called within transaction
*/
int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes)
{
struct btrfs_block_group *cache;
int ret;
cache = btrfs_lookup_block_group(fs_info, bytenr);
btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes);
cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
if (!cache)
return -EINVAL;
@ -2649,7 +2629,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
*/
btrfs_cache_block_group(cache, 1);
pin_down_extent(cache, bytenr, num_bytes, 0);
pin_down_extent(trans, cache, bytenr, num_bytes, 0);
/* remove us from the free space cache (if we're there at all) */
ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
@ -2763,11 +2743,6 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
}
}
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
fs_info->pinned_extents = &fs_info->freed_extents[1];
else
fs_info->pinned_extents = &fs_info->freed_extents[0];
up_write(&fs_info->commit_root_sem);
btrfs_update_global_block_rsv(fs_info);
@ -2908,12 +2883,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
u64 end;
int ret;
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
unpin = &fs_info->freed_extents[1];
else
unpin = &fs_info->freed_extents[0];
unpin = &trans->transaction->pinned_extents;
while (!trans->aborted) {
while (!TRANS_ABORTED(trans)) {
struct extent_state *cached_state = NULL;
mutex_lock(&fs_info->unused_bg_unpin_mutex);
@ -2923,6 +2895,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
clear_extent_bits(&fs_info->excluded_extents, start,
end, EXTENT_UPTODATE);
if (btrfs_test_opt(fs_info, DISCARD_SYNC))
ret = btrfs_discard_extent(fs_info, start,
@ -2950,7 +2925,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
u64 trimmed = 0;
ret = -EROFS;
if (!trans->aborted)
if (!TRANS_ABORTED(trans))
ret = btrfs_discard_extent(fs_info,
block_group->start,
block_group->length,
@ -3000,7 +2975,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
path->reada = READA_FORWARD;
path->leave_spinning = 1;
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
@ -3301,7 +3275,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
cache = btrfs_lookup_block_group(fs_info, buf->start);
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
pin_down_extent(cache, buf->start, buf->len, 1);
pin_down_extent(trans, cache, buf->start, buf->len, 1);
btrfs_put_block_group(cache);
goto out;
}
@ -3345,7 +3319,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
(ref->type == BTRFS_REF_DATA &&
ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
/* unlocks the pinned mutex */
btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
old_ref_mod = new_ref_mod = 0;
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
@ -3438,6 +3412,10 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
btrfs_put_block_group(cache);
}
enum btrfs_extent_allocation_policy {
BTRFS_EXTENT_ALLOC_CLUSTERED,
};
/*
* Structure used internally for find_free_extent() function. Wraps needed
* parameters.
@ -3454,6 +3432,8 @@ struct find_free_extent_ctl {
/* For clustered allocation */
u64 empty_cluster;
struct btrfs_free_cluster *last_ptr;
bool use_cluster;
bool have_caching_bg;
bool orig_have_caching_bg;
@ -3489,6 +3469,12 @@ struct find_free_extent_ctl {
/* Found result */
u64 found_offset;
/* Hint where to start looking for an empty space */
u64 hint_byte;
/* Allocation policy */
enum btrfs_extent_allocation_policy policy;
};
@ -3501,11 +3487,11 @@ struct find_free_extent_ctl {
* Return 0 means we have found a location and set ffe_ctl->found_offset.
*/
static int find_free_extent_clustered(struct btrfs_block_group *bg,
struct btrfs_free_cluster *last_ptr,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_block_group **cluster_bg_ret)
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_block_group **cluster_bg_ret)
{
struct btrfs_block_group *cluster_bg;
struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
u64 aligned_cluster;
u64 offset;
int ret;
@ -3605,9 +3591,9 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg,
* Return -EAGAIN to inform caller that we need to re-search this block group
*/
static int find_free_extent_unclustered(struct btrfs_block_group *bg,
struct btrfs_free_cluster *last_ptr,
struct find_free_extent_ctl *ffe_ctl)
struct find_free_extent_ctl *ffe_ctl)
{
struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
u64 offset;
/*
@ -3663,16 +3649,101 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg,
return 0;
}
static int do_allocation_clustered(struct btrfs_block_group *block_group,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_block_group **bg_ret)
{
int ret;
/* We want to try and use the cluster allocator, so lets look there */
if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
if (ret >= 0 || ret == -EAGAIN)
return ret;
/* ret == -ENOENT case falls through */
}
return find_free_extent_unclustered(block_group, ffe_ctl);
}
static int do_allocation(struct btrfs_block_group *block_group,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_block_group **bg_ret)
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
default:
BUG();
}
}
static void release_block_group(struct btrfs_block_group *block_group,
struct find_free_extent_ctl *ffe_ctl,
int delalloc)
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
ffe_ctl->retry_clustered = false;
ffe_ctl->retry_unclustered = false;
break;
default:
BUG();
}
BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
ffe_ctl->index);
btrfs_release_block_group(block_group, delalloc);
}
static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
struct btrfs_key *ins)
{
struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
if (!ffe_ctl->use_cluster && last_ptr) {
spin_lock(&last_ptr->lock);
last_ptr->window_start = ins->objectid;
spin_unlock(&last_ptr->lock);
}
}
static void found_extent(struct find_free_extent_ctl *ffe_ctl,
struct btrfs_key *ins)
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
found_extent_clustered(ffe_ctl, ins);
break;
default:
BUG();
}
}
static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
/*
* If we can't allocate a new chunk we've already looped through
* at least once, move on to the NO_EMPTY_SIZE case.
*/
ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
return 0;
default:
BUG();
}
}
/*
* Return >0 means caller needs to re-search for free extent
* Return 0 means we have the needed free extent.
* Return <0 means we failed to locate any free extent.
*/
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
struct btrfs_free_cluster *last_ptr,
struct btrfs_key *ins,
struct find_free_extent_ctl *ffe_ctl,
int full_search, bool use_cluster)
bool full_search)
{
struct btrfs_root *root = fs_info->extent_root;
int ret;
@ -3689,11 +3760,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return 1;
if (ins->objectid) {
if (!use_cluster && last_ptr) {
spin_lock(&last_ptr->lock);
last_ptr->window_start = ins->objectid;
spin_unlock(&last_ptr->lock);
}
found_extent(ffe_ctl, ins);
return 0;
}
@ -3739,16 +3806,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
CHUNK_ALLOC_FORCE);
/*
* If we can't allocate a new chunk we've already looped
* through at least once, move on to the NO_EMPTY_SIZE
* case.
*/
if (ret == -ENOSPC)
ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
/* Do not bail out on ENOSPC since we can do more. */
if (ret < 0 && ret != -ENOSPC)
if (ret == -ENOSPC)
ret = chunk_allocation_failed(ffe_ctl);
else if (ret < 0)
btrfs_abort_transaction(trans, ret);
else
ret = 0;
@ -3759,6 +3820,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
}
if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
return -ENOSPC;
/*
* Don't loop again if we already have no empty_size and
* no empty_cluster.
@ -3774,6 +3838,71 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
struct btrfs_key *ins)
{
/*
* If our free space is heavily fragmented we may not be able to make
* big contiguous allocations, so instead of doing the expensive search
* for free space, simply return ENOSPC with our max_extent_size so we
* can go ahead and search for a more manageable chunk.
*
* If our max_extent_size is large enough for our allocation simply
* disable clustering since we will likely not be able to find enough
* space to create a cluster and induce latency trying.
*/
if (space_info->max_extent_size) {
spin_lock(&space_info->lock);
if (space_info->max_extent_size &&
ffe_ctl->num_bytes > space_info->max_extent_size) {
ins->offset = space_info->max_extent_size;
spin_unlock(&space_info->lock);
return -ENOSPC;
} else if (space_info->max_extent_size) {
ffe_ctl->use_cluster = false;
}
spin_unlock(&space_info->lock);
}
ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
&ffe_ctl->empty_cluster);
if (ffe_ctl->last_ptr) {
struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
spin_lock(&last_ptr->lock);
if (last_ptr->block_group)
ffe_ctl->hint_byte = last_ptr->window_start;
if (last_ptr->fragmented) {
/*
* We still set window_start so we can keep track of the
* last place we found an allocation to try and save
* some time.
*/
ffe_ctl->hint_byte = last_ptr->window_start;
ffe_ctl->use_cluster = false;
}
spin_unlock(&last_ptr->lock);
}
return 0;
}
static int prepare_allocation(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
struct btrfs_key *ins)
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
return prepare_allocation_clustered(fs_info, ffe_ctl,
space_info, ins);
default:
BUG();
}
}
/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
@ -3801,16 +3930,14 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
*/
static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
u64 ram_bytes, u64 num_bytes, u64 empty_size,
u64 hint_byte, struct btrfs_key *ins,
u64 hint_byte_orig, struct btrfs_key *ins,
u64 flags, int delalloc)
{
int ret = 0;
int cache_block_group_error = 0;
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group *block_group = NULL;
struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
bool use_cluster = true;
bool full_search = false;
WARN_ON(num_bytes < fs_info->sectorsize);
@ -3819,13 +3946,19 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
ffe_ctl.empty_size = empty_size;
ffe_ctl.flags = flags;
ffe_ctl.search_start = 0;
ffe_ctl.retry_clustered = false;
ffe_ctl.retry_unclustered = false;
ffe_ctl.delalloc = delalloc;
ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
ffe_ctl.have_caching_bg = false;
ffe_ctl.orig_have_caching_bg = false;
ffe_ctl.found_offset = 0;
ffe_ctl.hint_byte = hint_byte_orig;
ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
/* For clustered allocation */
ffe_ctl.retry_clustered = false;
ffe_ctl.retry_unclustered = false;
ffe_ctl.last_ptr = NULL;
ffe_ctl.use_cluster = true;
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
@ -3839,51 +3972,14 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
/*
* If our free space is heavily fragmented we may not be able to make
* big contiguous allocations, so instead of doing the expensive search
* for free space, simply return ENOSPC with our max_extent_size so we
* can go ahead and search for a more manageable chunk.
*
* If our max_extent_size is large enough for our allocation simply
* disable clustering since we will likely not be able to find enough
* space to create a cluster and induce latency trying.
*/
if (unlikely(space_info->max_extent_size)) {
spin_lock(&space_info->lock);
if (space_info->max_extent_size &&
num_bytes > space_info->max_extent_size) {
ins->offset = space_info->max_extent_size;
spin_unlock(&space_info->lock);
return -ENOSPC;
} else if (space_info->max_extent_size) {
use_cluster = false;
}
spin_unlock(&space_info->lock);
}
last_ptr = fetch_cluster_info(fs_info, space_info,
&ffe_ctl.empty_cluster);
if (last_ptr) {
spin_lock(&last_ptr->lock);
if (last_ptr->block_group)
hint_byte = last_ptr->window_start;
if (last_ptr->fragmented) {
/*
* We still set window_start so we can keep track of the
* last place we found an allocation to try and save
* some time.
*/
hint_byte = last_ptr->window_start;
use_cluster = false;
}
spin_unlock(&last_ptr->lock);
}
ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
if (ret < 0)
return ret;
ffe_ctl.search_start = max(ffe_ctl.search_start,
first_logical_byte(fs_info, 0));
ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
if (ffe_ctl.search_start == hint_byte) {
ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
block_group = btrfs_lookup_block_group(fs_info,
ffe_ctl.search_start);
/*
@ -3924,6 +4020,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
down_read(&space_info->groups_sem);
list_for_each_entry(block_group,
&space_info->block_groups[ffe_ctl.index], list) {
struct btrfs_block_group *bg_ret;
/* If the block group is read-only, we can skip it entirely. */
if (unlikely(block_group->ro))
continue;
@ -3984,39 +4082,20 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
goto loop;
/*
* Ok we want to try and use the cluster allocator, so
* lets look there
*/
if (last_ptr && use_cluster) {
struct btrfs_block_group *cluster_bg = NULL;
ret = find_free_extent_clustered(block_group, last_ptr,
&ffe_ctl, &cluster_bg);
if (ret == 0) {
if (cluster_bg && cluster_bg != block_group) {
btrfs_release_block_group(block_group,
delalloc);
block_group = cluster_bg;
}
goto checks;
} else if (ret == -EAGAIN) {
goto have_block_group;
} else if (ret > 0) {
goto loop;
bg_ret = NULL;
ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
if (ret == 0) {
if (bg_ret && bg_ret != block_group) {
btrfs_release_block_group(block_group, delalloc);
block_group = bg_ret;
}
/* ret == -ENOENT case falls through */
} else if (ret == -EAGAIN) {
goto have_block_group;
} else if (ret > 0) {
goto loop;
}
ret = find_free_extent_unclustered(block_group, last_ptr,
&ffe_ctl);
if (ret == -EAGAIN)
goto have_block_group;
else if (ret > 0)
goto loop;
/* ret == 0 case falls through */
checks:
/* Checks */
ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
fs_info->stripesize);
@ -4050,17 +4129,12 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
btrfs_release_block_group(block_group, delalloc);
break;
loop:
ffe_ctl.retry_clustered = false;
ffe_ctl.retry_unclustered = false;
BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
ffe_ctl.index);
btrfs_release_block_group(block_group, delalloc);
release_block_group(block_group, &ffe_ctl, delalloc);
cond_resched();
}
up_read(&space_info->groups_sem);
ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
full_search, use_cluster);
ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
if (ret > 0)
goto search;
@ -4189,18 +4263,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
return 0;
}
int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
u64 len)
{
struct btrfs_block_group *cache;
int ret = 0;
cache = btrfs_lookup_block_group(fs_info, start);
cache = btrfs_lookup_block_group(trans->fs_info, start);
if (!cache) {
btrfs_err(fs_info, "unable to find block group for %llu", start);
btrfs_err(trans->fs_info, "unable to find block group for %llu",
start);
return -ENOSPC;
}
ret = pin_down_extent(cache, start, len, 1);
ret = pin_down_extent(trans, cache, start, len, 1);
btrfs_put_block_group(cache);
return ret;
}
@ -4431,7 +4507,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
offset, ins, 1);
if (ret)
btrfs_pin_extent(fs_info, ins->objectid, ins->offset, 1);
btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
btrfs_put_block_group(block_group);
return ret;
}
@ -4750,8 +4826,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, eb->start,
eb->len, flag,
ret = btrfs_set_disk_extent_flags(trans, eb, flag,
btrfs_header_level(eb), 0);
BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
@ -5209,9 +5284,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
*
* If called with for_reloc == 0, may exit early with -EAGAIN
*/
int btrfs_drop_snapshot(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, int update_ref,
int for_reloc)
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
@ -5250,9 +5323,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
if (err)
goto out_end_trans;
if (block_rsv)
trans->block_rsv = block_rsv;
/*
* This will help us catch people modifying the fs tree while we're
* dropping it. It is unsafe to mess with the fs tree while it's being
@ -5380,8 +5450,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
err = PTR_ERR(trans);
goto out_free;
}
if (block_rsv)
trans->block_rsv = block_rsv;
}
}
btrfs_release_path(path);
@ -5413,13 +5481,10 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
}
if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
btrfs_add_dropped_root(trans, root);
} else {
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
btrfs_put_fs_root(root);
}
else
btrfs_put_root(root);
root_dropped = true;
out_end_trans:
btrfs_end_transaction_throttle(trans);
@ -5749,47 +5814,3 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
return bg_ret;
return dev_ret;
}
/*
* btrfs_{start,end}_write_no_snapshotting() are similar to
* mnt_{want,drop}_write(), they are used to prevent some tasks from writing
* data into the page cache through nocow before the subvolume is snapshoted,
* but flush the data into disk after the snapshot creation, or to prevent
* operations while snapshotting is ongoing and that cause the snapshot to be
* inconsistent (writes followed by expanding truncates for example).
*/
void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
{
percpu_counter_dec(&root->subv_writers->counter);
cond_wake_up(&root->subv_writers->wait);
}
int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
{
if (atomic_read(&root->will_be_snapshotted))
return 0;
percpu_counter_inc(&root->subv_writers->counter);
/*
* Make sure counter is updated before we check for snapshot creation.
*/
smp_mb();
if (atomic_read(&root->will_be_snapshotted)) {
btrfs_end_write_no_snapshotting(root);
return 0;
}
return 1;
}
void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
{
while (true) {
int ret;
ret = btrfs_start_write_no_snapshotting(root);
if (ret)
break;
wait_var_event(&root->will_be_snapshotted,
!atomic_read(&root->will_be_snapshotted));
}
}

View File

@ -35,42 +35,54 @@ static inline bool extent_state_in_tree(const struct extent_state *state)
}
#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);
static DEFINE_SPINLOCK(leak_lock);
static inline
void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
static inline void btrfs_leak_debug_add(spinlock_t *lock,
struct list_head *new,
struct list_head *head)
{
unsigned long flags;
spin_lock_irqsave(&leak_lock, flags);
spin_lock_irqsave(lock, flags);
list_add(new, head);
spin_unlock_irqrestore(&leak_lock, flags);
spin_unlock_irqrestore(lock, flags);
}
static inline
void btrfs_leak_debug_del(struct list_head *entry)
static inline void btrfs_leak_debug_del(spinlock_t *lock,
struct list_head *entry)
{
unsigned long flags;
spin_lock_irqsave(&leak_lock, flags);
spin_lock_irqsave(lock, flags);
list_del(entry);
spin_unlock_irqrestore(&leak_lock, flags);
spin_unlock_irqrestore(lock, flags);
}
static inline void btrfs_extent_buffer_leak_debug_check(void)
void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
{
struct extent_buffer *eb;
unsigned long flags;
while (!list_empty(&buffers)) {
eb = list_entry(buffers.next, struct extent_buffer, leak_list);
pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
/*
* If we didn't get into open_ctree our allocated_ebs will not be
* initialized, so just skip this.
*/
if (!fs_info->allocated_ebs.next)
return;
spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
while (!list_empty(&fs_info->allocated_ebs)) {
eb = list_first_entry(&fs_info->allocated_ebs,
struct extent_buffer, leak_list);
pr_err(
"BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
btrfs_header_owner(eb));
list_del(&eb->leak_list);
kmem_cache_free(extent_buffer_cache, eb);
}
spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
static inline void btrfs_extent_state_leak_debug_check(void)
@ -107,9 +119,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
}
}
#else
#define btrfs_leak_debug_add(new, head) do {} while (0)
#define btrfs_leak_debug_del(entry) do {} while (0)
#define btrfs_extent_buffer_leak_debug_check() do {} while (0)
#define btrfs_leak_debug_add(lock, new, head) do {} while (0)
#define btrfs_leak_debug_del(lock, entry) do {} while (0)
#define btrfs_extent_state_leak_debug_check() do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
@ -122,7 +133,6 @@ struct tree_entry {
struct extent_page_data {
struct bio *bio;
struct extent_io_tree *tree;
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
*/
@ -246,8 +256,6 @@ void __cold extent_state_cache_exit(void)
void __cold extent_io_exit(void)
{
btrfs_extent_buffer_leak_debug_check();
/*
* Make sure all delayed rcu free are flushed before we
* destroy caches.
@ -257,6 +265,15 @@ void __cold extent_io_exit(void)
bioset_exit(&btrfs_bioset);
}
/*
* For the file_extent_tree, we want to hold the inode lock when we lookup and
* update the disk_i_size, but lockdep will complain because our io_tree we hold
* the tree lock and get the inode lock when setting delalloc. These two things
* are unrelated, so make a class for the file_extent_tree so we don't get the
* two locking patterns mixed up.
*/
static struct lock_class_key file_extent_tree_class;
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner,
void *private_data)
@ -268,6 +285,8 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
spin_lock_init(&tree->lock);
tree->private_data = private_data;
tree->owner = owner;
if (owner == IO_TREE_INODE_FILE_EXTENT)
lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
void extent_io_tree_release(struct extent_io_tree *tree)
@ -314,7 +333,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
state->state = 0;
state->failrec = NULL;
RB_CLEAR_NODE(&state->rb_node);
btrfs_leak_debug_add(&state->leak_list, &states);
btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
refcount_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
trace_alloc_extent_state(state, mask, _RET_IP_);
@ -327,7 +346,7 @@ void free_extent_state(struct extent_state *state)
return;
if (refcount_dec_and_test(&state->refs)) {
WARN_ON(extent_state_in_tree(state));
btrfs_leak_debug_del(&state->leak_list);
btrfs_leak_debug_del(&leak_lock, &state->leak_list);
trace_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
}
@ -1053,6 +1072,16 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
goto out;
}
/*
* If this extent already has all the bits we want set, then
* skip it, not necessary to split it or do anything with it.
*/
if ((state->state & bits) == bits) {
start = state->end + 1;
cache_state(state, cached_state);
goto search_again;
}
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, start);
@ -1567,6 +1596,43 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
return ret;
}
/**
* find_contiguous_extent_bit: find a contiguous area of bits
* @tree - io tree to check
* @start - offset to start the search from
* @start_ret - the first offset we found with the bits set
* @end_ret - the final contiguous range of the bits that were set
* @bits - bits to look for
*
* set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
* to set bits appropriately, and then merge them again. During this time it
* will drop the tree->lock, so use this helper if you want to find the actual
* contiguous area for given bits. We will search to the first bit we find, and
* then walk down the tree until we find a non-contiguous area. The area
* returned will be the full contiguous area with the bits set.
*/
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, unsigned bits)
{
struct extent_state *state;
int ret = 1;
spin_lock(&tree->lock);
state = find_first_extent_bit_state(tree, start, bits);
if (state) {
*start_ret = state->start;
*end_ret = state->end;
while ((state = next_state(state)) != NULL) {
if (state->start > (*end_ret + 1))
break;
*end_ret = state->end;
}
ret = 0;
}
spin_unlock(&tree->lock);
return ret;
}
/**
* find_first_clear_extent_bit - find the first range that has @bits not set.
* This range could start before @start.
@ -2926,7 +2992,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @tree: tree so we can call our merge_bio hook
* @wbc: optional writeback control for io accounting
* @page: page to add to the bio
* @pg_offset: offset of the new bio or to check whether we are adding
@ -2939,7 +3004,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
* @prev_bio_flags: flags of previous bio to see if we can merge the current one
* @bio_flags: flags of the current bio to see if we can merge them
*/
static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
static int submit_extent_page(unsigned int opf,
struct writeback_control *wbc,
struct page *page, u64 offset,
size_t size, unsigned long pg_offset,
@ -2954,6 +3019,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
struct bio *bio;
size_t page_size = min_t(size_t, size, PAGE_SIZE);
sector_t sector = offset >> 9;
struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
ASSERT(bio_ret);
@ -3062,8 +3128,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
static int __do_readpage(struct extent_io_tree *tree,
struct page *page,
static int __do_readpage(struct page *page,
get_extent_t *get_extent,
struct extent_map **em_cached,
struct bio **bio, int mirror_num,
@ -3086,6 +3151,7 @@ static int __do_readpage(struct extent_io_tree *tree,
size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
unsigned long this_bio_flag = 0;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
set_page_extent_mapped(page);
@ -3242,7 +3308,7 @@ static int __do_readpage(struct extent_io_tree *tree,
continue;
}
ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL,
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
page, offset, disk_io_size,
pg_offset, bio,
end_bio_extent_readpage, mirror_num,
@ -3269,8 +3335,7 @@ static int __do_readpage(struct extent_io_tree *tree,
return ret;
}
static inline void contiguous_readpages(struct extent_io_tree *tree,
struct page *pages[], int nr_pages,
static inline void contiguous_readpages(struct page *pages[], int nr_pages,
u64 start, u64 end,
struct extent_map **em_cached,
struct bio **bio,
@ -3280,17 +3345,16 @@ static inline void contiguous_readpages(struct extent_io_tree *tree,
struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
int index;
btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
__do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
__do_readpage(pages[index], btrfs_get_extent, em_cached,
bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
put_page(pages[index]);
}
}
static int __extent_read_full_page(struct extent_io_tree *tree,
struct page *page,
static int __extent_read_full_page(struct page *page,
get_extent_t *get_extent,
struct bio **bio, int mirror_num,
unsigned long *bio_flags,
@ -3301,21 +3365,21 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
u64 end = start + PAGE_SIZE - 1;
int ret;
btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
ret = __do_readpage(page, get_extent, NULL, bio, mirror_num,
bio_flags, read_flags, NULL);
return ret;
}
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, int mirror_num)
int extent_read_full_page(struct page *page, get_extent_t *get_extent,
int mirror_num)
{
struct bio *bio = NULL;
unsigned long bio_flags = 0;
int ret;
ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
ret = __extent_read_full_page(page, get_extent, &bio, mirror_num,
&bio_flags, 0);
if (bio)
ret = submit_one_bio(bio, mirror_num, bio_flags);
@ -3423,7 +3487,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
unsigned long nr_written,
int *nr_ret)
{
struct extent_io_tree *tree = epd->tree;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
u64 start = page_offset(page);
u64 page_end = start + PAGE_SIZE - 1;
u64 end;
@ -3509,7 +3573,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
page->index, cur, end);
}
ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
page, offset, iosize, pg_offset,
&epd->bio,
end_bio_extent_writepage,
@ -3830,8 +3894,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
struct extent_page_data *epd)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
u64 offset = eb->start;
u32 nritems;
int i, num_pages;
@ -3864,7 +3926,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
p, offset, PAGE_SIZE, 0,
&epd->bio,
end_bio_extent_buffer_writepage,
@ -3897,14 +3959,13 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
struct extent_buffer *eb, *prev_eb = NULL;
struct extent_page_data epd = {
.bio = NULL,
.tree = tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
@ -4018,7 +4079,39 @@ int btree_write_cache_pages(struct address_space *mapping,
end_write_bio(&epd, ret);
return ret;
}
ret = flush_write_bio(&epd);
/*
* If something went wrong, don't allow any metadata write bio to be
* submitted.
*
* This would prevent use-after-free if we had dirty pages not
* cleaned up, which can still happen by fuzzed images.
*
* - Bad extent tree
* Allowing existing tree block to be allocated for other trees.
*
* - Log tree operations
* Exiting tree blocks get allocated to log tree, bumps its
* generation, then get cleaned in tree re-balance.
* Such tree block will not be written back, since it's clean,
* thus no WRITTEN flag set.
* And after log writes back, this tree block is not traced by
* any dirty extent_io_tree.
*
* - Offending tree block gets re-dirtied from its original owner
* Since it has bumped generation, no WRITTEN flag, it can be
* reused without COWing. This tree block will not be traced
* by btrfs_transaction::dirty_pages.
*
* Now such dirty tree block will not be cleaned by any dirty
* extent io tree. Thus we don't want to submit such wild eb
* if the fs already has error.
*/
if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
ret = flush_write_bio(&epd);
} else {
ret = -EUCLEAN;
end_write_bio(&epd, ret);
}
return ret;
}
@ -4190,7 +4283,6 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
int ret;
struct extent_page_data epd = {
.bio = NULL,
.tree = &BTRFS_I(page->mapping->host)->io_tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@ -4212,14 +4304,12 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
{
int ret = 0;
struct address_space *mapping = inode->i_mapping;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct page *page;
unsigned long nr_pages = (end - start + PAGE_SIZE) >>
PAGE_SHIFT;
struct extent_page_data epd = {
.bio = NULL,
.tree = tree,
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
};
@ -4263,7 +4353,6 @@ int extent_writepages(struct address_space *mapping,
int ret = 0;
struct extent_page_data epd = {
.bio = NULL,
.tree = &BTRFS_I(mapping->host)->io_tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@ -4285,7 +4374,6 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,
unsigned long bio_flags = 0;
struct page *pagepool[16];
struct extent_map *em_cached = NULL;
struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
int nr = 0;
u64 prev_em_start = (u64)-1;
@ -4312,7 +4400,7 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,
ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
contiguous_readpages(tree, pagepool, nr, contig_start,
contiguous_readpages(pagepool, nr, contig_start,
contig_end, &em_cached, &bio, &bio_flags,
&prev_em_start);
}
@ -4796,7 +4884,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
static void __free_extent_buffer(struct extent_buffer *eb)
{
btrfs_leak_debug_del(&eb->leak_list);
kmem_cache_free(extent_buffer_cache, eb);
}
@ -4862,6 +4949,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
btrfs_release_extent_buffer_pages(eb);
btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
__free_extent_buffer(eb);
}
@ -4883,7 +4971,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
btrfs_leak_debug_add(&eb->leak_list, &buffers);
btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
&fs_info->allocated_ebs);
spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
@ -5230,6 +5319,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
}
static int release_extent_buffer(struct extent_buffer *eb)
__releases(&eb->refs_lock)
{
lockdep_assert_held(&eb->refs_lock);
@ -5248,6 +5338,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
}
btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_pages(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@ -5405,7 +5496,6 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
unsigned long num_reads = 0;
struct bio *bio = NULL;
unsigned long bio_flags = 0;
struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@ -5453,7 +5543,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
ClearPageError(page);
err = __extent_read_full_page(tree, page,
err = __extent_read_full_page(page,
btree_get_extent, &bio,
mirror_num, &bio_flags,
REQ_META);

View File

@ -189,8 +189,8 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, int mirror_num);
int extent_read_full_page(struct page *page, get_extent_t *get_extent,
int mirror_num);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
@ -325,4 +325,11 @@ bool find_lock_delalloc_range(struct inode *inode,
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
#ifdef CONFIG_BTRFS_DEBUG
void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info);
#else
#define btrfs_extent_buffer_leak_debug_check(fs_info) do {} while (0)
#endif
#endif

View File

@ -23,6 +23,97 @@
#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
PAGE_SIZE))
/**
* @inode - the inode we want to update the disk_i_size for
* @new_i_size - the i_size we want to set to, 0 if we use i_size
*
* With NO_HOLES set this simply sets the disk_is_size to whatever i_size_read()
* returns as it is perfectly fine with a file that has holes without hole file
* extent items.
*
* However without NO_HOLES we need to only return the area that is contiguous
* from the 0 offset of the file. Otherwise we could end up adjust i_size up
* to an extent that has a gap in between.
*
* Finally new_i_size should only be set in the case of truncate where we're not
* ready to use i_size_read() as the limiter yet.
*/
void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
u64 start, end, i_size;
int ret;
i_size = new_i_size ?: i_size_read(inode);
if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
BTRFS_I(inode)->disk_i_size = i_size;
return;
}
spin_lock(&BTRFS_I(inode)->lock);
ret = find_contiguous_extent_bit(&BTRFS_I(inode)->file_extent_tree, 0,
&start, &end, EXTENT_DIRTY);
if (!ret && start == 0)
i_size = min(i_size, end + 1);
else
i_size = 0;
BTRFS_I(inode)->disk_i_size = i_size;
spin_unlock(&BTRFS_I(inode)->lock);
}
/**
* @inode - the inode we're modifying
* @start - the start file offset of the file extent we've inserted
* @len - the logical length of the file extent item
*
* Call when we are inserting a new file extent where there was none before.
* Does not need to call this in the case where we're replacing an existing file
* extent, however if not sure it's fine to call this multiple times.
*
* The start and len must match the file extent item, so thus must be sectorsize
* aligned.
*/
int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len)
{
if (len == 0)
return 0;
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize));
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
return set_extent_bits(&inode->file_extent_tree, start, start + len - 1,
EXTENT_DIRTY);
}
/**
* @inode - the inode we're modifying
* @start - the start file offset of the file extent we've inserted
* @len - the logical length of the file extent item
*
* Called when we drop a file extent, for example when we truncate. Doesn't
* need to be called for cases where we're replacing a file extent, like when
* we've COWed a file extent.
*
* The start and len must match the file extent item, so thus must be sectorsize
* aligned.
*/
int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len)
{
if (len == 0)
return 0;
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) ||
len == (u64)-1);
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
return clear_extent_bit(&inode->file_extent_tree, start,
start + len - 1, EXTENT_DIRTY, 0, 0, NULL);
}
static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
u16 csum_size)
{
@ -949,18 +1040,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
btrfs_item_key_to_cpu(leaf, &key, slot);
extent_start = key.offset;
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
btrfs_file_extent_num_bytes(leaf, fi);
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
size = btrfs_file_extent_ram_bytes(leaf, fi);
extent_end = ALIGN(extent_start + size,
fs_info->sectorsize);
}
extent_end = btrfs_file_extent_end(path);
em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
@ -1007,3 +1087,30 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
root->root_key.objectid);
}
}
/*
* Returns the end offset (non inclusive) of the file extent item the given path
* points to. If it points to an inline extent, the returned offset is rounded
* up to the sector size.
*/
u64 btrfs_file_extent_end(const struct btrfs_path *path)
{
const struct extent_buffer *leaf = path->nodes[0];
const int slot = path->slots[0];
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
u64 end;
btrfs_item_key_to_cpu(leaf, &key, slot);
ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
end = btrfs_file_extent_ram_bytes(leaf, fi);
end = ALIGN(key.offset + end, leaf->fs_info->sectorsize);
} else {
end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
}
return end;
}

View File

@ -27,6 +27,7 @@
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
#include "reflink.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
@ -277,7 +278,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct btrfs_key key;
struct btrfs_ioctl_defrag_range_args range;
int num_defrag;
int index;
int ret;
/* get the inode */
@ -285,9 +285,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
index = srcu_read_lock(&fs_info->subvol_srcu);
inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
inode_root = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(inode_root)) {
ret = PTR_ERR(inode_root);
goto cleanup;
@ -297,11 +295,11 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
inode = btrfs_iget(fs_info->sb, &key, inode_root);
btrfs_put_root(inode_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto cleanup;
}
srcu_read_unlock(&fs_info->subvol_srcu, index);
/* do a chunk of defrag */
clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
@ -337,7 +335,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
iput(inode);
return 0;
cleanup:
srcu_read_unlock(&fs_info->subvol_srcu, index);
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
return ret;
}
@ -1552,15 +1549,14 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
u64 num_bytes;
int ret;
ret = btrfs_start_write_no_snapshotting(root);
if (!ret)
if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart,
btrfs_lock_and_flush_ordered_range(inode, lockstart,
lockend, NULL);
num_bytes = lockend - lockstart + 1;
@ -1568,7 +1564,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
NULL, NULL, NULL);
if (ret <= 0) {
ret = 0;
btrfs_end_write_no_snapshotting(root);
btrfs_drew_write_unlock(&root->snapshot_lock);
} else {
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
@ -1674,7 +1670,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
data_reserved, pos,
write_bytes);
else
btrfs_end_write_no_snapshotting(root);
btrfs_drew_write_unlock(&root->snapshot_lock);
break;
}
@ -1778,7 +1774,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
release_bytes = 0;
if (only_release_metadata)
btrfs_end_write_no_snapshotting(root);
btrfs_drew_write_unlock(&root->snapshot_lock);
if (only_release_metadata && copied > 0) {
lockstart = round_down(pos,
@ -1807,7 +1803,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
if (release_bytes) {
if (only_release_metadata) {
btrfs_end_write_no_snapshotting(root);
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
} else {
@ -2070,6 +2066,16 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
btrfs_init_log_ctx(&ctx, inode);
/*
* Set the range to full if the NO_HOLES feature is not enabled.
* This is to avoid missing file extent items representing holes after
* replaying the log.
*/
if (!btrfs_fs_incompat(fs_info, NO_HOLES)) {
start = 0;
end = LLONG_MAX;
}
/*
* We write the dirty pages in the range and wait until they complete
* out of the ->i_mutex. If so, we can flush the dirty pages by
@ -2091,19 +2097,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
* If the inode needs a full sync, make sure we use a full range to
* avoid log tree corruption, due to hole detection racing with ordered
* extent completion for adjacent ranges, and assertion failures during
* hole detection. Do this while holding the inode lock, to avoid races
* with other tasks.
*/
if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags)) {
start = 0;
end = LLONG_MAX;
}
/*
* Before we acquired the inode's lock, someone may have dirtied more
* pages in the target range. We need to make sure that writeback for
@ -2124,6 +2117,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@ -2486,6 +2480,11 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
clone_info->file_offset, clone_len);
if (ret)
return ret;
/* If it's a hole, nothing more needs to be done. */
if (clone_info->disk_offset == 0)
return 0;
@ -2596,6 +2595,24 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
break;
}
} else if (!clone_info && cur_offset < drop_end) {
/*
* We are past the i_size here, but since we didn't
* insert holes we need to clear the mapped area so we
* know to not set disk_i_size in this area until a new
* file extent is inserted here.
*/
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
cur_offset, drop_end - cur_offset);
if (ret) {
/*
* We couldn't clear our area, so we could
* presumably adjust up and corrupt the fs, so
* we need to abort.
*/
btrfs_abort_transaction(trans, ret);
break;
}
}
if (clone_info && drop_end > clone_info->file_offset) {
@ -2686,6 +2703,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
} else if (!clone_info && cur_offset < drop_end) {
/* See the comment in the loop above for the reasoning here. */
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
cur_offset, drop_end - cur_offset);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
}
if (clone_info) {
ret = btrfs_insert_clone_extent(trans, inode, path, clone_info,
@ -2935,7 +2961,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
inode->i_ctime = current_time(inode);
i_size_write(inode, end);
btrfs_ordered_update_i_size(inode, end, NULL);
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode(trans, root, inode);
ret2 = btrfs_end_transaction(trans);

View File

@ -371,10 +371,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
}
}
static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
int uptodate)
static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
struct page *page;
struct inode *inode = io_ctl->inode;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int i;
@ -732,7 +732,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
readahead_cache(inode);
ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
ret = io_ctl_prepare_pages(&io_ctl, true);
if (ret)
goto out;
@ -1067,6 +1067,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
}
static noinline_for_stack int write_pinned_extent_entries(
struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
int *entries)
@ -1085,7 +1086,7 @@ static noinline_for_stack int write_pinned_extent_entries(
* We shouldn't have switched the pinned extents yet so this is the
* right one
*/
unpin = block_group->fs_info->pinned_extents;
unpin = &trans->transaction->pinned_extents;
start = block_group->start;
@ -1190,7 +1191,7 @@ static int __btrfs_wait_cache_io(struct btrfs_root *root,
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
if (block_group) {
#ifdef DEBUG
#ifdef CONFIG_BTRFS_DEBUG
btrfs_err(root->fs_info,
"failed to write free space cache for block group %llu",
block_group->start);
@ -1291,7 +1292,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
}
/* Lock all pages first so we can lock the extent safely. */
ret = io_ctl_prepare_pages(io_ctl, inode, 0);
ret = io_ctl_prepare_pages(io_ctl, false);
if (ret)
goto out_unlock;
@ -1317,7 +1318,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* If this changes while we are working we'll get added back to
* the dirty list and redo it. No locking needed
*/
ret = write_pinned_extent_entries(block_group, io_ctl, &entries);
ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
if (ret)
goto out_nospc_locked;
@ -1366,18 +1367,6 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
return 0;
out:
io_ctl->inode = NULL;
io_ctl_free(io_ctl);
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
btrfs_update_inode(trans, root, inode);
if (must_iput)
iput(inode);
return ret;
out_nospc_locked:
cleanup_bitmap_list(&bitmap_list);
spin_unlock(&ctl->tree_lock);
@ -1390,7 +1379,17 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
up_write(&block_group->data_rwsem);
goto out;
out:
io_ctl->inode = NULL;
io_ctl_free(io_ctl);
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
btrfs_update_inode(trans, root, inode);
if (must_iput)
iput(inode);
return ret;
}
int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
@ -1416,7 +1415,7 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
block_group, &block_group->io_ctl, trans);
if (ret) {
#ifdef DEBUG
#ifdef CONFIG_BTRFS_DEBUG
btrfs_err(fs_info,
"failed to write free space cache for block group %llu",
block_group->start);
@ -4036,7 +4035,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
if (release_metadata)
btrfs_delalloc_release_metadata(BTRFS_I(inode),
inode->i_size, true);
#ifdef DEBUG
#ifdef CONFIG_BTRFS_DEBUG
btrfs_err(fs_info,
"failed to write free ino cache for root %llu",
root->root_key.objectid);

View File

@ -1251,9 +1251,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_free_tree_block(trans, free_space_root, free_space_root->node,
0, 1);
free_extent_buffer(free_space_root->node);
free_extent_buffer(free_space_root->commit_root);
kfree(free_space_root);
btrfs_put_root(free_space_root);
return btrfs_commit_transaction(trans);

View File

@ -515,7 +515,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
trans->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, trans->block_rsv,
trans->bytes_reserved);
trans->bytes_reserved, NULL);
out:
trans->block_rsv = rsv;
trans->bytes_reserved = num_bytes;

View File

@ -28,6 +28,7 @@
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include "misc.h"
@ -241,6 +242,15 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
/*
* We align size to sectorsize for inline extents just for simplicity
* sake.
*/
size = ALIGN(size, root->fs_info->sectorsize);
ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
if (ret)
goto fail;
/*
* we're an inline extent, so nobody can
* extend the file past i_size without locking
@ -2446,6 +2456,11 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), file_pos,
ram_bytes);
if (ret)
goto out;
/*
* Release the reserved range from inode dirty range map, as it is
* already moved into delayed_ref_head
@ -2536,7 +2551,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
*/
btrfs_qgroup_free_data(inode, NULL, start,
ordered_extent->num_bytes);
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
btrfs_inode_safe_disk_i_size_write(inode, 0);
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
else
@ -2607,7 +2622,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
@ -3187,6 +3202,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
round_up(i_size_read(inode), fs_info->sectorsize));
inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
@ -4158,6 +4175,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
}
while (1) {
u64 clear_start = 0, clear_len = 0;
fi = NULL;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@ -4208,6 +4227,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
u64 num_dec;
clear_start = found_key.offset;
extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
if (!del_item) {
u64 orig_num_bytes =
@ -4215,6 +4236,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
extent_num_bytes = ALIGN(new_size -
found_key.offset,
fs_info->sectorsize);
clear_start = ALIGN(new_size, fs_info->sectorsize);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_num_bytes);
num_dec = (orig_num_bytes -
@ -4240,6 +4262,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
inode_sub_bytes(inode, num_dec);
}
}
clear_len = num_dec;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
/*
* we can't truncate inline items that have had
@ -4261,12 +4284,33 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
*/
ret = NEED_TRUNCATE_BLOCK;
break;
} else {
/*
* Inline extents are special, we just treat
* them as a full sector worth in the file
* extent tree just for simplicity sake.
*/
clear_len = fs_info->sectorsize;
}
if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
inode_sub_bytes(inode, item_end + 1 - new_size);
}
delete:
/*
* We use btrfs_truncate_inode_items() to clean up log trees for
* multiple fsyncs, and in this case we don't want to clear the
* file extent range because it's just the log.
*/
if (root == BTRFS_I(inode)->root) {
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
clear_start, clear_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
}
if (del_item)
last_size = found_key.offset;
else
@ -4368,7 +4412,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
ASSERT(last_size >= new_size);
if (!ret && last_size > new_size)
last_size = new_size;
btrfs_ordered_update_i_size(inode, last_size, NULL);
btrfs_inode_safe_disk_i_size_write(inode, last_size);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
(u64)-1, &cached_state);
}
@ -4576,7 +4620,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
if (size <= hole_start)
return 0;
btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start,
btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start,
block_end - 1, &cached_state);
cur_offset = hole_start;
while (1) {
@ -4589,14 +4633,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
}
last_byte = min(extent_map_end(em), block_end);
last_byte = ALIGN(last_byte, fs_info->sectorsize);
hole_size = last_byte - cur_offset;
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
struct extent_map *hole_em;
hole_size = last_byte - cur_offset;
err = maybe_insert_hole(root, inode, cur_offset,
hole_size);
if (err)
break;
err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
cur_offset, hole_size);
if (err)
break;
btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
cur_offset + hole_size - 1, 0);
hole_em = alloc_extent_map();
@ -4628,6 +4679,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
hole_size - 1, 0);
}
free_extent_map(hole_em);
} else {
err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
cur_offset, hole_size);
if (err)
break;
}
next:
free_extent_map(em);
@ -4671,24 +4727,24 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* truncation, it must capture all writes that happened before
* this truncation.
*/
btrfs_wait_for_snapshot_creation(root);
btrfs_drew_write_lock(&root->snapshot_lock);
ret = btrfs_cont_expand(inode, oldsize, newsize);
if (ret) {
btrfs_end_write_no_snapshotting(root);
btrfs_drew_write_unlock(&root->snapshot_lock);
return ret;
}
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
btrfs_end_write_no_snapshotting(root);
btrfs_drew_write_unlock(&root->snapshot_lock);
return PTR_ERR(trans);
}
i_size_write(inode, newsize);
btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
btrfs_inode_safe_disk_i_size_write(inode, 0);
pagecache_isize_extended(inode, oldsize, newsize);
ret = btrfs_update_inode(trans, root, inode);
btrfs_end_write_no_snapshotting(root);
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_end_transaction(trans);
} else {
@ -5098,7 +5154,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
btrfs_release_path(path);
new_root = btrfs_read_fs_root_no_name(fs_info, location);
new_root = btrfs_get_fs_root(fs_info, location, true);
if (IS_ERR(new_root)) {
err = PTR_ERR(new_root);
goto out;
@ -5179,7 +5235,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
inode->i_ino = args->location->objectid;
memcpy(&BTRFS_I(inode)->location, args->location,
sizeof(*args->location));
BTRFS_I(inode)->root = args->root;
BTRFS_I(inode)->root = btrfs_grab_root(args->root);
BUG_ON(args->root && !BTRFS_I(inode)->root);
return 0;
}
@ -5260,7 +5317,7 @@ static struct inode *new_simple_dir(struct super_block *s,
if (!inode)
return ERR_PTR(-ENOMEM);
BTRFS_I(inode)->root = root;
BTRFS_I(inode)->root = btrfs_grab_root(root);
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
@ -5307,7 +5364,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
struct btrfs_root *sub_root = root;
struct btrfs_key location;
u8 di_type = 0;
int index;
int ret = 0;
if (dentry->d_name.len > BTRFS_NAME_LEN)
@ -5334,7 +5390,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return inode;
}
index = srcu_read_lock(&fs_info->subvol_srcu);
ret = fixup_tree_root_location(fs_info, dir, dentry,
&location, &sub_root);
if (ret < 0) {
@ -5345,7 +5400,8 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
} else {
inode = btrfs_iget(dir->i_sb, &location, sub_root);
}
srcu_read_unlock(&fs_info->subvol_srcu, index);
if (root != sub_root)
btrfs_put_root(sub_root);
if (!IS_ERR(inode) && root != sub_root) {
down_read(&fs_info->cleanup_work_sem);
@ -5826,7 +5882,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
*/
BTRFS_I(inode)->index_cnt = 2;
BTRFS_I(inode)->dir_index = *index;
BTRFS_I(inode)->root = root;
BTRFS_I(inode)->root = btrfs_grab_root(root);
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
@ -6463,6 +6519,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
extent_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
extent_end = btrfs_file_extent_end(path);
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* Only regular file could have regular/prealloc extent */
@ -6473,18 +6530,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
btrfs_ino(inode));
goto out;
}
extent_end = extent_start +
btrfs_file_extent_num_bytes(leaf, item);
trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
extent_start);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
size = btrfs_file_extent_ram_bytes(leaf, item);
extent_end = ALIGN(extent_start + size,
fs_info->sectorsize);
trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
path->slots[0],
extent_start);
@ -8211,9 +8259,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
int btrfs_readpage(struct file *file, struct page *page)
{
struct extent_io_tree *tree;
tree = &BTRFS_I(page->mapping->host)->io_tree;
return extent_read_full_page(tree, page, btrfs_get_extent, 0);
return extent_read_full_page(page, btrfs_get_extent, 0);
}
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@ -8272,6 +8318,39 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
return __btrfs_releasepage(page, gfp_flags);
}
#ifdef CONFIG_MIGRATION
static int btrfs_migratepage(struct address_space *mapping,
struct page *newpage, struct page *page,
enum migrate_mode mode)
{
int ret;
ret = migrate_page_move_mapping(mapping, newpage, page, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
if (page_has_private(page)) {
ClearPagePrivate(page);
get_page(newpage);
set_page_private(newpage, page_private(page));
set_page_private(page, 0);
put_page(page);
SetPagePrivate(newpage);
}
if (PagePrivate2(page)) {
ClearPagePrivate2(page);
SetPagePrivate2(newpage);
}
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
#endif
static void btrfs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
@ -8647,7 +8726,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
break;
}
btrfs_block_rsv_release(fs_info, rsv, -1);
btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
rsv, min_size, false);
BUG_ON(ret); /* shouldn't happen */
@ -8672,7 +8751,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
ret = PTR_ERR(trans);
goto out;
}
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
btrfs_inode_safe_disk_i_size_write(inode, 0);
}
if (trans) {
@ -8776,6 +8855,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
extent_io_tree_init(fs_info, &ei->io_failure_tree,
IO_TREE_INODE_IO_FAILURE, inode);
extent_io_tree_init(fs_info, &ei->file_extent_tree,
IO_TREE_INODE_FILE_EXTENT, inode);
ei->io_tree.track_uptodate = true;
ei->io_failure_tree.track_uptodate = true;
atomic_set(&ei->sync_writers, 0);
@ -8842,6 +8923,8 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1);
btrfs_put_root(BTRFS_I(inode)->root);
}
int btrfs_drop_inode(struct inode *inode)
@ -9669,14 +9752,14 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
while (!list_empty(&splice) && nr) {
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
root = btrfs_grab_fs_root(root);
root = btrfs_grab_root(root);
BUG_ON(!root);
list_move_tail(&root->delalloc_root,
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
ret = start_delalloc_inodes(root, nr, false);
btrfs_put_fs_root(root);
btrfs_put_root(root);
if (ret < 0)
goto out;
@ -9938,7 +10021,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
else
i_size = cur_offset;
i_size_write(inode, i_size);
btrfs_ordered_update_i_size(inode, i_size, NULL);
btrfs_inode_safe_disk_i_size_write(inode, 0);
}
ret = btrfs_update_inode(trans, root, inode);
@ -10474,6 +10557,9 @@ static const struct address_space_operations btrfs_aops = {
.direct_IO = btrfs_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
#ifdef CONFIG_MIGRATION
.migratepage = btrfs_migratepage,
#endif
.set_page_dirty = btrfs_set_page_dirty,
.error_remove_page = generic_error_remove_page,
.swap_activate = btrfs_swap_activate,

File diff suppressed because it is too large Load Diff

View File

@ -523,3 +523,138 @@ void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
path->locks[i] = 0;
}
}
/*
* Loop around taking references on and locking the root node of the tree until
* we end up with a lock on the root node.
*
* Return: root extent buffer with write lock held
*/
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
btrfs_tree_lock(eb);
if (eb == root->node)
break;
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
}
return eb;
}
/*
* Loop around taking references on and locking the root node of the tree until
* we end up with a lock on the root node.
*
* Return: root extent buffer with read lock held
*/
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
btrfs_tree_read_lock(eb);
if (eb == root->node)
break;
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
}
return eb;
}
/*
* DREW locks
* ==========
*
* DREW stands for double-reader-writer-exclusion lock. It's used in situation
* where you want to provide A-B exclusion but not AA or BB.
*
* Currently implementation gives more priority to reader. If a reader and a
* writer both race to acquire their respective sides of the lock the writer
* would yield its lock as soon as it detects a concurrent reader. Additionally
* if there are pending readers no new writers would be allowed to come in and
* acquire the lock.
*/
int btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
{
int ret;
ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL);
if (ret)
return ret;
atomic_set(&lock->readers, 0);
init_waitqueue_head(&lock->pending_readers);
init_waitqueue_head(&lock->pending_writers);
return 0;
}
void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock)
{
percpu_counter_destroy(&lock->writers);
}
/* Return true if acquisition is successful, false otherwise */
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
{
if (atomic_read(&lock->readers))
return false;
percpu_counter_inc(&lock->writers);
/* Ensure writers count is updated before we check for pending readers */
smp_mb();
if (atomic_read(&lock->readers)) {
btrfs_drew_write_unlock(lock);
return false;
}
return true;
}
void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
{
while (true) {
if (btrfs_drew_try_write_lock(lock))
return;
wait_event(lock->pending_writers, !atomic_read(&lock->readers));
}
}
void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
percpu_counter_dec(&lock->writers);
cond_wake_up(&lock->pending_readers);
}
void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
{
atomic_inc(&lock->readers);
/*
* Ensure the pending reader count is perceieved BEFORE this reader
* goes to sleep in case of active writers. This guarantees new writers
* won't be allowed and that the current reader will be woken up when
* the last active writer finishes its jobs.
*/
smp_mb__after_atomic();
wait_event(lock->pending_readers,
percpu_counter_sum(&lock->writers) == 0);
}
void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
{
/*
* atomic_dec_and_test implies a full barrier, so woken up writers
* are guaranteed to see the decrement
*/
if (atomic_dec_and_test(&lock->readers))
wake_up(&lock->pending_writers);
}

View File

@ -6,6 +6,9 @@
#ifndef BTRFS_LOCKING_H
#define BTRFS_LOCKING_H
#include <linux/atomic.h>
#include <linux/wait.h>
#include <linux/percpu_counter.h>
#include "extent_io.h"
#define BTRFS_WRITE_LOCK 1
@ -13,6 +16,8 @@
#define BTRFS_WRITE_LOCK_BLOCKING 3
#define BTRFS_READ_LOCK_BLOCKING 4
struct btrfs_path;
void btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_tree_unlock(struct extent_buffer *eb);
@ -48,4 +53,19 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
BUG();
}
struct btrfs_drew_lock {
atomic_t readers;
struct percpu_counter writers;
wait_queue_head_t pending_writers;
wait_queue_head_t pending_readers;
};
int btrfs_drew_lock_init(struct btrfs_drew_lock *lock);
void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock);
void btrfs_drew_write_lock(struct btrfs_drew_lock *lock);
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock);
void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock);
void btrfs_drew_read_lock(struct btrfs_drew_lock *lock);
void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock);
#endif

View File

@ -580,7 +580,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
while (!list_empty(&splice) && nr) {
root = list_first_entry(&splice, struct btrfs_root,
ordered_root);
root = btrfs_grab_fs_root(root);
root = btrfs_grab_root(root);
BUG_ON(!root);
list_move_tail(&root->ordered_root,
&fs_info->ordered_roots);
@ -588,7 +588,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
done = btrfs_wait_ordered_extents(root, nr,
range_start, range_len);
btrfs_put_fs_root(root);
btrfs_put_root(root);
spin_lock(&fs_info->ordered_root_lock);
if (nr != U64_MAX) {
@ -785,134 +785,6 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
return entry;
}
/*
* After an extent is done, call this to conditionally update the on disk
* i_size. i_size is updated to cover any fully written part of the file.
*/
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered)
{
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
u64 disk_i_size;
u64 new_i_size;
u64 i_size = i_size_read(inode);
struct rb_node *node;
struct rb_node *prev = NULL;
struct btrfs_ordered_extent *test;
int ret = 1;
u64 orig_offset = offset;
spin_lock_irq(&tree->lock);
if (ordered) {
offset = entry_end(ordered);
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
offset = min(offset,
ordered->file_offset +
ordered->truncated_len);
} else {
offset = ALIGN(offset, btrfs_inode_sectorsize(inode));
}
disk_i_size = BTRFS_I(inode)->disk_i_size;
/*
* truncate file.
* If ordered is not NULL, then this is called from endio and
* disk_i_size will be updated by either truncate itself or any
* in-flight IOs which are inside the disk_i_size.
*
* Because btrfs_setsize() may set i_size with disk_i_size if truncate
* fails somehow, we need to make sure we have a precise disk_i_size by
* updating it as usual.
*
*/
if (!ordered && disk_i_size > i_size) {
BTRFS_I(inode)->disk_i_size = orig_offset;
ret = 0;
goto out;
}
/*
* if the disk i_size is already at the inode->i_size, or
* this ordered extent is inside the disk i_size, we're done
*/
if (disk_i_size == i_size)
goto out;
/*
* We still need to update disk_i_size if outstanding_isize is greater
* than disk_i_size.
*/
if (offset <= disk_i_size &&
(!ordered || ordered->outstanding_isize <= disk_i_size))
goto out;
/*
* walk backward from this ordered extent to disk_i_size.
* if we find an ordered extent then we can't update disk i_size
* yet
*/
if (ordered) {
node = rb_prev(&ordered->rb_node);
} else {
prev = tree_search(tree, offset);
/*
* we insert file extents without involving ordered struct,
* so there should be no ordered struct cover this offset
*/
if (prev) {
test = rb_entry(prev, struct btrfs_ordered_extent,
rb_node);
BUG_ON(offset_in_entry(test, offset));
}
node = prev;
}
for (; node; node = rb_prev(node)) {
test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
/* We treat this entry as if it doesn't exist */
if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
continue;
if (entry_end(test) <= disk_i_size)
break;
if (test->file_offset >= i_size)
break;
/*
* We don't update disk_i_size now, so record this undealt
* i_size. Or we will not know the real i_size.
*/
if (test->outstanding_isize < offset)
test->outstanding_isize = offset;
if (ordered &&
ordered->outstanding_isize > test->outstanding_isize)
test->outstanding_isize = ordered->outstanding_isize;
goto out;
}
new_i_size = min_t(u64, offset, i_size);
/*
* Some ordered extents may completed before the current one, and
* we hold the real i_size in ->outstanding_isize.
*/
if (ordered && ordered->outstanding_isize > new_i_size)
new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
BTRFS_I(inode)->disk_i_size = new_i_size;
ret = 0;
out:
/*
* We need to do this because we can't remove ordered extents until
* after the i_disk_size has been updated and then the inode has been
* updated to reflect the change, so we need to tell anybody who finds
* this ordered extent that we've already done all the real work, we
* just haven't completed all the other work.
*/
if (ordered)
set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
spin_unlock_irq(&tree->lock);
return ret;
}
/*
* search the ordered extents for one corresponding to 'offset' and
* try to find a checksum. This is used because we allow pages to
@ -963,7 +835,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
* btrfs_flush_ordered_range - Lock the passed range and ensures all pending
* ordered extents in it are run to completion.
*
* @tree: IO tree used for locking out other users of the range
* @inode: Inode whose ordered tree is to be searched
* @start: Beginning of range to flush
* @end: Last byte of range to lock
@ -973,8 +844,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
* This function always returns with the given range locked, ensuring after it's
* called no order extent can be pending.
*/
void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
struct btrfs_inode *inode, u64 start,
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state)
{
@ -986,7 +856,7 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
cachedp = cached_state;
while (1) {
lock_extent_bits(tree, start, end, cachedp);
lock_extent_bits(&inode->io_tree, start, end, cachedp);
ordered = btrfs_lookup_ordered_range(inode, start,
end - start + 1);
if (!ordered) {
@ -999,7 +869,7 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
refcount_dec(&cache->refs);
break;
}
unlock_extent_cached(tree, start, end, cachedp);
unlock_extent_cached(&inode->io_tree, start, end, cachedp);
btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
}

View File

@ -52,11 +52,6 @@ enum {
BTRFS_ORDERED_DIRECT,
/* We had an io error when writing this out */
BTRFS_ORDERED_IOERR,
/*
* indicates whether this ordered extent has done its due diligence in
* updating the isize
*/
BTRFS_ORDERED_UPDATED_ISIZE,
/* Set when we have to truncate an extent */
BTRFS_ORDERED_TRUNCATED,
/* Regular IO for COW */
@ -182,16 +177,13 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode,
u64 file_offset,
u64 len);
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u8 *sum, int len);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
const u64 range_start, const u64 range_len);
void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
struct btrfs_inode *inode, u64 start,
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state);
int __init ordered_data_init(void);

View File

@ -383,7 +383,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
if (need_reserve) {
btrfs_block_rsv_release(fs_info, trans->block_rsv,
num_bytes);
num_bytes, NULL);
if (ret)
return ret;
}

View File

@ -1030,6 +1030,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
}
@ -1037,11 +1038,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
out_free_path:
btrfs_free_path(path);
out_free_root:
if (ret) {
free_extent_buffer(quota_root->node);
free_extent_buffer(quota_root->commit_root);
kfree(quota_root);
}
if (ret)
btrfs_put_root(quota_root);
out:
if (ret) {
ulist_free(fs_info->qgroup_ulist);
@ -1104,9 +1102,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_tree_unlock(quota_root->node);
btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
free_extent_buffer(quota_root->node);
free_extent_buffer(quota_root->commit_root);
kfree(quota_root);
btrfs_put_root(quota_root);
end_trans:
ret = btrfs_end_transaction(trans);
@ -3237,7 +3233,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
}
mutex_lock(&fs_info->qgroup_rescan_lock);
spin_lock(&fs_info->qgroup_lock);
if (init_flags) {
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
@ -3252,7 +3247,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
}
if (ret) {
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
return ret;
}
@ -3263,9 +3257,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
sizeof(fs_info->qgroup_rescan_progress));
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
init_completion(&fs_info->qgroup_rescan_completion);
fs_info->qgroup_rescan_running = true;
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
btrfs_init_work(&fs_info->qgroup_rescan_work,
@ -3326,8 +3317,11 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
qgroup_rescan_zero_tracking(fs_info);
mutex_lock(&fs_info->qgroup_rescan_lock);
fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
mutex_unlock(&fs_info->qgroup_rescan_lock);
return 0;
}
@ -3339,9 +3333,7 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
int ret = 0;
mutex_lock(&fs_info->qgroup_rescan_lock);
spin_lock(&fs_info->qgroup_lock);
running = fs_info->qgroup_rescan_running;
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
if (!running)
@ -3363,9 +3355,13 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
void
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
{
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
mutex_lock(&fs_info->qgroup_rescan_lock);
fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
mutex_unlock(&fs_info->qgroup_rescan_lock);
}
}
/*

View File

@ -206,7 +206,6 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
struct btrfs_stripe_hash *h;
int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
int i;
int table_size;
if (info->stripe_hash_table)
return 0;
@ -218,8 +217,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
* Try harder to allocate and fallback to vmalloc to lower the chance
* of a failing mount.
*/
table_size = sizeof(*table) + sizeof(*h) * num_entries;
table = kvzalloc(table_size, GFP_KERNEL);
table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
if (!table)
return -ENOMEM;
@ -1196,22 +1194,19 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
int nr_data = rbio->nr_data;
int stripe;
int pagenr;
int p_stripe = -1;
int q_stripe = -1;
bool has_qstripe;
struct bio_list bio_list;
struct bio *bio;
int ret;
bio_list_init(&bio_list);
if (rbio->real_stripes - rbio->nr_data == 1) {
p_stripe = rbio->real_stripes - 1;
} else if (rbio->real_stripes - rbio->nr_data == 2) {
p_stripe = rbio->real_stripes - 2;
q_stripe = rbio->real_stripes - 1;
} else {
if (rbio->real_stripes - rbio->nr_data == 1)
has_qstripe = false;
else if (rbio->real_stripes - rbio->nr_data == 2)
has_qstripe = true;
else
BUG();
}
/* at this point we either have a full stripe,
* or we've read the full stripe from the drive.
@ -1255,7 +1250,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
SetPageUptodate(p);
pointers[stripe++] = kmap(p);
if (q_stripe != -1) {
if (has_qstripe) {
/*
* raid6, add the qstripe and call the
@ -2353,8 +2348,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int nr_data = rbio->nr_data;
int stripe;
int pagenr;
int p_stripe = -1;
int q_stripe = -1;
bool has_qstripe;
struct page *p_page = NULL;
struct page *q_page = NULL;
struct bio_list bio_list;
@ -2364,14 +2358,12 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
bio_list_init(&bio_list);
if (rbio->real_stripes - rbio->nr_data == 1) {
p_stripe = rbio->real_stripes - 1;
} else if (rbio->real_stripes - rbio->nr_data == 2) {
p_stripe = rbio->real_stripes - 2;
q_stripe = rbio->real_stripes - 1;
} else {
if (rbio->real_stripes - rbio->nr_data == 1)
has_qstripe = false;
else if (rbio->real_stripes - rbio->nr_data == 2)
has_qstripe = true;
else
BUG();
}
if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
is_replace = 1;
@ -2393,7 +2385,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
goto cleanup;
SetPageUptodate(p_page);
if (q_stripe != -1) {
if (has_qstripe) {
q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (!q_page) {
__free_page(p_page);
@ -2416,8 +2408,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
/* then add the parity stripe */
pointers[stripe++] = kmap(p_page);
if (q_stripe != -1) {
if (has_qstripe) {
/*
* raid6, add the qstripe and call the
* library function to fill in our p/q

View File

@ -8,7 +8,7 @@
struct rcu_string {
struct rcu_head rcu;
char str[0];
char str[];
};
static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)

View File

@ -803,6 +803,15 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
kfree(ref);
kfree(ra);
goto out_unlock;
} else if (be->num_refs == 0) {
btrfs_err(fs_info,
"trying to do action %d for a bytenr that has 0 total references",
action);
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
kfree(ref);
kfree(ra);
goto out_unlock;
}
if (!parent) {

804
fs/btrfs/reflink.c Normal file
View File

@ -0,0 +1,804 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/blkdev.h>
#include <linux/iversion.h>
#include "compression.h"
#include "ctree.h"
#include "delalloc-space.h"
#include "reflink.h"
#include "transaction.h"
#define BTRFS_MAX_DEDUPE_LEN SZ_16M
static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
struct inode *inode,
u64 endoff,
const u64 destoff,
const u64 olen,
int no_time_update)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
inode_inc_iversion(inode);
if (!no_time_update)
inode->i_mtime = inode->i_ctime = current_time(inode);
/*
* We round up to the block size at eof when determining which
* extents to clone above, but shouldn't round up the file size.
*/
if (endoff > destoff + olen)
endoff = destoff + olen;
if (endoff > inode->i_size) {
i_size_write(inode, endoff);
btrfs_inode_safe_disk_i_size_write(inode, 0);
}
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
}
ret = btrfs_end_transaction(trans);
out:
return ret;
}
static int copy_inline_to_page(struct inode *inode,
const u64 file_offset,
char *inline_data,
const u64 size,
const u64 datal,
const u8 comp_type)
{
const u64 block_size = btrfs_inode_sectorsize(inode);
const u64 range_end = file_offset + block_size - 1;
const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
struct extent_changeset *data_reserved = NULL;
struct page *page = NULL;
int ret;
ASSERT(IS_ALIGNED(file_offset, block_size));
/*
* We have flushed and locked the ranges of the source and destination
* inodes, we also have locked the inodes, so we are safe to do a
* reservation here. Also we must not do the reservation while holding
* a transaction open, otherwise we would deadlock.
*/
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
block_size);
if (ret)
goto out;
page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT,
btrfs_alloc_write_mask(inode->i_mapping));
if (!page) {
ret = -ENOMEM;
goto out_unlock;
}
set_page_extent_mapped(page);
clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, NULL);
ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
if (ret)
goto out_unlock;
if (comp_type == BTRFS_COMPRESS_NONE) {
char *map;
map = kmap(page);
memcpy(map, data_start, datal);
flush_dcache_page(page);
kunmap(page);
} else {
ret = btrfs_decompress(comp_type, data_start, page, 0,
inline_size, datal);
if (ret)
goto out_unlock;
flush_dcache_page(page);
}
/*
* If our inline data is smaller then the block/page size, then the
* remaining of the block/page is equivalent to zeroes. We had something
* like the following done:
*
* $ xfs_io -f -c "pwrite -S 0xab 0 500" file
* $ sync # (or fsync)
* $ xfs_io -c "falloc 0 4K" file
* $ xfs_io -c "pwrite -S 0xcd 4K 4K"
*
* So what's in the range [500, 4095] corresponds to zeroes.
*/
if (datal < block_size) {
char *map;
map = kmap(page);
memset(map + datal, 0, block_size - datal);
flush_dcache_page(page);
kunmap(page);
}
SetPageUptodate(page);
ClearPageChecked(page);
set_page_dirty(page);
out_unlock:
if (page) {
unlock_page(page);
put_page(page);
}
if (ret)
btrfs_delalloc_release_space(inode, data_reserved, file_offset,
block_size, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
out:
extent_changeset_free(data_reserved);
return ret;
}
/*
* Deal with cloning of inline extents. We try to copy the inline extent from
* the source inode to destination inode when possible. When not possible we
* copy the inline extent's data into the respective page of the inode.
*/
static int clone_copy_inline_extent(struct inode *dst,
struct btrfs_path *path,
struct btrfs_key *new_key,
const u64 drop_start,
const u64 datal,
const u64 size,
const u8 comp_type,
char *inline_data,
struct btrfs_trans_handle **trans_out)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
struct btrfs_root *root = BTRFS_I(dst)->root;
const u64 aligned_end = ALIGN(new_key->offset + datal,
fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
int ret;
struct btrfs_key key;
if (new_key->offset > 0) {
ret = copy_inline_to_page(dst, new_key->offset, inline_data,
size, datal, comp_type);
goto out;
}
key.objectid = btrfs_ino(BTRFS_I(dst));
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
return ret;
} else if (ret > 0) {
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
return ret;
else if (ret > 0)
goto copy_inline_extent;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
key.type == BTRFS_EXTENT_DATA_KEY) {
/*
* There's an implicit hole at file offset 0, copy the
* inline extent's data to the page.
*/
ASSERT(key.offset > 0);
ret = copy_inline_to_page(dst, new_key->offset,
inline_data, size, datal,
comp_type);
goto out;
}
} else if (i_size_read(dst) <= datal) {
struct btrfs_file_extent_item *ei;
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
/*
* If it's an inline extent replace it with the source inline
* extent, otherwise copy the source inline extent data into
* the respective page at the destination inode.
*/
if (btrfs_file_extent_type(path->nodes[0], ei) ==
BTRFS_FILE_EXTENT_INLINE)
goto copy_inline_extent;
ret = copy_inline_to_page(dst, new_key->offset, inline_data,
size, datal, comp_type);
goto out;
}
copy_inline_extent:
ret = 0;
/*
* We have no extent items, or we have an extent at offset 0 which may
* or may not be inlined. All these cases are dealt the same way.
*/
if (i_size_read(dst) > datal) {
/*
* At the destination offset 0 we have either a hole, a regular
* extent or an inline extent larger then the one we want to
* clone. Deal with all these cases by copying the inline extent
* data into the respective page at the destination inode.
*/
ret = copy_inline_to_page(dst, new_key->offset, inline_data,
size, datal, comp_type);
goto out;
}
btrfs_release_path(path);
/*
* If we end up here it means were copy the inline extent into a leaf
* of the destination inode. We know we will drop or adjust at most one
* extent item in the destination root.
*
* 1 unit - adjusting old extent (we may have to split it)
* 1 unit - add new extent
* 1 unit - inode update
*/
trans = btrfs_start_transaction(root, 3);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
if (ret)
goto out;
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
if (ret)
goto out;
write_extent_buffer(path->nodes[0], inline_data,
btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]),
size);
inode_add_bytes(dst, datal);
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
out:
if (!ret && !trans) {
/*
* No transaction here means we copied the inline extent into a
* page of the destination inode.
*
* 1 unit to update inode item
*/
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
}
}
if (ret && trans) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
}
if (!ret)
*trans_out = trans;
return ret;
}
/**
* btrfs_clone() - clone a range from inode file to another
*
* @src: Inode to clone from
* @inode: Inode to clone to
* @off: Offset within source to start clone from
* @olen: Original length, passed by user, of range to clone
* @olen_aligned: Block-aligned value of olen
* @destoff: Offset within @inode to start clone
* @no_time_update: Whether to update mtime/ctime on the target inode
*/
static int btrfs_clone(struct inode *src, struct inode *inode,
const u64 off, const u64 olen, const u64 olen_aligned,
const u64 destoff, int no_time_update)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_path *path = NULL;
struct extent_buffer *leaf;
struct btrfs_trans_handle *trans;
char *buf = NULL;
struct btrfs_key key;
u32 nritems;
int slot;
int ret;
const u64 len = olen_aligned;
u64 last_dest_end = destoff;
ret = -ENOMEM;
buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
if (!buf)
return ret;
path = btrfs_alloc_path();
if (!path) {
kvfree(buf);
return ret;
}
path->reada = READA_FORWARD;
/* Clone data */
key.objectid = btrfs_ino(BTRFS_I(src));
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = off;
while (1) {
u64 next_key_min_offset = key.offset + 1;
struct btrfs_file_extent_item *extent;
int type;
u32 size;
struct btrfs_key new_key;
u64 disko = 0, diskl = 0;
u64 datao = 0, datal = 0;
u8 comp;
u64 drop_start;
/* Note the key will change type as we walk through the tree */
path->leave_spinning = 1;
ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
0, 0);
if (ret < 0)
goto out;
/*
* First search, if no extent item that starts at offset off was
* found but the previous item is an extent item, it's possible
* it might overlap our target range, therefore process it.
*/
if (key.offset == off && ret > 0 && path->slots[0] > 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key,
path->slots[0] - 1);
if (key.type == BTRFS_EXTENT_DATA_KEY)
path->slots[0]--;
}
nritems = btrfs_header_nritems(path->nodes[0]);
process_slot:
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
if (ret < 0)
goto out;
if (ret > 0)
break;
nritems = btrfs_header_nritems(path->nodes[0]);
}
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.type > BTRFS_EXTENT_DATA_KEY ||
key.objectid != btrfs_ino(BTRFS_I(src)))
break;
ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
comp = btrfs_file_extent_compression(leaf, extent);
type = btrfs_file_extent_type(leaf, extent);
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
disko = btrfs_file_extent_disk_bytenr(leaf, extent);
diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
datao = btrfs_file_extent_offset(leaf, extent);
datal = btrfs_file_extent_num_bytes(leaf, extent);
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
/* Take upper bound, may be compressed */
datal = btrfs_file_extent_ram_bytes(leaf, extent);
}
/*
* The first search might have left us at an extent item that
* ends before our target range's start, can happen if we have
* holes and NO_HOLES feature enabled.
*/
if (key.offset + datal <= off) {
path->slots[0]++;
goto process_slot;
} else if (key.offset >= off + len) {
break;
}
next_key_min_offset = key.offset + datal;
size = btrfs_item_size_nr(leaf, slot);
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
size);
btrfs_release_path(path);
path->leave_spinning = 0;
memcpy(&new_key, &key, sizeof(new_key));
new_key.objectid = btrfs_ino(BTRFS_I(inode));
if (off <= key.offset)
new_key.offset = key.offset + destoff - off;
else
new_key.offset = destoff;
/*
* Deal with a hole that doesn't have an extent item that
* represents it (NO_HOLES feature enabled).
* This hole is either in the middle of the cloning range or at
* the beginning (fully overlaps it or partially overlaps it).
*/
if (new_key.offset != last_dest_end)
drop_start = last_dest_end;
else
drop_start = new_key.offset;
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
struct btrfs_clone_extent_info clone_info;
/*
* a | --- range to clone ---| b
* | ------------- extent ------------- |
*/
/* Subtract range b */
if (key.offset + datal > off + len)
datal = off + len - key.offset;
/* Subtract range a */
if (off > key.offset) {
datao += off - key.offset;
datal -= off - key.offset;
}
clone_info.disk_offset = disko;
clone_info.disk_len = diskl;
clone_info.data_offset = datao;
clone_info.data_len = datal;
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
clone_info.item_size = size;
ret = btrfs_punch_hole_range(inode, path, drop_start,
new_key.offset + datal - 1, &clone_info,
&trans);
if (ret)
goto out;
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
/*
* Inline extents always have to start at file offset 0
* and can never be bigger then the sector size. We can
* never clone only parts of an inline extent, since all
* reflink operations must start at a sector size aligned
* offset, and the length must be aligned too or end at
* the i_size (which implies the whole inlined data).
*/
ASSERT(key.offset == 0);
ASSERT(datal <= fs_info->sectorsize);
if (key.offset != 0 || datal > fs_info->sectorsize)
return -EUCLEAN;
ret = clone_copy_inline_extent(inode, path, &new_key,
drop_start, datal, size,
comp, buf, &trans);
if (ret)
goto out;
}
btrfs_release_path(path);
last_dest_end = ALIGN(new_key.offset + datal,
fs_info->sectorsize);
ret = clone_finish_inode_update(trans, inode, last_dest_end,
destoff, olen, no_time_update);
if (ret)
goto out;
if (new_key.offset + datal >= destoff + len)
break;
btrfs_release_path(path);
key.offset = next_key_min_offset;
if (fatal_signal_pending(current)) {
ret = -EINTR;
goto out;
}
}
ret = 0;
if (last_dest_end < destoff + len) {
/*
* We have an implicit hole that fully or partially overlaps our
* cloning range at its end. This means that we either have the
* NO_HOLES feature enabled or the implicit hole happened due to
* mixing buffered and direct IO writes against this file.
*/
btrfs_release_path(path);
path->leave_spinning = 0;
ret = btrfs_punch_hole_range(inode, path, last_dest_end,
destoff + len - 1, NULL, &trans);
if (ret)
goto out;
ret = clone_finish_inode_update(trans, inode, destoff + len,
destoff, olen, no_time_update);
}
out:
btrfs_free_path(path);
kvfree(buf);
return ret;
}
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
struct inode *inode2, u64 loff2, u64 len)
{
unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
struct inode *inode2, u64 loff2, u64 len)
{
if (inode1 < inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
} else if (inode1 == inode2 && loff2 < loff1) {
swap(loff1, loff2);
}
lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
int ret;
/*
* Lock destination range to serialize with concurrent readpages() and
* source range to serialize with relocation.
*/
btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
return ret;
}
static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
int ret;
u64 i, tail_len, chunk_count;
struct btrfs_root *root_dst = BTRFS_I(dst)->root;
spin_lock(&root_dst->root_item_lock);
if (root_dst->send_in_progress) {
btrfs_warn_rl(root_dst->fs_info,
"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
root_dst->root_key.objectid,
root_dst->send_in_progress);
spin_unlock(&root_dst->root_item_lock);
return -EAGAIN;
}
root_dst->dedupe_in_progress++;
spin_unlock(&root_dst->root_item_lock);
tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
for (i = 0; i < chunk_count; i++) {
ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
dst, dst_loff);
if (ret)
goto out;
loff += BTRFS_MAX_DEDUPE_LEN;
dst_loff += BTRFS_MAX_DEDUPE_LEN;
}
if (tail_len > 0)
ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
out:
spin_lock(&root_dst->root_item_lock);
root_dst->dedupe_in_progress--;
spin_unlock(&root_dst->root_item_lock);
return ret;
}
static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
u64 off, u64 olen, u64 destoff)
{
struct inode *inode = file_inode(file);
struct inode *src = file_inode(file_src);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret;
int wb_ret;
u64 len = olen;
u64 bs = fs_info->sb->s_blocksize;
/*
* VFS's generic_remap_file_range_prep() protects us from cloning the
* eof block into the middle of a file, which would result in corruption
* if the file size is not blocksize aligned. So we don't need to check
* for that case here.
*/
if (off + len == src->i_size)
len = ALIGN(src->i_size, bs) - off;
if (destoff > inode->i_size) {
const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
ret = btrfs_cont_expand(inode, inode->i_size, destoff);
if (ret)
return ret;
/*
* We may have truncated the last block if the inode's size is
* not sector size aligned, so we need to wait for writeback to
* complete before proceeding further, otherwise we can race
* with cloning and attempt to increment a reference to an
* extent that no longer exists (writeback completed right after
* we found the previous extent covering eof and before we
* attempted to increment its reference count).
*/
ret = btrfs_wait_ordered_range(inode, wb_start,
destoff - wb_start);
if (ret)
return ret;
}
/*
* Lock destination range to serialize with concurrent readpages() and
* source range to serialize with relocation.
*/
btrfs_double_extent_lock(src, off, inode, destoff, len);
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
btrfs_double_extent_unlock(src, off, inode, destoff, len);
/*
* We may have copied an inline extent into a page of the destination
* range, so wait for writeback to complete before truncating pages
* from the page cache. This is a rare case.
*/
wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
ret = ret ? ret : wb_ret;
/*
* Truncate page cache pages so that future reads will see the cloned
* data immediately and not the previous data.
*/
truncate_inode_pages_range(&inode->i_data,
round_down(destoff, PAGE_SIZE),
round_up(destoff + len, PAGE_SIZE) - 1);
return ret;
}
static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *len, unsigned int remap_flags)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
bool same_inode = inode_out == inode_in;
u64 wb_len;
int ret;
if (!(remap_flags & REMAP_FILE_DEDUP)) {
struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
if (btrfs_root_readonly(root_out))
return -EROFS;
if (file_in->f_path.mnt != file_out->f_path.mnt ||
inode_in->i_sb != inode_out->i_sb)
return -EXDEV;
}
/* Don't make the dst file partly checksummed */
if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
(BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
return -EINVAL;
}
/*
* Now that the inodes are locked, we need to start writeback ourselves
* and can not rely on the writeback from the VFS's generic helper
* generic_remap_file_range_prep() because:
*
* 1) For compression we must call filemap_fdatawrite_range() range
* twice (btrfs_fdatawrite_range() does it for us), and the generic
* helper only calls it once;
*
* 2) filemap_fdatawrite_range(), called by the generic helper only
* waits for the writeback to complete, i.e. for IO to be done, and
* not for the ordered extents to complete. We need to wait for them
* to complete so that new file extent items are in the fs tree.
*/
if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
else
wb_len = ALIGN(*len, bs);
/*
* Since we don't lock ranges, wait for ongoing lockless dio writes (as
* any in progress could create its ordered extents after we wait for
* existing ordered extents below).
*/
inode_dio_wait(inode_in);
if (!same_inode)
inode_dio_wait(inode_out);
/*
* Workaround to make sure NOCOW buffered write reach disk as NOCOW.
*
* Btrfs' back references do not have a block level granularity, they
* work at the whole extent level.
* NOCOW buffered write without data space reserved may not be able
* to fall back to CoW due to lack of data space, thus could cause
* data loss.
*
* Here we take a shortcut by flushing the whole inode, so that all
* nocow write should reach disk as nocow before we increase the
* reference of the extent. We could do better by only flushing NOCOW
* data, but that needs extra accounting.
*
* Also we don't need to check ASYNC_EXTENT, as async extent will be
* CoWed anyway, not affecting nocow part.
*/
ret = filemap_flush(inode_in->i_mapping);
if (ret < 0)
return ret;
ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
wb_len);
if (ret < 0)
return ret;
ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
wb_len);
if (ret < 0)
return ret;
return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
len, remap_flags);
}
loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
{
struct inode *src_inode = file_inode(src_file);
struct inode *dst_inode = file_inode(dst_file);
bool same_inode = dst_inode == src_inode;
int ret;
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
if (same_inode)
inode_lock(src_inode);
else
lock_two_nondirectories(src_inode, dst_inode);
ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
&len, remap_flags);
if (ret < 0 || len == 0)
goto out_unlock;
if (remap_flags & REMAP_FILE_DEDUP)
ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
else
ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
out_unlock:
if (same_inode)
inode_unlock(src_inode);
else
unlock_two_nondirectories(src_inode, dst_inode);
return ret < 0 ? ret : len;
}

12
fs/btrfs/reflink.h Normal file
View File

@ -0,0 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_REFLINK_H
#define BTRFS_REFLINK_H
#include <linux/fs.h>
loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
#endif /* BTRFS_REFLINK_H */

File diff suppressed because it is too large Load Diff

View File

@ -22,7 +22,6 @@
static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
struct btrfs_root_item *item)
{
uuid_le uuid;
u32 len;
int need_reset = 0;
@ -44,8 +43,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
sizeof(*item) - offsetof(struct btrfs_root_item,
generation_v2));
uuid_le_gen(&uuid);
memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE);
generate_random_guid(item->uuid);
}
}
@ -255,25 +253,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
root_key.objectid = key.offset;
key.offset++;
/*
* The root might have been inserted already, as before we look
* for orphan roots, log replay might have happened, which
* triggers a transaction commit and qgroup accounting, which
* in turn reads and inserts fs roots while doing backref
* walking.
*/
root = btrfs_lookup_fs_root(fs_info, root_key.objectid);
if (root) {
WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
&root->state));
if (btrfs_root_refs(&root->root_item) == 0) {
set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
}
continue;
}
root = btrfs_read_fs_root(tree_root, &root_key);
root = btrfs_get_fs_root(fs_info, &root_key, false);
err = PTR_ERR_OR_ZERO(root);
if (err && err != -ENOENT) {
break;
@ -300,25 +280,12 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
continue;
}
err = btrfs_init_fs_root(root);
if (err) {
btrfs_free_fs_root(root);
break;
}
set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
err = btrfs_insert_fs_root(fs_info, root);
if (err) {
BUG_ON(err == -EEXIST);
btrfs_free_fs_root(root);
break;
}
WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
if (btrfs_root_refs(&root->root_item) == 0) {
set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
}
btrfs_put_root(root);
}
btrfs_free_path(path);
@ -553,5 +520,5 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv)
{
btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
}

View File

@ -149,7 +149,7 @@ struct scrub_parity {
*/
unsigned long *ebitmap;
unsigned long bitmap[0];
unsigned long bitmap[];
};
struct scrub_ctx {
@ -653,7 +653,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
root_key.objectid = root;
root_key.type = BTRFS_ROOT_ITEM_KEY;
root_key.offset = (u64)-1;
local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
local_root = btrfs_get_fs_root(fs_info, &root_key, true);
if (IS_ERR(local_root)) {
ret = PTR_ERR(local_root);
goto err;
@ -668,6 +668,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
if (ret) {
btrfs_put_root(local_root);
btrfs_release_path(swarn->path);
goto err;
}
@ -688,6 +689,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
ipath = init_ipath(4096, local_root, swarn->path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(ipath)) {
btrfs_put_root(local_root);
ret = PTR_ERR(ipath);
ipath = NULL;
goto err;
@ -711,6 +713,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
min(isize - offset, (u64)PAGE_SIZE), nlink,
(char *)(unsigned long)ipath->fspath->val[i]);
btrfs_put_root(local_root);
free_ipath(ipath);
return 0;

View File

@ -5586,10 +5586,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset)
{
struct btrfs_path *path;
struct btrfs_root *root = sctx->send_root;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
u64 extent_end;
u8 type;
int ret;
path = alloc_path_for_send();
@ -5609,18 +5606,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset)
if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
goto out;
fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
type = btrfs_file_extent_type(path->nodes[0], fi);
if (type == BTRFS_FILE_EXTENT_INLINE) {
u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi);
extent_end = ALIGN(key.offset + size,
sctx->send_root->fs_info->sectorsize);
} else {
extent_end = key.offset +
btrfs_file_extent_num_bytes(path->nodes[0], fi);
}
sctx->cur_inode_last_extent = extent_end;
sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
out:
btrfs_free_path(path);
return ret;
@ -5674,16 +5660,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
break;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) ==
BTRFS_FILE_EXTENT_INLINE) {
u64 size = btrfs_file_extent_ram_bytes(leaf, fi);
extent_end = ALIGN(key.offset + size,
root->fs_info->sectorsize);
} else {
extent_end = key.offset +
btrfs_file_extent_num_bytes(leaf, fi);
}
extent_end = btrfs_file_extent_end(path);
if (extent_end <= start)
goto next;
if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) {
@ -5704,9 +5681,6 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
struct btrfs_key *key)
{
struct btrfs_file_extent_item *fi;
u64 extent_end;
u8 type;
int ret = 0;
if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
@ -5718,18 +5692,6 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
return ret;
}
fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
type = btrfs_file_extent_type(path->nodes[0], fi);
if (type == BTRFS_FILE_EXTENT_INLINE) {
u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi);
extent_end = ALIGN(key->offset + size,
sctx->send_root->fs_info->sectorsize);
} else {
extent_end = key->offset +
btrfs_file_extent_num_bytes(path->nodes[0], fi);
}
if (path->slots[0] == 0 &&
sctx->cur_inode_last_extent < key->offset) {
/*
@ -5755,7 +5717,7 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
else
ret = 0;
}
sctx->cur_inode_last_extent = extent_end;
sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
return ret;
}
@ -7066,7 +7028,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
int clone_sources_to_rollback = 0;
unsigned alloc_size;
int sort_clone_roots = 0;
int index;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@ -7193,11 +7154,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
index = srcu_read_lock(&fs_info->subvol_srcu);
clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
clone_root = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(clone_root)) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(clone_root);
goto out;
}
@ -7205,20 +7163,19 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
if (!btrfs_root_readonly(clone_root) ||
btrfs_root_dead(clone_root)) {
spin_unlock(&clone_root->root_item_lock);
srcu_read_unlock(&fs_info->subvol_srcu, index);
btrfs_put_root(clone_root);
ret = -EPERM;
goto out;
}
if (clone_root->dedupe_in_progress) {
dedupe_in_progress_warn(clone_root);
spin_unlock(&clone_root->root_item_lock);
srcu_read_unlock(&fs_info->subvol_srcu, index);
btrfs_put_root(clone_root);
ret = -EAGAIN;
goto out;
}
clone_root->send_in_progress++;
spin_unlock(&clone_root->root_item_lock);
srcu_read_unlock(&fs_info->subvol_srcu, index);
sctx->clone_roots[i].root = clone_root;
clone_sources_to_rollback = i + 1;
@ -7232,11 +7189,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
index = srcu_read_lock(&fs_info->subvol_srcu);
sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
sctx->parent_root = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(sctx->parent_root)) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(sctx->parent_root);
goto out;
}
@ -7246,20 +7200,16 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
if (!btrfs_root_readonly(sctx->parent_root) ||
btrfs_root_dead(sctx->parent_root)) {
spin_unlock(&sctx->parent_root->root_item_lock);
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -EPERM;
goto out;
}
if (sctx->parent_root->dedupe_in_progress) {
dedupe_in_progress_warn(sctx->parent_root);
spin_unlock(&sctx->parent_root->root_item_lock);
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -EAGAIN;
goto out;
}
spin_unlock(&sctx->parent_root->root_item_lock);
srcu_read_unlock(&fs_info->subvol_srcu, index);
}
/*
@ -7267,7 +7217,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
* is behind the current send position. This is checked while searching
* for possible clone sources.
*/
sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
sctx->clone_roots[sctx->clone_roots_cnt++].root =
btrfs_grab_root(sctx->send_root);
/* We do a bsearch later */
sort(sctx->clone_roots, sctx->clone_roots_cnt,
@ -7352,18 +7303,24 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
}
if (sort_clone_roots) {
for (i = 0; i < sctx->clone_roots_cnt; i++)
for (i = 0; i < sctx->clone_roots_cnt; i++) {
btrfs_root_dec_send_in_progress(
sctx->clone_roots[i].root);
btrfs_put_root(sctx->clone_roots[i].root);
}
} else {
for (i = 0; sctx && i < clone_sources_to_rollback; i++)
for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
btrfs_root_dec_send_in_progress(
sctx->clone_roots[i].root);
btrfs_put_root(sctx->clone_roots[i].root);
}
btrfs_root_dec_send_in_progress(send_root);
}
if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
btrfs_root_dec_send_in_progress(sctx->parent_root);
btrfs_put_root(sctx->parent_root);
}
kvfree(clone_sources_tmp);

View File

@ -10,6 +10,153 @@
#include "transaction.h"
#include "block-group.h"
/*
* HOW DOES SPACE RESERVATION WORK
*
* If you want to know about delalloc specifically, there is a separate comment
* for that with the delalloc code. This comment is about how the whole system
* works generally.
*
* BASIC CONCEPTS
*
* 1) space_info. This is the ultimate arbiter of how much space we can use.
* There's a description of the bytes_ fields with the struct declaration,
* refer to that for specifics on each field. Suffice it to say that for
* reservations we care about total_bytes - SUM(space_info->bytes_) when
* determining if there is space to make an allocation. There is a space_info
* for METADATA, SYSTEM, and DATA areas.
*
* 2) block_rsv's. These are basically buckets for every different type of
* metadata reservation we have. You can see the comment in the block_rsv
* code on the rules for each type, but generally block_rsv->reserved is how
* much space is accounted for in space_info->bytes_may_use.
*
* 3) btrfs_calc*_size. These are the worst case calculations we used based
* on the number of items we will want to modify. We have one for changing
* items, and one for inserting new items. Generally we use these helpers to
* determine the size of the block reserves, and then use the actual bytes
* values to adjust the space_info counters.
*
* MAKING RESERVATIONS, THE NORMAL CASE
*
* We call into either btrfs_reserve_data_bytes() or
* btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
* num_bytes we want to reserve.
*
* ->reserve
* space_info->bytes_may_reserve += num_bytes
*
* ->extent allocation
* Call btrfs_add_reserved_bytes() which does
* space_info->bytes_may_reserve -= num_bytes
* space_info->bytes_reserved += extent_bytes
*
* ->insert reference
* Call btrfs_update_block_group() which does
* space_info->bytes_reserved -= extent_bytes
* space_info->bytes_used += extent_bytes
*
* MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
*
* Assume we are unable to simply make the reservation because we do not have
* enough space
*
* -> __reserve_bytes
* create a reserve_ticket with ->bytes set to our reservation, add it to
* the tail of space_info->tickets, kick async flush thread
*
* ->handle_reserve_ticket
* wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
* on the ticket.
*
* -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
* Flushes various things attempting to free up space.
*
* -> btrfs_try_granting_tickets()
* This is called by anything that either subtracts space from
* space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
* space_info->total_bytes. This loops through the ->priority_tickets and
* then the ->tickets list checking to see if the reservation can be
* completed. If it can the space is added to space_info->bytes_may_use and
* the ticket is woken up.
*
* -> ticket wakeup
* Check if ->bytes == 0, if it does we got our reservation and we can carry
* on, if not return the appropriate error (ENOSPC, but can be EINTR if we
* were interrupted.)
*
* MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
*
* Same as the above, except we add ourselves to the
* space_info->priority_tickets, and we do not use ticket->wait, we simply
* call flush_space() ourselves for the states that are safe for us to call
* without deadlocking and hope for the best.
*
* THE FLUSHING STATES
*
* Generally speaking we will have two cases for each state, a "nice" state
* and a "ALL THE THINGS" state. In btrfs we delay a lot of work in order to
* reduce the locking over head on the various trees, and even to keep from
* doing any work at all in the case of delayed refs. Each of these delayed
* things however hold reservations, and so letting them run allows us to
* reclaim space so we can make new reservations.
*
* FLUSH_DELAYED_ITEMS
* Every inode has a delayed item to update the inode. Take a simple write
* for example, we would update the inode item at write time to update the
* mtime, and then again at finish_ordered_io() time in order to update the
* isize or bytes. We keep these delayed items to coalesce these operations
* into a single operation done on demand. These are an easy way to reclaim
* metadata space.
*
* FLUSH_DELALLOC
* Look at the delalloc comment to get an idea of how much space is reserved
* for delayed allocation. We can reclaim some of this space simply by
* running delalloc, but usually we need to wait for ordered extents to
* reclaim the bulk of this space.
*
* FLUSH_DELAYED_REFS
* We have a block reserve for the outstanding delayed refs space, and every
* delayed ref operation holds a reservation. Running these is a quick way
* to reclaim space, but we want to hold this until the end because COW can
* churn a lot and we can avoid making some extent tree modifications if we
* are able to delay for as long as possible.
*
* ALLOC_CHUNK
* We will skip this the first time through space reservation, because of
* overcommit and we don't want to have a lot of useless metadata space when
* our worst case reservations will likely never come true.
*
* RUN_DELAYED_IPUTS
* If we're freeing inodes we're likely freeing checksums, file extent
* items, and extent tree items. Loads of space could be freed up by these
* operations, however they won't be usable until the transaction commits.
*
* COMMIT_TRANS
* may_commit_transaction() is the ultimate arbiter on whether we commit the
* transaction or not. In order to avoid constantly churning we do all the
* above flushing first and then commit the transaction as the last resort.
* However we need to take into account things like pinned space that would
* be freed, plus any delayed work we may not have gotten rid of in the case
* of metadata.
*
* OVERCOMMIT
*
* Because we hold so many reservations for metadata we will allow you to
* reserve more space than is currently free in the currently allocate
* metadata space. This only happens with metadata, data does not allow
* overcommitting.
*
* You can see the current logic for when we allow overcommit in
* btrfs_can_overcommit(), but it only applies to unallocated space. If there
* is no unallocated space to be had, all reservations are kept within the
* free space in the allocated metadata chunks.
*
* Because of overcommitting, you generally want to use the
* btrfs_can_overcommit() logic for metadata allocations, as it does the right
* thing with or without extra unallocated space.
*/
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
bool may_use_included)
{
@ -159,25 +306,19 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
return (global->size << 1);
}
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
{
u64 profile;
u64 avail;
u64 used;
int factor;
/* Don't overcommit when in mixed mode. */
if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
return 0;
if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
profile = btrfs_system_alloc_profile(fs_info);
else
profile = btrfs_metadata_alloc_profile(fs_info);
used = btrfs_space_info_used(space_info, true);
avail = atomic64_read(&fs_info->free_chunk_space);
/*
@ -198,6 +339,22 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
avail >>= 3;
else
avail >>= 1;
return avail;
}
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
u64 avail;
u64 used;
/* Don't overcommit when in mixed mode */
if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
return 0;
used = btrfs_space_info_used(space_info, true);
avail = calc_available_free_space(fs_info, space_info, flush);
if (used + bytes < space_info->total_bytes + avail)
return 1;
@ -232,6 +389,8 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
space_info,
ticket->bytes);
list_del_init(&ticket->list);
ASSERT(space_info->reclaim_size >= ticket->bytes);
space_info->reclaim_size -= ticket->bytes;
ticket->bytes = 0;
space_info->tickets_id++;
wake_up(&ticket->wait);
@ -627,15 +786,26 @@ static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
struct reserve_ticket *ticket;
u64 used;
u64 avail;
u64 expected;
u64 to_reclaim = 0;
u64 to_reclaim = space_info->reclaim_size;
lockdep_assert_held(&space_info->lock);
avail = calc_available_free_space(fs_info, space_info,
BTRFS_RESERVE_FLUSH_ALL);
used = btrfs_space_info_used(space_info, true);
/*
* We may be flushing because suddenly we have less space than we had
* before, and now we're well over-committed based on our current free
* space. If that's the case add in our overage so we make sure to put
* appropriate pressure on the flushing state machine.
*/
if (space_info->total_bytes + avail < used)
to_reclaim += used - (space_info->total_bytes + avail);
list_for_each_entry(ticket, &space_info->tickets, list)
to_reclaim += ticket->bytes;
list_for_each_entry(ticket, &space_info->priority_tickets, list)
to_reclaim += ticket->bytes;
if (to_reclaim)
return to_reclaim;
@ -1020,8 +1190,10 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* the list and we will do our own flushing further down.
*/
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
ASSERT(space_info->reclaim_size >= 0);
ticket.bytes = orig_bytes;
ticket.error = 0;
space_info->reclaim_size += ticket.bytes;
init_waitqueue_head(&ticket.wait);
if (flush == BTRFS_RESERVE_FLUSH_ALL) {
list_add_tail(&ticket.list, &space_info->tickets);

View File

@ -54,6 +54,13 @@ struct btrfs_space_info {
struct list_head ro_bgs;
struct list_head priority_tickets;
struct list_head tickets;
/*
* Size of space that needs to be reclaimed in order to satisfy pending
* tickets
*/
u64 reclaim_size;
/*
* tickets_id just indicates the next ticket will be handled, so note
* it's not stored per ticket.

View File

@ -244,7 +244,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
trans->aborted = errno;
WRITE_ONCE(trans->aborted, errno);
/* Nothing used. The other threads that have joined this
* transaction may be able to continue. */
if (!trans->dirty && list_empty(&trans->new_bgs)) {
@ -873,7 +873,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
#endif
case Opt_err:
btrfs_info(info, "unrecognized mount option '%s'", p);
btrfs_err(info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
goto out;
default:
@ -1024,11 +1024,11 @@ static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
return error;
}
static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
u64 subvol_objectid)
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
u64 subvol_objectid)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_root *fs_root;
struct btrfs_root *fs_root = NULL;
struct btrfs_root_ref *root_ref;
struct btrfs_inode_ref *inode_ref;
struct btrfs_key key;
@ -1096,9 +1096,10 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
key.objectid = subvol_objectid;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
fs_root = btrfs_read_fs_root_no_name(fs_info, &key);
fs_root = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(fs_root)) {
ret = PTR_ERR(fs_root);
fs_root = NULL;
goto err;
}
@ -1143,6 +1144,8 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
ptr[0] = '/';
btrfs_release_path(path);
}
btrfs_put_root(fs_root);
fs_root = NULL;
}
btrfs_free_path(path);
@ -1155,6 +1158,7 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
return name;
err:
btrfs_put_root(fs_root);
btrfs_free_path(path);
kfree(name);
return ERR_PTR(ret);
@ -1438,8 +1442,8 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
goto out;
}
}
subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb),
subvol_objectid);
subvol_name = btrfs_get_subvol_name_from_objectid(
btrfs_sb(mnt->mnt_sb), subvol_objectid);
if (IS_ERR(subvol_name)) {
root = ERR_CAST(subvol_name);
subvol_name = NULL;
@ -1518,14 +1522,17 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
/*
* Setup a dummy root and fs_info for test/set super. This is because
* we don't actually fill this stuff out until open_ctree, but we need
* it for searching for existing supers, so this lets us do that and
* then open_ctree will properly initialize everything later.
* then open_ctree will properly initialize the file system specific
* settings later. btrfs_init_fs_info initializes the static elements
* of the fs_info (locks and such) to make cleanup easier if we find a
* superblock with our given fs_devices later on at sget() time.
*/
fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
if (!fs_info) {
error = -ENOMEM;
goto error_sec_opts;
}
btrfs_init_fs_info(fs_info);
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
@ -1571,7 +1578,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
if (s->s_root) {
btrfs_close_devices(fs_devices);
free_fs_info(fs_info);
btrfs_free_fs_info(fs_info);
if ((flags ^ s->s_flags) & SB_RDONLY)
error = -EBUSY;
} else {
@ -1594,7 +1601,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
error_close_devices:
btrfs_close_devices(fs_devices);
error_fs_info:
free_fs_info(fs_info);
btrfs_free_fs_info(fs_info);
error_sec_opts:
security_free_mnt_opts(&new_sec_opts);
return ERR_PTR(error);
@ -2170,7 +2177,7 @@ static void btrfs_kill_super(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
kill_anon_super(sb);
free_fs_info(fs_info);
btrfs_free_fs_info(fs_info);
}
static struct file_system_type btrfs_fs_type = {
@ -2203,7 +2210,7 @@ static int btrfs_control_open(struct inode *inode, struct file *file)
}
/*
* used by btrfsctl to scan devices when no FS is mounted
* Used by /dev/btrfs-control for devices ioctls.
*/
static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)

View File

@ -155,7 +155,7 @@ static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
} else
val = can_modify_feature(fa);
return snprintf(buf, PAGE_SIZE, "%d\n", val);
return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}
static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
@ -295,7 +295,7 @@ static const struct attribute_group btrfs_feature_attr_group = {
static ssize_t rmdir_subvol_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
return snprintf(buf, PAGE_SIZE, "0\n");
return scnprintf(buf, PAGE_SIZE, "0\n");
}
BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
@ -310,12 +310,12 @@ static ssize_t supported_checksums_show(struct kobject *kobj,
* This "trick" only works as long as 'enum btrfs_csum_type' has
* no holes in it
*/
ret += snprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
(i == 0 ? "" : " "), btrfs_super_csum_name(i));
}
ret += snprintf(buf + ret, PAGE_SIZE - ret, "\n");
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
@ -350,7 +350,7 @@ static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%lld\n",
return scnprintf(buf, PAGE_SIZE, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discardable_bytes));
}
BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show);
@ -361,7 +361,7 @@ static ssize_t btrfs_discardable_extents_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%d\n",
return scnprintf(buf, PAGE_SIZE, "%d\n",
atomic_read(&fs_info->discard_ctl.discardable_extents));
}
BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show);
@ -372,7 +372,7 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%lld\n",
return scnprintf(buf, PAGE_SIZE, "%lld\n",
fs_info->discard_ctl.discard_bitmap_bytes);
}
BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show);
@ -383,7 +383,7 @@ static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%lld\n",
return scnprintf(buf, PAGE_SIZE, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discard_bytes_saved));
}
BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show);
@ -394,7 +394,7 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%lld\n",
return scnprintf(buf, PAGE_SIZE, "%lld\n",
fs_info->discard_ctl.discard_extent_bytes);
}
BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show);
@ -405,7 +405,7 @@ static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%u\n",
return scnprintf(buf, PAGE_SIZE, "%u\n",
READ_ONCE(fs_info->discard_ctl.iops_limit));
}
@ -435,7 +435,7 @@ static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%u\n",
return scnprintf(buf, PAGE_SIZE, "%u\n",
READ_ONCE(fs_info->discard_ctl.kbps_limit));
}
@ -465,7 +465,7 @@ static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%llu\n",
return scnprintf(buf, PAGE_SIZE, "%llu\n",
READ_ONCE(fs_info->discard_ctl.max_discard_size));
}
@ -530,7 +530,7 @@ static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
val = *value_ptr;
if (lock)
spin_unlock(lock);
return snprintf(buf, PAGE_SIZE, "%llu\n", val);
return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
}
static ssize_t global_rsv_size_show(struct kobject *kobj,
@ -576,7 +576,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
val += block_group->used;
}
up_read(&sinfo->groups_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", val);
return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
}
static struct attribute *raid_attrs[] = {
@ -613,7 +613,7 @@ static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
{
struct btrfs_space_info *sinfo = to_space_info(kobj);
s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned);
return snprintf(buf, PAGE_SIZE, "%lld\n", val);
return scnprintf(buf, PAGE_SIZE, "%lld\n", val);
}
SPACE_INFO_ATTR(flags);
@ -670,7 +670,7 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
ssize_t ret;
spin_lock(&fs_info->super_lock);
ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
ret = scnprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
spin_unlock(&fs_info->super_lock);
return ret;
@ -718,7 +718,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
}
BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
@ -728,8 +728,8 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%u\n",
fs_info->super_copy->sectorsize);
return scnprintf(buf, PAGE_SIZE, "%u\n",
fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
@ -739,8 +739,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%u\n",
fs_info->super_copy->sectorsize);
return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
@ -752,7 +751,7 @@ static ssize_t quota_override_show(struct kobject *kobj,
int quota_override;
quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
return snprintf(buf, PAGE_SIZE, "%d\n", quota_override);
return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override);
}
static ssize_t quota_override_store(struct kobject *kobj,
@ -790,7 +789,7 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%pU\n",
return scnprintf(buf, PAGE_SIZE, "%pU\n",
fs_info->fs_devices->metadata_uuid);
}
@ -802,7 +801,7 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj,
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
u16 csum_type = btrfs_super_csum_type(fs_info->super_copy);
return snprintf(buf, PAGE_SIZE, "%s (%s)\n",
return scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
btrfs_super_csum_name(csum_type),
crypto_shash_driver_name(fs_info->csum_shash));
}
@ -960,7 +959,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
addrm_unknown_feature_attrs(fs_info, false);
sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);
sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs);
btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL);
}
static const char * const btrfs_feature_set_names[FEAT_MAX] = {
@ -992,7 +991,7 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags)
continue;
name = btrfs_feature_attrs[set][i].kobj_attr.attr.name;
len += snprintf(str + len, bufsize - len, "%s%s",
len += scnprintf(str + len, bufsize - len, "%s%s",
len ? "," : "", name);
}
@ -1149,7 +1148,7 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
/* when one_device is NULL, it removes all device links */
int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
struct hd_struct *disk;
@ -1201,11 +1200,11 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
return snprintf(buf, PAGE_SIZE, "%d\n", val);
return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}
BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show);
static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj,
static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
int val;
@ -1214,9 +1213,9 @@ static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
return snprintf(buf, PAGE_SIZE, "%d\n", val);
return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}
BTRFS_ATTR(devid, missing, btrfs_sysfs_missing_show);
BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show);
static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
struct kobj_attribute *a,
@ -1228,7 +1227,7 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
return snprintf(buf, PAGE_SIZE, "%d\n", val);
return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}
BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
@ -1241,7 +1240,7 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
return snprintf(buf, PAGE_SIZE, "%d\n", val);
return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}
BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
@ -1269,7 +1268,7 @@ static struct kobj_type devid_ktype = {
.release = btrfs_release_devid_kobj,
};
int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
int error = 0;
@ -1371,7 +1370,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
if (!fs_devs->devices_kobj) {
btrfs_err(fs_devs->fs_info,
"failed to init sysfs device interface");
kobject_put(&fs_devs->fsid_kobj);
btrfs_sysfs_remove_fsid(fs_devs);
return -ENOMEM;
}
@ -1395,13 +1394,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
btrfs_set_fs_info_ptr(fs_info);
error = btrfs_sysfs_add_device_link(fs_devs, NULL);
error = btrfs_sysfs_add_devices_dir(fs_devs, NULL);
if (error)
return error;
error = sysfs_create_files(fsid_kobj, btrfs_attrs);
if (error) {
btrfs_sysfs_rm_device_link(fs_devs, NULL);
btrfs_sysfs_remove_devices_dir(fs_devs, NULL);
return error;
}

View File

@ -14,9 +14,9 @@ enum btrfs_feature_set {
char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
const char * const btrfs_feature_set_name(enum btrfs_feature_set set);
int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);

View File

@ -120,6 +120,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
kfree(fs_info);
return NULL;
}
INIT_LIST_HEAD(&fs_info->fs_devices->devices);
fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
GFP_KERNEL);
if (!fs_info->super_copy) {
@ -128,39 +130,10 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
return NULL;
}
btrfs_init_fs_info(fs_info);
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
if (init_srcu_struct(&fs_info->subvol_srcu)) {
kfree(fs_info->fs_devices);
kfree(fs_info->super_copy);
kfree(fs_info);
return NULL;
}
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->qgroup_lock);
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
mutex_init(&fs_info->qgroup_ioctl_lock);
mutex_init(&fs_info->qgroup_rescan_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
fs_info->running_transaction = NULL;
fs_info->qgroup_tree = RB_ROOT;
fs_info->qgroup_ulist = NULL;
atomic64_set(&fs_info->tree_mod_seq, 0);
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->fs_devices->devices);
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
extent_map_tree_init(&fs_info->mapping_tree);
fs_info->pinned_extents = &fs_info->freed_extents[0];
set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
test_mnt->mnt_sb->s_fs_info = fs_info;
@ -210,8 +183,9 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
}
btrfs_free_qgroup_config(fs_info);
btrfs_free_fs_roots(fs_info);
cleanup_srcu_struct(&fs_info->subvol_srcu);
kfree(fs_info->super_copy);
btrfs_check_leaked_roots(fs_info);
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->fs_devices);
kfree(fs_info);
}
@ -223,11 +197,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
return;
if (root->node) {
/* One for allocate_extent_buffer */
free_extent_buffer(root->node);
}
kfree(root);
btrfs_put_root(root);
}
struct btrfs_block_group *

View File

@ -507,6 +507,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
test_err("couldn't insert fs root %d", ret);
goto out;
}
btrfs_put_root(tmp_root);
tmp_root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(tmp_root)) {
@ -521,6 +522,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
test_err("couldn't insert fs root %d", ret);
goto out;
}
btrfs_put_root(tmp_root);
test_msg("running qgroup tests");
ret = test_no_shared_qgroup(root, sectorsize, nodesize);

View File

@ -221,7 +221,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
WARN_ON_ONCE(!list_empty(&trans->new_bgs));
btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
trans->chunk_bytes_reserved);
trans->chunk_bytes_reserved, NULL);
trans->chunk_bytes_reserved = 0;
}
@ -243,7 +243,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
cur_trans = fs_info->running_transaction;
if (cur_trans) {
if (cur_trans->aborted) {
if (TRANS_ABORTED(cur_trans)) {
spin_unlock(&fs_info->trans_lock);
return cur_trans->aborted;
}
@ -336,6 +336,8 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
IO_TREE_FS_PINNED_EXTENTS, NULL);
fs_info->generation++;
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
@ -459,7 +461,7 @@ static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
return (trans->state >= TRANS_STATE_COMMIT_START &&
trans->state < TRANS_STATE_UNBLOCKED &&
!trans->aborted);
!TRANS_ABORTED(trans));
}
/* wait for commit against the current transaction to become unblocked
@ -478,7 +480,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
wait_event(fs_info->transaction_wait,
cur_trans->state >= TRANS_STATE_UNBLOCKED ||
cur_trans->aborted);
TRANS_ABORTED(cur_trans));
btrfs_put_transaction(cur_trans);
} else {
spin_unlock(&fs_info->trans_lock);
@ -673,7 +675,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
alloc_fail:
if (num_bytes)
btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
num_bytes);
num_bytes, NULL);
reserve_fail:
btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved);
return ERR_PTR(ret);
@ -896,7 +898,7 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid, trans->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, trans->block_rsv,
trans->bytes_reserved);
trans->bytes_reserved, NULL);
trans->bytes_reserved = 0;
}
@ -937,7 +939,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (throttle)
btrfs_run_delayed_iputs(info);
if (trans->aborted ||
if (TRANS_ABORTED(trans) ||
test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
wake_up_process(info->transaction_kthread);
err = -EIO;
@ -1262,8 +1264,10 @@ void btrfs_add_dead_root(struct btrfs_root *root)
struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&fs_info->trans_lock);
if (list_empty(&root->root_list))
if (list_empty(&root->root_list)) {
btrfs_grab_root(root);
list_add_tail(&root->root_list, &fs_info->dead_roots);
}
spin_unlock(&fs_info->trans_lock);
}
@ -1477,7 +1481,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
u64 index = 0;
u64 objectid;
u64 root_flags;
uuid_le new_uuid;
ASSERT(pending->path);
path = pending->path;
@ -1570,8 +1573,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_set_root_generation_v2(new_root_item,
trans->transid);
uuid_le_gen(&new_uuid);
memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
generate_random_guid(new_root_item->uuid);
memcpy(new_root_item->parent_uuid, root->root_item.uuid,
BTRFS_UUID_SIZE);
if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
@ -1633,7 +1635,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
key.offset = (u64)-1;
pending->snap = btrfs_read_fs_root_no_name(fs_info, &key);
pending->snap = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
btrfs_abort_transaction(trans, ret);
@ -1682,7 +1684,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_uuid_tree_add(trans, new_uuid.b, BTRFS_UUID_KEY_SUBVOL,
ret = btrfs_uuid_tree_add(trans, new_root_item->uuid,
BTRFS_UUID_KEY_SUBVOL,
objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
@ -1794,7 +1797,8 @@ static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info,
struct btrfs_transaction *trans)
{
wait_event(fs_info->transaction_blocked_wait,
trans->state >= TRANS_STATE_COMMIT_START || trans->aborted);
trans->state >= TRANS_STATE_COMMIT_START ||
TRANS_ABORTED(trans));
}
/*
@ -1806,7 +1810,8 @@ static void wait_current_trans_commit_start_and_unblock(
struct btrfs_transaction *trans)
{
wait_event(fs_info->transaction_wait,
trans->state >= TRANS_STATE_UNBLOCKED || trans->aborted);
trans->state >= TRANS_STATE_UNBLOCKED ||
TRANS_ABORTED(trans));
}
/*
@ -2026,7 +2031,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
trans->dirty = true;
/* Stop the commit early if ->aborted is set */
if (unlikely(READ_ONCE(cur_trans->aborted))) {
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
btrfs_end_transaction(trans);
return ret;
@ -2100,7 +2105,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wait_for_commit(cur_trans);
if (unlikely(cur_trans->aborted))
if (TRANS_ABORTED(cur_trans))
ret = cur_trans->aborted;
btrfs_put_transaction(cur_trans);
@ -2119,7 +2124,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
spin_unlock(&fs_info->trans_lock);
wait_for_commit(prev_trans);
ret = prev_trans->aborted;
ret = READ_ONCE(prev_trans->aborted);
btrfs_put_transaction(prev_trans);
if (ret)
@ -2173,8 +2178,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
/* ->aborted might be set after the previous check, so check it */
if (unlikely(READ_ONCE(cur_trans->aborted))) {
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
goto scrub_continue;
}
@ -2191,10 +2195,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* core function of the snapshot creation.
*/
ret = create_pending_snapshots(trans);
if (ret) {
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
if (ret)
goto unlock_reloc;
/*
* We insert the dir indexes of the snapshots and update the inode
@ -2207,16 +2209,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* the nodes and leaves.
*/
ret = btrfs_run_delayed_items(trans);
if (ret) {
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
if (ret)
goto unlock_reloc;
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret) {
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
if (ret)
goto unlock_reloc;
/*
* make sure none of the code above managed to slip in a
@ -2242,11 +2240,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
mutex_lock(&fs_info->tree_log_mutex);
ret = commit_fs_roots(trans);
if (ret) {
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
if (ret)
goto unlock_tree_log;
/*
* Since the transaction is done, we can apply the pending changes
@ -2264,39 +2259,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* new delayed refs. Must handle them or qgroup can be wrong.
*/
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret) {
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
if (ret)
goto unlock_tree_log;
/*
* Since fs roots are all committed, we can get a quite accurate
* new_roots. So let's do quota accounting.
*/
ret = btrfs_qgroup_account_extents(trans);
if (ret < 0) {
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
if (ret < 0)
goto unlock_tree_log;
ret = commit_cowonly_roots(trans);
if (ret) {
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
if (ret)
goto unlock_tree_log;
/*
* The tasks which save the space cache and inode cache may also
* update ->aborted, check it.
*/
if (unlikely(READ_ONCE(cur_trans->aborted))) {
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
goto unlock_tree_log;
}
btrfs_prepare_extent_commit(fs_info);
@ -2343,6 +2327,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Error while writing out transaction");
/*
* reloc_mutex has been unlocked, tree_log_mutex is still held
* but we can't jump to unlock_tree_log causing double unlock
*/
mutex_unlock(&fs_info->tree_log_mutex);
goto scrub_continue;
}
@ -2391,6 +2379,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
unlock_tree_log:
mutex_unlock(&fs_info->tree_log_mutex);
unlock_reloc:
mutex_unlock(&fs_info->reloc_mutex);
scrub_continue:
btrfs_scrub_continue(fs_info);
cleanup_transaction:
@ -2434,13 +2426,18 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
btrfs_kill_all_delayed_nodes(root);
if (root->ino_cache_inode) {
iput(root->ino_cache_inode);
root->ino_cache_inode = NULL;
}
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
ret = btrfs_drop_snapshot(root, NULL, 0, 0);
ret = btrfs_drop_snapshot(root, 0, 0);
else
ret = btrfs_drop_snapshot(root, NULL, 1, 0);
ret = btrfs_drop_snapshot(root, 1, 0);
btrfs_put_root(root);
return (ret < 0) ? 0 : 1;
}

View File

@ -71,6 +71,7 @@ struct btrfs_transaction {
*/
struct list_head io_bgs;
struct list_head dropped_roots;
struct extent_io_tree pinned_extents;
/*
* we need to make sure block group deletion doesn't race with
@ -115,6 +116,10 @@ struct btrfs_trans_handle {
struct btrfs_block_rsv *orig_rsv;
refcount_t use_count;
unsigned int type;
/*
* Error code of transaction abort, set outside of locks and must use
* the READ_ONCE/WRITE_ONCE access
*/
short aborted;
bool adding_csums;
bool allocating_chunk;
@ -126,6 +131,14 @@ struct btrfs_trans_handle {
struct list_head new_bgs;
};
/*
* The abort status can be changed between calls and is not protected by locks.
* This accepts btrfs_transaction and btrfs_trans_handle as types. Once it's
* set to a non-zero value it does not change, so the macro should be in checks
* but is not necessary for further reads of the value.
*/
#define TRANS_ABORTED(trans) (unlikely(READ_ONCE((trans)->aborted)))
struct btrfs_pending_snapshot {
struct dentry *dentry;
struct inode *dir;

View File

@ -18,6 +18,8 @@
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"
#include "block-group.h"
#include "space-info.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
@ -94,8 +96,8 @@ enum {
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
const loff_t start,
const loff_t end,
u64 start,
u64 end,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@ -311,7 +313,7 @@ static int process_one_buffer(struct btrfs_root *log,
}
if (wc->pin)
ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
eb->len);
if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
@ -830,6 +832,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
goto out;
}
ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
extent_end - start);
if (ret)
goto out;
inode_add_bytes(inode, nbytes);
update_inode:
ret = btrfs_update_inode(trans, root, inode);
@ -2659,18 +2666,39 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
return ret;
}
/*
* Correctly adjust the reserved bytes occupied by a log tree extent buffer
*/
static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
{
struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(fs_info, start);
if (!cache) {
btrfs_err(fs_info, "unable to find block group for %llu", start);
return;
}
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->reserved -= fs_info->nodesize;
cache->space_info->bytes_reserved -= fs_info->nodesize;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
btrfs_put_block_group(cache);
}
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int *level,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 root_owner;
u64 bytenr;
u64 ptr_gen;
struct extent_buffer *next;
struct extent_buffer *cur;
struct extent_buffer *parent;
u32 blocksize;
int ret = 0;
@ -2690,9 +2718,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
blocksize = fs_info->nodesize;
parent = path->nodes[*level];
root_owner = btrfs_header_owner(parent);
next = btrfs_find_create_tree_block(fs_info, bytenr);
if (IS_ERR(next))
return PTR_ERR(next);
@ -2720,18 +2745,16 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
ret = btrfs_pin_reserved_extent(trans,
bytenr, blocksize);
if (ret) {
free_extent_buffer(next);
return ret;
}
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
}
WARN_ON(root_owner !=
BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_pin_reserved_extent(fs_info,
bytenr, blocksize);
if (ret) {
free_extent_buffer(next);
return ret;
unaccount_log_buffer(fs_info, bytenr);
}
}
free_extent_buffer(next);
@ -2762,7 +2785,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 root_owner;
int i;
int slot;
int ret;
@ -2775,13 +2797,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level == 0);
return 0;
} else {
struct extent_buffer *parent;
if (path->nodes[*level] == root->node)
parent = path->nodes[*level];
else
parent = path->nodes[*level + 1];
root_owner = btrfs_header_owner(parent);
ret = wc->process_func(root, path->nodes[*level], wc,
btrfs_header_generation(path->nodes[*level]),
*level);
@ -2799,17 +2814,18 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
ret = btrfs_pin_reserved_extent(trans,
path->nodes[*level]->start,
path->nodes[*level]->len);
if (ret)
return ret;
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
}
WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_pin_reserved_extent(fs_info,
path->nodes[*level]->start,
path->nodes[*level]->len);
if (ret)
return ret;
unaccount_log_buffer(fs_info,
path->nodes[*level]->start);
}
}
free_extent_buffer(path->nodes[*level]);
path->nodes[*level] = NULL;
@ -2880,15 +2896,15 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
ret = btrfs_pin_reserved_extent(trans,
next->start, next->len);
if (ret)
goto out;
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
unaccount_log_buffer(fs_info, next->start);
}
ret = btrfs_pin_reserved_extent(fs_info, next->start,
next->len);
if (ret)
goto out;
}
}
@ -3283,8 +3299,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
free_extent_buffer(log->node);
kfree(log);
btrfs_put_root(log);
}
/*
@ -4518,13 +4533,15 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path)
struct btrfs_path *path,
const u64 start,
const u64 end)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
const u64 i_size = i_size_read(&inode->vfs_inode);
u64 prev_extent_end = 0;
u64 prev_extent_end = start;
int ret;
if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
@ -4532,16 +4549,21 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
key.offset = start;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ret;
if (ret > 0 && path->slots[0] > 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
path->slots[0]--;
}
while (true) {
struct btrfs_file_extent_item *extent;
struct extent_buffer *leaf = path->nodes[0];
u64 len;
u64 extent_end;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
@ -4558,9 +4580,18 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
break;
extent_end = btrfs_file_extent_end(path);
if (extent_end <= start)
goto next_slot;
/* We have a hole, log it. */
if (prev_extent_end < key.offset) {
const u64 hole_len = key.offset - prev_extent_end;
u64 hole_len;
if (key.offset >= end)
hole_len = end - prev_extent_end;
else
hole_len = key.offset - prev_extent_end;
/*
* Release the path to avoid deadlocks with other code
@ -4590,27 +4621,20 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
}
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, extent) ==
BTRFS_FILE_EXTENT_INLINE) {
len = btrfs_file_extent_ram_bytes(leaf, extent);
prev_extent_end = ALIGN(key.offset + len,
fs_info->sectorsize);
} else {
len = btrfs_file_extent_num_bytes(leaf, extent);
prev_extent_end = key.offset + len;
}
prev_extent_end = min(extent_end, end);
if (extent_end >= end)
break;
next_slot:
path->slots[0]++;
cond_resched();
}
if (prev_extent_end < i_size) {
if (prev_extent_end < end && prev_extent_end < i_size) {
u64 hole_len;
btrfs_release_path(path);
hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
hole_len = min(ALIGN(i_size, fs_info->sectorsize), end);
hole_len -= prev_extent_end;
ret = btrfs_insert_file_extent(trans, root->log_root,
ino, prev_extent_end, 0, 0,
hole_len, 0, hole_len,
@ -4938,6 +4962,178 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
return ret;
}
static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_key *min_key,
const struct btrfs_key *max_key,
struct btrfs_path *path,
struct btrfs_path *dst_path,
const u64 logged_isize,
const bool recursive_logging,
const int inode_only,
const u64 start,
const u64 end,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
{
struct btrfs_root *root = inode->root;
int ins_start_slot = 0;
int ins_nr = 0;
int ret;
/*
* We must make sure we don't copy extent items that are entirely out of
* the range [start, end - 1]. This is not just an optimization to avoid
* copying but also needed to avoid a corruption where we end up with
* file extent items in the log tree that have overlapping ranges - this
* can happen if we race with ordered extent completion for ranges that
* are outside our target range. For example we copy an extent item and
* when we move to the next leaf, that extent was trimmed and a new one
* covering a subrange of it, but with a higher key, was inserted - we
* would then copy this other extent too, resulting in a log tree with
* 2 extent items that represent overlapping ranges.
*
* We can copy the entire extents at the range bondaries however, even
* if they cover an area outside the target range. That's ok.
*/
while (1) {
ret = btrfs_search_forward(root, min_key, path, trans->transid);
if (ret < 0)
return ret;
if (ret > 0) {
ret = 0;
break;
}
again:
/* Note, ins_nr might be > 0 here, cleanup outside the loop */
if (min_key->objectid != max_key->objectid)
break;
if (min_key->type > max_key->type)
break;
if (min_key->type == BTRFS_INODE_ITEM_KEY)
*need_log_inode_item = false;
if ((min_key->type == BTRFS_INODE_REF_KEY ||
min_key->type == BTRFS_INODE_EXTREF_KEY) &&
inode->generation == trans->transid &&
!recursive_logging) {
u64 other_ino = 0;
u64 other_parent = 0;
ret = btrfs_check_ref_name_override(path->nodes[0],
path->slots[0], min_key, inode,
&other_ino, &other_parent);
if (ret < 0) {
return ret;
} else if (ret > 0 && ctx &&
other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
if (ins_nr > 0) {
ins_nr++;
} else {
ins_nr = 1;
ins_start_slot = path->slots[0];
}
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr,
inode_only, logged_isize);
if (ret < 0)
return ret;
ins_nr = 0;
ret = log_conflicting_inodes(trans, root, path,
ctx, other_ino, other_parent);
if (ret)
return ret;
btrfs_release_path(path);
goto next_key;
}
}
/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
if (ins_nr == 0)
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot,
ins_nr, inode_only, logged_isize);
if (ret < 0)
return ret;
ins_nr = 0;
goto next_slot;
}
if (min_key->type == BTRFS_EXTENT_DATA_KEY) {
const u64 extent_end = btrfs_file_extent_end(path);
if (extent_end <= start) {
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path,
path, ins_start_slot,
ins_nr, inode_only,
logged_isize);
if (ret < 0)
return ret;
ins_nr = 0;
}
goto next_slot;
}
if (extent_end >= end) {
ins_nr++;
if (ins_nr == 1)
ins_start_slot = path->slots[0];
break;
}
}
if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
ins_nr++;
goto next_slot;
} else if (!ins_nr) {
ins_start_slot = path->slots[0];
ins_nr = 1;
goto next_slot;
}
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
ins_nr, inode_only, logged_isize);
if (ret < 0)
return ret;
ins_nr = 1;
ins_start_slot = path->slots[0];
next_slot:
path->slots[0]++;
if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
btrfs_item_key_to_cpu(path->nodes[0], min_key,
path->slots[0]);
goto again;
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr, inode_only,
logged_isize);
if (ret < 0)
return ret;
ins_nr = 0;
}
btrfs_release_path(path);
next_key:
if (min_key->offset < (u64)-1) {
min_key->offset++;
} else if (min_key->type < max_key->type) {
min_key->type++;
min_key->offset = 0;
} else {
break;
}
}
if (ins_nr)
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
ins_nr, inode_only, logged_isize);
return ret;
}
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@ -4955,8 +5151,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
const loff_t start,
const loff_t end,
u64 start,
u64 end,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@ -4967,9 +5163,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *log = root->log_root;
int err = 0;
int ret;
int nritems;
int ins_start_slot = 0;
int ins_nr;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &inode->extent_tree;
@ -4987,6 +5180,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
start = ALIGN_DOWN(start, fs_info->sectorsize);
end = ALIGN(end, fs_info->sectorsize);
min_key.objectid = ino;
min_key.type = BTRFS_INODE_ITEM_KEY;
min_key.offset = 0;
@ -5100,139 +5296,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
goto out_unlock;
}
while (1) {
ins_nr = 0;
ret = btrfs_search_forward(root, &min_key,
path, trans->transid);
if (ret < 0) {
err = ret;
goto out_unlock;
}
if (ret != 0)
break;
again:
/* note, ins_nr might be > 0 here, cleanup outside the loop */
if (min_key.objectid != ino)
break;
if (min_key.type > max_key.type)
break;
if (min_key.type == BTRFS_INODE_ITEM_KEY)
need_log_inode_item = false;
if ((min_key.type == BTRFS_INODE_REF_KEY ||
min_key.type == BTRFS_INODE_EXTREF_KEY) &&
inode->generation == trans->transid &&
!recursive_logging) {
u64 other_ino = 0;
u64 other_parent = 0;
ret = btrfs_check_ref_name_override(path->nodes[0],
path->slots[0], &min_key, inode,
&other_ino, &other_parent);
if (ret < 0) {
err = ret;
goto out_unlock;
} else if (ret > 0 && ctx &&
other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
if (ins_nr > 0) {
ins_nr++;
} else {
ins_nr = 1;
ins_start_slot = path->slots[0];
}
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot,
ins_nr, inode_only,
logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
ins_nr = 0;
err = log_conflicting_inodes(trans, root, path,
ctx, other_ino, other_parent);
if (err)
goto out_unlock;
btrfs_release_path(path);
goto next_key;
}
}
/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
if (ins_nr == 0)
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot,
ins_nr, inode_only, logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
ins_nr = 0;
goto next_slot;
}
if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
ins_nr++;
goto next_slot;
} else if (!ins_nr) {
ins_start_slot = path->slots[0];
ins_nr = 1;
goto next_slot;
}
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr, inode_only,
logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
ins_nr = 1;
ins_start_slot = path->slots[0];
next_slot:
nritems = btrfs_header_nritems(path->nodes[0]);
path->slots[0]++;
if (path->slots[0] < nritems) {
btrfs_item_key_to_cpu(path->nodes[0], &min_key,
path->slots[0]);
goto again;
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot,
ins_nr, inode_only, logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
ins_nr = 0;
}
btrfs_release_path(path);
next_key:
if (min_key.offset < (u64)-1) {
min_key.offset++;
} else if (min_key.type < max_key.type) {
min_key.type++;
min_key.offset = 0;
} else {
break;
}
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr, inode_only,
logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
ins_nr = 0;
}
err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize,
recursive_logging, inode_only,
start, end, ctx, &need_log_inode_item);
if (err)
goto out_unlock;
btrfs_release_path(path);
btrfs_release_path(dst_path);
@ -5243,7 +5312,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
err = btrfs_log_holes(trans, root, inode, path);
err = btrfs_log_holes(trans, root, inode, path, start, end);
if (err)
goto out_unlock;
}
@ -6145,7 +6214,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
log = btrfs_read_fs_root(log_root_tree, &found_key);
log = btrfs_read_tree_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
btrfs_handle_fs_error(fs_info, ret,
@ -6157,7 +6226,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
tmp_key.type = BTRFS_ROOT_ITEM_KEY;
tmp_key.offset = (u64)-1;
wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
wc.replay_dest = btrfs_get_fs_root(fs_info, &tmp_key, true);
if (IS_ERR(wc.replay_dest)) {
ret = PTR_ERR(wc.replay_dest);
@ -6173,12 +6242,10 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
* each subsequent pass.
*/
if (ret == -ENOENT)
ret = btrfs_pin_extent_for_log_replay(fs_info,
ret = btrfs_pin_extent_for_log_replay(trans,
log->node->start,
log->node->len);
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
kfree(log);
btrfs_put_root(log);
if (!ret)
goto next;
@ -6214,9 +6281,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
}
wc.replay_dest->log_root = NULL;
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
kfree(log);
btrfs_put_root(wc.replay_dest);
btrfs_put_root(log);
if (ret)
goto error;
@ -6247,10 +6313,9 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
if (ret)
return ret;
free_extent_buffer(log_root_tree->node);
log_root_tree->log_root = NULL;
clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
kfree(log_root_tree);
btrfs_put_root(log_root_tree);
return 0;
error:

View File

@ -246,9 +246,53 @@ static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type,
return ret;
}
int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
u64))
/*
* Check if there's an matching subvolume for given UUID
*
* Return:
* 0 check succeeded, the entry is not outdated
* > 0 if the check failed, the caller should remove the entry
* < 0 if an error occurred
*/
static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
u8 *uuid, u8 type, u64 subvolid)
{
struct btrfs_key key;
int ret = 0;
struct btrfs_root *subvol_root;
if (type != BTRFS_UUID_KEY_SUBVOL &&
type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
goto out;
key.objectid = subvolid;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
subvol_root = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(subvol_root)) {
ret = PTR_ERR(subvol_root);
if (ret == -ENOENT)
ret = 1;
goto out;
}
switch (type) {
case BTRFS_UUID_KEY_SUBVOL:
if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
ret = 1;
break;
case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
if (memcmp(uuid, subvol_root->root_item.received_uuid,
BTRFS_UUID_SIZE))
ret = 1;
break;
}
btrfs_put_root(subvol_root);
out:
return ret;
}
int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->uuid_root;
struct btrfs_key key;
@ -278,6 +322,10 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
}
while (1) {
if (btrfs_fs_closing(fs_info)) {
ret = -EINTR;
goto out;
}
cond_resched();
leaf = path->nodes[0];
slot = path->slots[0];
@ -305,7 +353,8 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
read_extent_buffer(leaf, &subid_le, offset,
sizeof(subid_le));
subid_cpu = le64_to_cpu(subid_le);
ret = check_func(fs_info, uuid, key.type, subid_cpu);
ret = btrfs_check_uuid_tree_entry(fs_info, uuid,
key.type, subid_cpu);
if (ret < 0)
goto out;
if (ret > 0) {

File diff suppressed because it is too large Load Diff

View File

@ -17,8 +17,6 @@ extern struct mutex uuid_mutex;
#define BTRFS_STRIPE_LEN SZ_64K
struct buffer_head;
struct btrfs_io_geometry {
/* remaining bytes before crossing a stripe */
u64 len;
@ -209,6 +207,10 @@ BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
enum btrfs_chunk_allocation_policy {
BTRFS_CHUNK_ALLOC_REGULAR,
};
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
u8 metadata_uuid[BTRFS_FSID_SIZE];
@ -260,6 +262,8 @@ struct btrfs_fs_devices {
struct kobject *devices_kobj;
struct kobject *devinfo_kobj;
struct completion kobj_unregister;
enum btrfs_chunk_allocation_policy chunk_alloc_policy;
};
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
@ -461,7 +465,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_uuid_scan_kthread(void *data);
int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
@ -474,7 +478,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev);
void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path);
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
@ -484,6 +487,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
void btrfs_release_disk_super(struct btrfs_super_block *super);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)

View File

@ -43,6 +43,16 @@ static inline void guid_copy(guid_t *dst, const guid_t *src)
memcpy(dst, src, sizeof(guid_t));
}
static inline void import_guid(guid_t *dst, const __u8 *src)
{
memcpy(dst, src, sizeof(guid_t));
}
static inline void export_guid(__u8 *dst, const guid_t *src)
{
memcpy(dst, src, sizeof(guid_t));
}
static inline bool guid_is_null(const guid_t *guid)
{
return guid_equal(guid, &guid_null);
@ -58,12 +68,23 @@ static inline void uuid_copy(uuid_t *dst, const uuid_t *src)
memcpy(dst, src, sizeof(uuid_t));
}
static inline void import_uuid(uuid_t *dst, const __u8 *src)
{
memcpy(dst, src, sizeof(uuid_t));
}
static inline void export_uuid(__u8 *dst, const uuid_t *src)
{
memcpy(dst, src, sizeof(uuid_t));
}
static inline bool uuid_is_null(const uuid_t *uuid)
{
return uuid_equal(uuid, &uuid_null);
}
void generate_random_uuid(unsigned char uuid[16]);
void generate_random_guid(unsigned char guid[16]);
extern void guid_gen(guid_t *u);
extern void uuid_gen(uuid_t *u);
@ -77,7 +98,6 @@ int guid_parse(const char *uuid, guid_t *u);
int uuid_parse(const char *uuid, uuid_t *u);
/* backwards compatibility, don't use in new code */
#define uuid_le_gen(u) guid_gen(u)
#define uuid_le_to_bin(guid, u) guid_parse(guid, u)
static inline int uuid_le_cmp(const guid_t u1, const guid_t u2)

View File

@ -81,13 +81,14 @@ TRACE_DEFINE_ENUM(COMMIT_TRANS);
#define show_extent_io_tree_owner(owner) \
__print_symbolic(owner, \
{ IO_TREE_FS_INFO_FREED_EXTENTS0, "FREED_EXTENTS0" }, \
{ IO_TREE_FS_INFO_FREED_EXTENTS1, "FREED_EXTENTS1" }, \
{ IO_TREE_FS_PINNED_EXTENTS, "PINNED_EXTENTS" }, \
{ IO_TREE_FS_EXCLUDED_EXTENTS, "EXCLUDED_EXTENTS" }, \
{ IO_TREE_INODE_IO, "INODE_IO" }, \
{ IO_TREE_INODE_IO_FAILURE, "INODE_IO_FAILURE" }, \
{ IO_TREE_RELOC_BLOCKS, "RELOC_BLOCKS" }, \
{ IO_TREE_TRANS_DIRTY_PAGES, "TRANS_DIRTY_PAGES" }, \
{ IO_TREE_ROOT_DIRTY_LOG_PAGES, "ROOT_DIRTY_LOG_PAGES" }, \
{ IO_TREE_INODE_FILE_EXTENT, "INODE_FILE_EXTENT" }, \
{ IO_TREE_SELFTEST, "SELFTEST" })
#define BTRFS_GROUP_FLAGS \
@ -468,7 +469,6 @@ DEFINE_EVENT(
{ (1 << BTRFS_ORDERED_PREALLOC), "PREALLOC" }, \
{ (1 << BTRFS_ORDERED_DIRECT), "DIRECT" }, \
{ (1 << BTRFS_ORDERED_IOERR), "IOERR" }, \
{ (1 << BTRFS_ORDERED_UPDATED_ISIZE), "UPDATED_ISIZE" }, \
{ (1 << BTRFS_ORDERED_TRUNCATED), "TRUNCATED" })

View File

@ -36,17 +36,24 @@ struct btrfs_ioctl_vol_args {
#define BTRFS_DEVICE_PATH_NAME_MAX 1024
#define BTRFS_SUBVOL_NAME_MAX 4039
#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
/*
* Deprecated since 5.7:
*
* BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
*/
#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
#define BTRFS_DEVICE_SPEC_BY_ID (1ULL << 3)
#define BTRFS_SUBVOL_SPEC_BY_ID (1ULL << 4)
#define BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED \
(BTRFS_SUBVOL_CREATE_ASYNC | \
BTRFS_SUBVOL_RDONLY | \
(BTRFS_SUBVOL_RDONLY | \
BTRFS_SUBVOL_QGROUP_INHERIT | \
BTRFS_DEVICE_SPEC_BY_ID)
BTRFS_DEVICE_SPEC_BY_ID | \
BTRFS_SUBVOL_SPEC_BY_ID)
#define BTRFS_FSID_SIZE 16
#define BTRFS_UUID_SIZE 16
@ -97,16 +104,29 @@ struct btrfs_ioctl_qgroup_limit_args {
};
/*
* flags for subvolumes
* Arguments for specification of subvolumes or devices, supporting by-name or
* by-id and flags
*
* Used by:
* struct btrfs_ioctl_vol_args_v2.flags
* The set of supported flags depends on the ioctl
*
* BTRFS_SUBVOL_RDONLY is also provided/consumed by the following ioctls:
* - BTRFS_IOC_SUBVOL_GETFLAGS
* - BTRFS_IOC_SUBVOL_SETFLAGS
*/
/* Supported flags for BTRFS_IOC_RM_DEV_V2 */
#define BTRFS_DEVICE_REMOVE_ARGS_MASK \
(BTRFS_DEVICE_SPEC_BY_ID)
/* Supported flags for BTRFS_IOC_SNAP_CREATE_V2 and BTRFS_IOC_SUBVOL_CREATE_V2 */
#define BTRFS_SUBVOL_CREATE_ARGS_MASK \
(BTRFS_SUBVOL_RDONLY | \
BTRFS_SUBVOL_QGROUP_INHERIT)
/* Supported flags for BTRFS_IOC_SNAP_DESTROY_V2 */
#define BTRFS_SUBVOL_DELETE_ARGS_MASK \
(BTRFS_SUBVOL_SPEC_BY_ID)
struct btrfs_ioctl_vol_args_v2 {
__s64 fd;
__u64 transid;
@ -121,6 +141,7 @@ struct btrfs_ioctl_vol_args_v2 {
union {
char name[BTRFS_SUBVOL_NAME_MAX + 1];
__u64 devid;
__u64 subvolid;
};
};
@ -949,5 +970,7 @@ enum btrfs_err_code {
struct btrfs_ioctl_get_subvol_rootref_args)
#define BTRFS_IOC_INO_LOOKUP_USER _IOWR(BTRFS_IOCTL_MAGIC, 62, \
struct btrfs_ioctl_ino_lookup_user_args)
#define BTRFS_IOC_SNAP_DESTROY_V2 _IOW(BTRFS_IOCTL_MAGIC, 63, \
struct btrfs_ioctl_vol_args_v2)
#endif /* _UAPI_LINUX_BTRFS_H */

View File

@ -40,6 +40,16 @@ void generate_random_uuid(unsigned char uuid[16])
}
EXPORT_SYMBOL(generate_random_uuid);
void generate_random_guid(unsigned char guid[16])
{
get_random_bytes(guid, 16);
/* Set GUID version to 4 --- truly random generation */
guid[7] = (guid[7] & 0x0F) | 0x40;
/* Set the GUID variant to DCE */
guid[8] = (guid[8] & 0x3F) | 0x80;
}
EXPORT_SYMBOL(generate_random_guid);
static void __uuid_gen_common(__u8 b[16])
{
prandom_bytes(b, 16);