From b7a0365ec7a0fb1d39113846fd34038af68ebd01 Mon Sep 17 00:00:00 2001
From: Daniel Dressler <danieru.dressler@gmail.com>
Date: Wed, 12 Nov 2014 13:43:09 +0900
Subject: [PATCH 01/93] Btrfs: ctree: reduce args where only fs_info used

This patch is part of a larger project to cleanup btrfs's internal usage
of struct btrfs_root. Many functions take btrfs_root only to grab a
pointer to fs_info.

This causes programmers to ponder which root can be passed. Since only
the fs_info is read affected functions can accept any root, except this
is only obvious upon inspection.

This patch reduces the specificty of such functions to accept the
fs_info directly.

This patch does not address the two functions in ctree.c (insert_ptr,
and split_item) which only use root for BUG_ONs in ctree.c

This patch affects the following functions:
  1) fixup_low_keys
  2) btrfs_set_item_key_safe

Signed-off-by: Daniel Dressler <danieru.dressler@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/ctree.c     | 27 +++++++++++++++------------
 fs/btrfs/ctree.h     |  3 ++-
 fs/btrfs/file-item.c |  2 +-
 fs/btrfs/file.c      |  8 ++++----
 4 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 993642199326..8c03c9222154 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3126,7 +3126,8 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
  * higher levels
  *
  */
-static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
+static void fixup_low_keys(struct btrfs_fs_info *fs_info,
+			   struct btrfs_path *path,
 			   struct btrfs_disk_key *key, int level)
 {
 	int i;
@@ -3137,7 +3138,7 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
 		if (!path->nodes[i])
 			break;
 		t = path->nodes[i];
-		tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
+		tree_mod_log_set_node_key(fs_info, t, tslot, 1);
 		btrfs_set_node_key(t, key, tslot);
 		btrfs_mark_buffer_dirty(path->nodes[i]);
 		if (tslot != 0)
@@ -3151,7 +3152,8 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
  * This function isn't completely safe. It's the caller's responsibility
  * that the new key won't break the order
  */
-void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+			     struct btrfs_path *path,
 			     struct btrfs_key *new_key)
 {
 	struct btrfs_disk_key disk_key;
@@ -3173,7 +3175,7 @@ void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
 	btrfs_set_item_key(eb, &disk_key, slot);
 	btrfs_mark_buffer_dirty(eb);
 	if (slot == 0)
-		fixup_low_keys(root, path, &disk_key, 1);
+		fixup_low_keys(fs_info, path, &disk_key, 1);
 }
 
 /*
@@ -3931,7 +3933,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 		clean_tree_block(trans, root, right);
 
 	btrfs_item_key(right, &disk_key, 0);
-	fixup_low_keys(root, path, &disk_key, 1);
+	fixup_low_keys(root->fs_info, path, &disk_key, 1);
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
@@ -4168,6 +4170,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int mid;
 	int slot;
 	struct extent_buffer *right;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret = 0;
 	int wret;
 	int split;
@@ -4271,10 +4274,10 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(right, root->root_key.objectid);
 	btrfs_set_header_level(right, 0);
-	write_extent_buffer(right, root->fs_info->fsid,
+	write_extent_buffer(right, fs_info->fsid,
 			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
 
-	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
+	write_extent_buffer(right, fs_info->chunk_tree_uuid,
 			    btrfs_header_chunk_tree_uuid(right),
 			    BTRFS_UUID_SIZE);
 
@@ -4297,7 +4300,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 			path->nodes[0] = right;
 			path->slots[0] = 0;
 			if (path->slots[1] == 0)
-				fixup_low_keys(root, path, &disk_key, 1);
+				fixup_low_keys(fs_info, path, &disk_key, 1);
 		}
 		btrfs_mark_buffer_dirty(right);
 		return ret;
@@ -4615,7 +4618,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
 		btrfs_set_item_key(leaf, &disk_key, slot);
 		if (slot == 0)
-			fixup_low_keys(root, path, &disk_key, 1);
+			fixup_low_keys(root->fs_info, path, &disk_key, 1);
 	}
 
 	item = btrfs_item_nr(slot);
@@ -4716,7 +4719,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 
 	if (path->slots[0] == 0) {
 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
-		fixup_low_keys(root, path, &disk_key, 1);
+		fixup_low_keys(root->fs_info, path, &disk_key, 1);
 	}
 	btrfs_unlock_up_safe(path, 1);
 
@@ -4888,7 +4891,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
 		struct btrfs_disk_key disk_key;
 
 		btrfs_node_key(parent, &disk_key, 0);
-		fixup_low_keys(root, path, &disk_key, level + 1);
+		fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
 	}
 	btrfs_mark_buffer_dirty(parent);
 }
@@ -4990,7 +4993,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			struct btrfs_disk_key disk_key;
 
 			btrfs_item_key(leaf, &disk_key, 0);
-			fixup_low_keys(root, path, &disk_key, 1);
+			fixup_low_keys(root->fs_info, path, &disk_key, 1);
 		}
 
 		/* delete the leaf if it is mostly empty */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b3dd55f52f71..216056c37940 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3483,7 +3483,8 @@ int btrfs_previous_item(struct btrfs_root *root,
 			int type);
 int btrfs_previous_extent_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid);
-void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+			     struct btrfs_path *path,
 			     struct btrfs_key *new_key);
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 84a2d1868271..fc003321bdd4 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -553,7 +553,7 @@ static noinline void truncate_one_csum(struct btrfs_root *root,
 		btrfs_truncate_item(root, path, new_size, 0);
 
 		key->offset = end_byte;
-		btrfs_set_item_key_safe(root, path, key);
+		btrfs_set_item_key_safe(root->fs_info, path, key);
 	} else {
 		BUG();
 	}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4090259569b..6b796f03de10 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -868,7 +868,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 
 			memcpy(&new_key, &key, sizeof(new_key));
 			new_key.offset = end;
-			btrfs_set_item_key_safe(root, path, &new_key);
+			btrfs_set_item_key_safe(root->fs_info, path, &new_key);
 
 			extent_offset += end - key.offset;
 			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
@@ -1126,7 +1126,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 				     ino, bytenr, orig_offset,
 				     &other_start, &other_end)) {
 			new_key.offset = end;
-			btrfs_set_item_key_safe(root, path, &new_key);
+			btrfs_set_item_key_safe(root->fs_info, path, &new_key);
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
 			btrfs_set_file_extent_generation(leaf, fi,
@@ -1160,7 +1160,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 							 trans->transid);
 			path->slots[0]++;
 			new_key.offset = start;
-			btrfs_set_item_key_safe(root, path, &new_key);
+			btrfs_set_item_key_safe(root->fs_info, path, &new_key);
 
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
@@ -2169,7 +2169,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
 		u64 num_bytes;
 
 		key.offset = offset;
-		btrfs_set_item_key_safe(root, path, &key);
+		btrfs_set_item_key_safe(root->fs_info, path, &key);
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -

From a585e94895ed8ab2930bfd5186ffd20301b5c703 Mon Sep 17 00:00:00 2001
From: Daniel Dressler <danieru.dressler@gmail.com>
Date: Mon, 17 Nov 2014 22:05:02 +0900
Subject: [PATCH 02/93] Btrfs: delayed-inode: replace root args iff only
 fs_info used

This is the second independent patch of a larger project to cleanup
btrfs's internal usage of btrfs_root. Many functions take btrfs_root
only to grab the fs_info struct.

By requiring a root these functions cause programmer overhead. That
these functions can accept any valid root is not obvious until
inspection.

This patch reduces the specificity of such functions to accept the
fs_info directly.

These patches can be applied independently and thus are not being
submitted as a patch series. There should be about 26 patches by the
project's completion. Each patch will cleanup between 1 and 34 functions
apiece.  Each patch covers a single file's functions.

This patch affects the following function(s):
  1) btrfs_wq_run_delayed_node

Signed-off-by: Daniel Dressler <danieru.dressler@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/delayed-inode.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 82f0c7c95474..cde698a07d21 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1383,7 +1383,7 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
 
 
 static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
-				     struct btrfs_root *root, int nr)
+				     struct btrfs_fs_info *fs_info, int nr)
 {
 	struct btrfs_async_delayed_work *async_work;
 
@@ -1399,7 +1399,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
 			btrfs_async_run_delayed_root, NULL, NULL);
 	async_work->nr = nr;
 
-	btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
+	btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
 	return 0;
 }
 
@@ -1426,6 +1426,7 @@ static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
 void btrfs_balance_delayed_items(struct btrfs_root *root)
 {
 	struct btrfs_delayed_root *delayed_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 
 	delayed_root = btrfs_get_delayed_root(root);
 
@@ -1438,7 +1439,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root)
 
 		seq = atomic_read(&delayed_root->items_seq);
 
-		ret = btrfs_wq_run_delayed_node(delayed_root, root, 0);
+		ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0);
 		if (ret)
 			return;
 
@@ -1447,7 +1448,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root)
 		return;
 	}
 
-	btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH);
+	btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
 }
 
 /* Will return 0 or -ENOMEM */

From 01d58472a887cba61da7b4e6dc251b5170b57e2e Mon Sep 17 00:00:00 2001
From: Daniel Dressler <danieru.dressler@gmail.com>
Date: Fri, 21 Nov 2014 17:15:07 +0900
Subject: [PATCH 03/93] Btrfs: disk-io: replace root args iff only fs_info used

This is the 3rd independent patch of a larger project to cleanup btrfs's
internal usage of btrfs_root. Many functions take btrfs_root only to
grab the fs_info struct.

By requiring a root these functions cause programmer overhead. That
these functions can accept any valid root is not obvious until
inspection.

This patch reduces the specificity of such functions to accept the
fs_info directly.

These patches can be applied independently and thus are not being
submitted as a patch series. There should be about 26 patches by the
project's completion. Each patch will cleanup between 1 and 34 functions
apiece.  Each patch covers a single file's functions.

This patch affects the following function(s):
  1) csum_tree_block
  2) csum_dirty_buffer
  3) check_tree_block_fsid
  4) btrfs_find_tree_block
  5) clean_tree_block

Signed-off-by: Daniel Dressler <danieru.dressler@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/ctree.c       | 26 +++++++++++++-------------
 fs/btrfs/disk-io.c     | 32 ++++++++++++++++----------------
 fs/btrfs/disk-io.h     |  4 ++--
 fs/btrfs/extent-tree.c |  6 +++---
 fs/btrfs/qgroup.c      |  2 +-
 fs/btrfs/tree-log.c    |  8 +++++---
 6 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8c03c9222154..e1a0981159ab 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1073,7 +1073,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 			ret = btrfs_dec_ref(trans, root, buf, 1);
 			BUG_ON(ret); /* -ENOMEM */
 		}
-		clean_tree_block(trans, root, buf);
+		clean_tree_block(trans, root->fs_info, buf);
 		*last_ref = 1;
 	}
 	return 0;
@@ -1678,7 +1678,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			continue;
 		}
 
-		cur = btrfs_find_tree_block(root, blocknr);
+		cur = btrfs_find_tree_block(root->fs_info, blocknr);
 		if (cur)
 			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
 		else
@@ -1943,7 +1943,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 		path->locks[level] = 0;
 		path->nodes[level] = NULL;
-		clean_tree_block(trans, root, mid);
+		clean_tree_block(trans, root->fs_info, mid);
 		btrfs_tree_unlock(mid);
 		/* once for the path */
 		free_extent_buffer(mid);
@@ -1997,7 +1997,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		if (wret < 0 && wret != -ENOSPC)
 			ret = wret;
 		if (btrfs_header_nritems(right) == 0) {
-			clean_tree_block(trans, root, right);
+			clean_tree_block(trans, root->fs_info, right);
 			btrfs_tree_unlock(right);
 			del_ptr(root, path, level + 1, pslot + 1);
 			root_sub_used(root, right->len);
@@ -2041,7 +2041,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		BUG_ON(wret == 1);
 	}
 	if (btrfs_header_nritems(mid) == 0) {
-		clean_tree_block(trans, root, mid);
+		clean_tree_block(trans, root->fs_info, mid);
 		btrfs_tree_unlock(mid);
 		del_ptr(root, path, level + 1, pslot);
 		root_sub_used(root, mid->len);
@@ -2259,7 +2259,7 @@ static void reada_for_search(struct btrfs_root *root,
 
 	search = btrfs_node_blockptr(node, slot);
 	blocksize = root->nodesize;
-	eb = btrfs_find_tree_block(root, search);
+	eb = btrfs_find_tree_block(root->fs_info, search);
 	if (eb) {
 		free_extent_buffer(eb);
 		return;
@@ -2319,7 +2319,7 @@ static noinline void reada_for_balance(struct btrfs_root *root,
 	if (slot > 0) {
 		block1 = btrfs_node_blockptr(parent, slot - 1);
 		gen = btrfs_node_ptr_generation(parent, slot - 1);
-		eb = btrfs_find_tree_block(root, block1);
+		eb = btrfs_find_tree_block(root->fs_info, block1);
 		/*
 		 * if we get -eagain from btrfs_buffer_uptodate, we
 		 * don't want to return eagain here.  That will loop
@@ -2332,7 +2332,7 @@ static noinline void reada_for_balance(struct btrfs_root *root,
 	if (slot + 1 < nritems) {
 		block2 = btrfs_node_blockptr(parent, slot + 1);
 		gen = btrfs_node_ptr_generation(parent, slot + 1);
-		eb = btrfs_find_tree_block(root, block2);
+		eb = btrfs_find_tree_block(root->fs_info, block2);
 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
 			block2 = 0;
 		free_extent_buffer(eb);
@@ -2450,7 +2450,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	blocknr = btrfs_node_blockptr(b, slot);
 	gen = btrfs_node_ptr_generation(b, slot);
 
-	tmp = btrfs_find_tree_block(root, blocknr);
+	tmp = btrfs_find_tree_block(root->fs_info, blocknr);
 	if (tmp) {
 		/* first we do an atomic uptodate check */
 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
@@ -3694,7 +3694,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	if (left_nritems)
 		btrfs_mark_buffer_dirty(left);
 	else
-		clean_tree_block(trans, root, left);
+		clean_tree_block(trans, root->fs_info, left);
 
 	btrfs_mark_buffer_dirty(right);
 
@@ -3706,7 +3706,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	if (path->slots[0] >= left_nritems) {
 		path->slots[0] -= left_nritems;
 		if (btrfs_header_nritems(path->nodes[0]) == 0)
-			clean_tree_block(trans, root, path->nodes[0]);
+			clean_tree_block(trans, root->fs_info, path->nodes[0]);
 		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
 		path->nodes[0] = right;
@@ -3930,7 +3930,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	if (right_nritems)
 		btrfs_mark_buffer_dirty(right);
 	else
-		clean_tree_block(trans, root, right);
+		clean_tree_block(trans, root->fs_info, right);
 
 	btrfs_item_key(right, &disk_key, 0);
 	fixup_low_keys(root->fs_info, path, &disk_key, 1);
@@ -4984,7 +4984,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			btrfs_set_header_level(leaf, 0);
 		} else {
 			btrfs_set_path_blocking(path);
-			clean_tree_block(trans, root, leaf);
+			clean_tree_block(trans, root->fs_info, leaf);
 			btrfs_del_leaf(trans, root, path, leaf);
 		}
 	} else {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 41b320e235d7..ed5dcfe52f91 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -274,10 +274,11 @@ void btrfs_csum_final(u32 crc, char *result)
  * compute the csum for a btree block, and either verify it or write it
  * into the csum field of the block.
  */
-static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
+static int csum_tree_block(struct btrfs_fs_info *fs_info,
+			   struct extent_buffer *buf,
 			   int verify)
 {
-	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
 	char *result = NULL;
 	unsigned long len;
 	unsigned long cur_len;
@@ -321,7 +322,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			printk_ratelimited(KERN_WARNING
 				"BTRFS: %s checksum verify failed on %llu wanted %X found %X "
 				"level %d\n",
-				root->fs_info->sb->s_id, buf->start,
+				fs_info->sb->s_id, buf->start,
 				val, found, btrfs_header_level(buf));
 			if (result != (char *)&inline_result)
 				kfree(result);
@@ -501,7 +502,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
  * we only fill in the checksum field in the first page of a multi-page block
  */
 
-static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
+static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
 {
 	u64 start = page_offset(page);
 	u64 found_start;
@@ -513,14 +514,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	found_start = btrfs_header_bytenr(eb);
 	if (WARN_ON(found_start != start || !PageUptodate(page)))
 		return 0;
-	csum_tree_block(root, eb, 0);
+	csum_tree_block(fs_info, eb, 0);
 	return 0;
 }
 
-static int check_tree_block_fsid(struct btrfs_root *root,
+static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
 				 struct extent_buffer *eb)
 {
-	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	u8 fsid[BTRFS_UUID_SIZE];
 	int ret = 1;
 
@@ -640,7 +641,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 		ret = -EIO;
 		goto err;
 	}
-	if (check_tree_block_fsid(root, eb)) {
+	if (check_tree_block_fsid(root->fs_info, eb)) {
 		printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
 			       eb->fs_info->sb->s_id, eb->start);
 		ret = -EIO;
@@ -657,7 +658,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
 				       eb, found_level);
 
-	ret = csum_tree_block(root, eb, 1);
+	ret = csum_tree_block(root->fs_info, eb, 1);
 	if (ret) {
 		ret = -EIO;
 		goto err;
@@ -882,7 +883,7 @@ static int btree_csum_one_bio(struct bio *bio)
 
 	bio_for_each_segment_all(bvec, bio, i) {
 		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
-		ret = csum_dirty_buffer(root, bvec->bv_page);
+		ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
 		if (ret)
 			break;
 	}
@@ -1119,10 +1120,10 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
 	return 0;
 }
 
-struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
 					    u64 bytenr)
 {
-	return find_extent_buffer(root->fs_info, bytenr);
+	return find_extent_buffer(fs_info, bytenr);
 }
 
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
@@ -1165,11 +1166,10 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 
 }
 
-void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+void clean_tree_block(struct btrfs_trans_handle *trans,
+		      struct btrfs_fs_info *fs_info,
 		      struct extent_buffer *buf)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
-
 	if (btrfs_header_generation(buf) ==
 	    fs_info->running_transaction->transid) {
 		btrfs_assert_tree_locked(buf);
@@ -4136,7 +4136,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 
 		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
 		while (start <= end) {
-			eb = btrfs_find_tree_block(root, start);
+			eb = btrfs_find_tree_block(root->fs_info, start);
 			start += root->nodesize;
 			if (!eb)
 				continue;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 27d44c0fd236..d4cbfeeeedd4 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,7 +52,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 						   u64 bytenr);
 void clean_tree_block(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root, struct extent_buffer *buf);
+		      struct btrfs_fs_info *fs_info, struct extent_buffer *buf);
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options);
@@ -61,7 +61,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
-struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
 					    u64 bytenr);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
 				      struct btrfs_key *location);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 28ce5c8004d4..44f7d490a76f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7168,7 +7168,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	btrfs_set_header_generation(buf, trans->transid);
 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
 	btrfs_tree_lock(buf);
-	clean_tree_block(trans, root, buf);
+	clean_tree_block(trans, root->fs_info, buf);
 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
 	btrfs_set_lock_blocking(buf);
@@ -7766,7 +7766,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
 	blocksize = root->nodesize;
 
-	next = btrfs_find_tree_block(root, bytenr);
+	next = btrfs_find_tree_block(root->fs_info, bytenr);
 	if (!next) {
 		next = btrfs_find_create_tree_block(root, bytenr);
 		if (!next)
@@ -7967,7 +7967,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 			btrfs_set_lock_blocking(eb);
 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 		}
-		clean_tree_block(trans, root, eb);
+		clean_tree_block(trans, root->fs_info, eb);
 	}
 
 	if (eb == root->node) {
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 97159a8e91d4..50ee58614f72 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -982,7 +982,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 	list_del(&quota_root->dirty_list);
 
 	btrfs_tree_lock(quota_root->node);
-	clean_tree_block(trans, tree_root, quota_root->node);
+	clean_tree_block(trans, tree_root->fs_info, quota_root->node);
 	btrfs_tree_unlock(quota_root->node);
 	btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f96996a1b70c..4f59140c8c00 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2230,7 +2230,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 				if (trans) {
 					btrfs_tree_lock(next);
 					btrfs_set_lock_blocking(next);
-					clean_tree_block(trans, root, next);
+					clean_tree_block(trans, root->fs_info,
+							next);
 					btrfs_wait_tree_block_writeback(next);
 					btrfs_tree_unlock(next);
 				}
@@ -2308,7 +2309,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				if (trans) {
 					btrfs_tree_lock(next);
 					btrfs_set_lock_blocking(next);
-					clean_tree_block(trans, root, next);
+					clean_tree_block(trans, root->fs_info,
+							next);
 					btrfs_wait_tree_block_writeback(next);
 					btrfs_tree_unlock(next);
 				}
@@ -2384,7 +2386,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 			if (trans) {
 				btrfs_tree_lock(next);
 				btrfs_set_lock_blocking(next);
-				clean_tree_block(trans, log, next);
+				clean_tree_block(trans, log->fs_info, next);
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 			}

From f749303bda20392e1d4a10adda84524365afd0e6 Mon Sep 17 00:00:00 2001
From: Wang Shilong <wangshilong1991@gmail.com>
Date: Sat, 22 Nov 2014 21:13:10 +0800
Subject: [PATCH 04/93] Btrfs: switch to kvfree() helper

A new helper kvfree() in mm/utils.c will do this.

Signed-off-by: Wang Shilong <wangshilong1991@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/check-integrity.c |  5 +----
 fs/btrfs/raid56.c          | 13 +++----------
 2 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index d897ef803b3b..77ba1634c9ab 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -3241,8 +3241,5 @@ void btrfsic_unmount(struct btrfs_root *root,
 
 	mutex_unlock(&btrfsic_mutex);
 
-	if (is_vmalloc_addr(state))
-		vfree(state);
-	else
-		kfree(state);
+	kvfree(state);
 }
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 5264858ed768..1ccc40bb61f5 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -237,12 +237,8 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 	}
 
 	x = cmpxchg(&info->stripe_hash_table, NULL, table);
-	if (x) {
-		if (is_vmalloc_addr(x))
-			vfree(x);
-		else
-			kfree(x);
-	}
+	if (x)
+		kvfree(x);
 	return 0;
 }
 
@@ -453,10 +449,7 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 	if (!info->stripe_hash_table)
 		return;
 	btrfs_clear_rbio_cache(info);
-	if (is_vmalloc_addr(info->stripe_hash_table))
-		vfree(info->stripe_hash_table);
-	else
-		kfree(info->stripe_hash_table);
+	kvfree(info->stripe_hash_table);
 	info->stripe_hash_table = NULL;
 }
 

From e8c9f18603f7ce2beca233401e228de730f121fa Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 2 Jan 2015 18:23:10 +0100
Subject: [PATCH 05/93] btrfs: constify structs with op functions or static
 definitions

There are some op tables that can be easily made const, similarly the
sysfs feature and raid tables. This is motivated by PaX CONSTIFY plugin.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/compression.c | 2 +-
 fs/btrfs/compression.h | 4 ++--
 fs/btrfs/disk-io.c     | 4 ++--
 fs/btrfs/extent_io.h   | 2 +-
 fs/btrfs/lzo.c         | 2 +-
 fs/btrfs/props.c       | 2 ++
 fs/btrfs/sysfs.c       | 2 +-
 fs/btrfs/transaction.c | 2 +-
 fs/btrfs/volumes.c     | 2 +-
 fs/btrfs/zlib.c        | 2 +-
 10 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e9df8862012c..e7a94f8cdfd7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -750,7 +750,7 @@ static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
 static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
 static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
 
-static struct btrfs_compress_op *btrfs_compress_op[] = {
+static const struct btrfs_compress_op * const btrfs_compress_op[] = {
 	&btrfs_zlib_compress,
 	&btrfs_lzo_compress,
 };
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index d181f70caae0..13a4dc0436c9 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -77,7 +77,7 @@ struct btrfs_compress_op {
 			  size_t srclen, size_t destlen);
 };
 
-extern struct btrfs_compress_op btrfs_zlib_compress;
-extern struct btrfs_compress_op btrfs_lzo_compress;
+extern const struct btrfs_compress_op btrfs_zlib_compress;
+extern const struct btrfs_compress_op btrfs_lzo_compress;
 
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ed5dcfe52f91..83671f847c7c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -54,7 +54,7 @@
 #include <asm/cpufeature.h>
 #endif
 
-static struct extent_io_ops btree_extent_io_ops;
+static const struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
@@ -4287,7 +4287,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
 	return 0;
 }
 
-static struct extent_io_ops btree_extent_io_ops = {
+static const struct extent_io_ops btree_extent_io_ops = {
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
 	.readpage_io_failed_hook = btree_io_failed_hook,
 	.submit_bio_hook = btree_submit_bio_hook,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 695b0ccfb755..c668f36898d3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -97,7 +97,7 @@ struct extent_io_tree {
 	u64 dirty_bytes;
 	int track_uptodate;
 	spinlock_t lock;
-	struct extent_io_ops *ops;
+	const struct extent_io_ops *ops;
 };
 
 struct extent_state {
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 617553cdb7d3..a2f051347731 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -434,7 +434,7 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
 	return ret;
 }
 
-struct btrfs_compress_op btrfs_lzo_compress = {
+const struct btrfs_compress_op btrfs_lzo_compress = {
 	.alloc_workspace	= lzo_alloc_workspace,
 	.free_workspace		= lzo_free_workspace,
 	.compress_pages		= lzo_compress_pages,
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 129b1dd28527..dca137b04095 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -425,3 +425,5 @@ static const char *prop_compression_extract(struct inode *inode)
 
 	return NULL;
 }
+
+
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 94edb0a2a026..e8a4c86d274d 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -459,7 +459,7 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
 static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13];
 static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS];
 
-static u64 supported_feature_masks[3] = {
+static const u64 supported_feature_masks[3] = {
 	[FEAT_COMPAT]    = BTRFS_FEATURE_COMPAT_SUPP,
 	[FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
 	[FEAT_INCOMPAT]  = BTRFS_FEATURE_INCOMPAT_SUPP,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 038fcf6051e0..dde9d285308e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -35,7 +35,7 @@
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
-static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
 	[TRANS_STATE_RUNNING]		= 0U,
 	[TRANS_STATE_BLOCKED]		= (__TRANS_USERSPACE |
 					   __TRANS_START),
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cd4d1315aaa9..33ed9df08a9c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4131,7 +4131,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
 	return 0;
 }
 
-static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 	[BTRFS_RAID_RAID10] = {
 		.sub_stripes	= 2,
 		.dev_stripes	= 1,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index fb22fd8d8fb8..82990b8f872b 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -403,7 +403,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
 	return ret;
 }
 
-struct btrfs_compress_op btrfs_zlib_compress = {
+const struct btrfs_compress_op btrfs_zlib_compress = {
 	.alloc_workspace	= zlib_alloc_workspace,
 	.free_workspace		= zlib_free_workspace,
 	.compress_pages		= zlib_compress_pages,

From b9fd47cde5dd9725ca37769861b7424f40828492 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Mon, 9 Feb 2015 14:40:20 +0800
Subject: [PATCH 06/93] btrfs: cleanup: remove no-used alloc_chunk in
 btrfs_check_data_free_space()

int alloc_chunk is never used in this function, remove it.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/extent-tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 44f7d490a76f..262e16fe27d8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3599,7 +3599,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 used;
-	int ret = 0, committed = 0, alloc_chunk = 1;
+	int ret = 0, committed = 0;
 
 	/* make sure bytes are sectorsize aligned */
 	bytes = ALIGN(bytes, root->sectorsize);
@@ -3627,7 +3627,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 		 * if we don't have enough free bytes in this space then we need
 		 * to alloc a new chunk.
 		 */
-		if (!data_sinfo->full && alloc_chunk) {
+		if (!data_sinfo->full) {
 			u64 alloc_target;
 
 			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;

From a688a04aab72a775e07fe460f563b46f7791cfc1 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Mon, 9 Feb 2015 20:31:44 +0800
Subject: [PATCH 07/93] btrfs: remove unused chunk_tree argument in several
 functions

There functions include unused chunk_tree argument from the begining,
it is time to remove them and clean up relative code to prepare value
of this argument in caller.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/volumes.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 33ed9df08a9c..30c6074f3d00 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2487,8 +2487,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
 }
 
 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    u64 chunk_tree, u64 chunk_objectid,
+			    struct btrfs_root *root, u64 chunk_objectid,
 			    u64 chunk_offset)
 {
 	int ret;
@@ -2580,7 +2579,6 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 	struct map_lookup *map;
 	u64 dev_extent_len = 0;
 	u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
-	u64 chunk_tree = root->fs_info->chunk_root->objectid;
 	int i, ret = 0;
 
 	/* Just in case */
@@ -2634,8 +2632,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 			}
 		}
 	}
-	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
-			       chunk_offset);
+	ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
 		goto out;
@@ -2664,8 +2661,8 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 }
 
 static int btrfs_relocate_chunk(struct btrfs_root *root,
-			 u64 chunk_tree, u64 chunk_objectid,
-			 u64 chunk_offset)
+				u64 chunk_objectid,
+				u64 chunk_offset)
 {
 	struct btrfs_root *extent_root;
 	struct btrfs_trans_handle *trans;
@@ -2707,7 +2704,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
 	struct btrfs_chunk *chunk;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	u64 chunk_tree = chunk_root->root_key.objectid;
 	u64 chunk_type;
 	bool retried = false;
 	int failed = 0;
@@ -2744,7 +2740,7 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
 		btrfs_release_path(path);
 
 		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
-			ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
+			ret = btrfs_relocate_chunk(chunk_root,
 						   found_key.objectid,
 						   found_key.offset);
 			if (ret == -ENOSPC)
@@ -3255,7 +3251,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 		}
 
 		ret = btrfs_relocate_chunk(chunk_root,
-					   chunk_root->root_key.objectid,
 					   found_key.objectid,
 					   found_key.offset);
 		if (ret && ret != -ENOSPC)
@@ -3957,7 +3952,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	struct btrfs_dev_extent *dev_extent = NULL;
 	struct btrfs_path *path;
 	u64 length;
-	u64 chunk_tree;
 	u64 chunk_objectid;
 	u64 chunk_offset;
 	int ret;
@@ -4027,13 +4021,11 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 			break;
 		}
 
-		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
 		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
 		btrfs_release_path(path);
 
-		ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
-					   chunk_offset);
+		ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
 		if (ret && ret != -ENOSPC)
 			goto done;
 		if (ret == -ENOSPC)

From 08da757d3f29e9d33480ae1c44b557a2514d70c5 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Thu, 12 Feb 2015 15:42:16 +0800
Subject: [PATCH 08/93] btrfs: cleanup: use for() loop in btrfs_map_bio()

for() is obviously better in these code block, and remove noused
init-value to reduce about 6 bytes binary size.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/volumes.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 30c6074f3d00..d7d1dde5f2ff 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5820,8 +5820,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	u64 length = 0;
 	u64 map_length;
 	int ret;
-	int dev_nr = 0;
-	int total_devs = 1;
+	int dev_nr;
+	int total_devs;
 	struct btrfs_bio *bbio = NULL;
 
 	length = bio->bi_iter.bi_size;
@@ -5862,11 +5862,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		BUG();
 	}
 
-	while (dev_nr < total_devs) {
+	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
 		dev = bbio->stripes[dev_nr].dev;
 		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
 			bbio_error(bbio, first_bio, logical);
-			dev_nr++;
 			continue;
 		}
 
@@ -5879,7 +5878,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 			ret = breakup_stripe_bio(root, bbio, first_bio, dev,
 						 dev_nr, rw, async_submit);
 			BUG_ON(ret);
-			dev_nr++;
 			continue;
 		}
 
@@ -5894,7 +5892,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		submit_stripe_bio(root, bbio, bio,
 				  bbio->stripes[dev_nr].physical, dev_nr, rw,
 				  async_submit);
-		dev_nr++;
 	}
 	btrfs_bio_counter_dec(root->fs_info);
 	return 0;

From 41d6b13ea7b459cd51d4446c9b264128a52db993 Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Sat, 14 Feb 2015 13:10:22 +0100
Subject: [PATCH 09/93] btrfs: fix sizeof format specifier in
 btrfs_check_super_valid()

This patch fixes mips compilation warning:

fs/btrfs/disk-io.c: In function 'btrfs_check_super_valid':
fs/btrfs/disk-io.c:3927:21: warning: format '%lu' expects argument
of type 'long unsigned int', but argument 3 has type 'unsigned int' [-Wformat]

Signed-off-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/disk-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 83671f847c7c..6911aa4caa0d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3923,7 +3923,7 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	}
 	if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
 			+ sizeof(struct btrfs_chunk)) {
-		printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n",
+		printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n",
 				btrfs_super_sys_array_size(sb),
 				sizeof(struct btrfs_disk_key)
 				+ sizeof(struct btrfs_chunk));

From 9eaed21ef94b5da30eaf1b2503df1d41c2025175 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 1 Aug 2014 18:12:35 -0500
Subject: [PATCH 10/93] btrfs: remove unused fs_info arg from
 btrfs_close_extra_devices()

The commit:
8dabb74 Btrfs: change core code of btrfs to support the
        device replace operations
added the fs_info argument, but never used it -
just remove it again.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/disk-io.c | 4 ++--
 fs/btrfs/volumes.c | 3 +--
 fs/btrfs/volumes.h | 3 +--
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6911aa4caa0d..d91f08994d3c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2690,7 +2690,7 @@ int open_ctree(struct super_block *sb,
 	 * keep the device that is marked to be the target device for the
 	 * dev_replace procedure
 	 */
-	btrfs_close_extra_devices(fs_info, fs_devices, 0);
+	btrfs_close_extra_devices(fs_devices, 0);
 
 	if (!fs_devices->latest_bdev) {
 		printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
@@ -2794,7 +2794,7 @@ int open_ctree(struct super_block *sb,
 		goto fail_block_groups;
 	}
 
-	btrfs_close_extra_devices(fs_info, fs_devices, 1);
+	btrfs_close_extra_devices(fs_devices, 1);
 
 	ret = btrfs_sysfs_add_one(fs_info);
 	if (ret) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d7d1dde5f2ff..6d4fb246eb7e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -609,8 +609,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 	return ERR_PTR(-ENOMEM);
 }
 
-void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
-			       struct btrfs_fs_devices *fs_devices, int step)
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
 {
 	struct btrfs_device *device, *next;
 	struct btrfs_device *latest_dev = NULL;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 83069dec6898..ebc31331a837 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -421,8 +421,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 			  struct btrfs_fs_devices **fs_devices_ret);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
-			       struct btrfs_fs_devices *fs_devices, int step);
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step);
 int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
 					 char *device_path,
 					 struct btrfs_device **device);

From 04892340777484351473f1756e6f870500c00a48 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 1 Aug 2014 18:12:36 -0500
Subject: [PATCH 11/93] btrfs: consistently use fs_info in close_ctree()

close_ctree() has a local fs_info var for convienience;
use it consistently.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/disk-io.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d91f08994d3c..6f37d9fc5d28 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3670,7 +3670,7 @@ void close_ctree(struct btrfs_root *root)
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_commit_super(root);
 		if (ret)
-			btrfs_err(root->fs_info, "commit super ret %d", ret);
+			btrfs_err(fs_info, "commit super ret %d", ret);
 	}
 
 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
@@ -3682,10 +3682,10 @@ void close_ctree(struct btrfs_root *root)
 	fs_info->closing = 2;
 	smp_mb();
 
-	btrfs_free_qgroup_config(root->fs_info);
+	btrfs_free_qgroup_config(fs_info);
 
 	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
-		btrfs_info(root->fs_info, "at unmount delalloc count %lld",
+		btrfs_info(fs_info, "at unmount delalloc count %lld",
 		       percpu_counter_sum(&fs_info->delalloc_bytes));
 	}
 

From 638aa7ed4667f63ec45b264d1b6e259a79638733 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 1 Aug 2014 18:12:38 -0500
Subject: [PATCH 12/93] btrfs: factor btrfs_init_scrub() out of open_ctree()

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
[renamed to btrfs_init_scrub]
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/disk-io.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6f37d9fc5d28..2936a3958f3c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2147,6 +2147,17 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
 	}
 }
 
+static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
+{
+	mutex_init(&fs_info->scrub_lock);
+	atomic_set(&fs_info->scrubs_running, 0);
+	atomic_set(&fs_info->scrub_pause_req, 0);
+	atomic_set(&fs_info->scrubs_paused, 0);
+	atomic_set(&fs_info->scrub_cancel_req, 0);
+	init_waitqueue_head(&fs_info->scrub_pause_wait);
+	fs_info->scrub_workers_refcnt = 0;
+}
+
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options)
@@ -2295,14 +2306,8 @@ int open_ctree(struct super_block *sb,
 	}
 	btrfs_init_delayed_root(fs_info->delayed_root);
 
-	mutex_init(&fs_info->scrub_lock);
-	atomic_set(&fs_info->scrubs_running, 0);
-	atomic_set(&fs_info->scrub_pause_req, 0);
-	atomic_set(&fs_info->scrubs_paused, 0);
-	atomic_set(&fs_info->scrub_cancel_req, 0);
+	btrfs_init_scrub(fs_info);
 	init_waitqueue_head(&fs_info->replace_wait);
-	init_waitqueue_head(&fs_info->scrub_pause_wait);
-	fs_info->scrub_workers_refcnt = 0;
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	fs_info->check_integrity_print_mask = 0;
 #endif

From 779a65a495b76aee7fae500ac7ae360dc8fd81ff Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 1 Aug 2014 18:12:39 -0500
Subject: [PATCH 13/93] btrfs: factor btrfs_init_balance() out of open_ctree()

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
[renamed to btrfs_init_balance]
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/disk-io.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2936a3958f3c..b95b033b2494 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2158,6 +2158,17 @@ static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
 	fs_info->scrub_workers_refcnt = 0;
 }
 
+static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
+{
+	spin_lock_init(&fs_info->balance_lock);
+	mutex_init(&fs_info->balance_mutex);
+	atomic_set(&fs_info->balance_running, 0);
+	atomic_set(&fs_info->balance_pause_req, 0);
+	atomic_set(&fs_info->balance_cancel_req, 0);
+	fs_info->balance_ctl = NULL;
+	init_waitqueue_head(&fs_info->balance_wait_q);
+}
+
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options)
@@ -2311,14 +2322,7 @@ int open_ctree(struct super_block *sb,
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	fs_info->check_integrity_print_mask = 0;
 #endif
-
-	spin_lock_init(&fs_info->balance_lock);
-	mutex_init(&fs_info->balance_mutex);
-	atomic_set(&fs_info->balance_running, 0);
-	atomic_set(&fs_info->balance_pause_req, 0);
-	atomic_set(&fs_info->balance_cancel_req, 0);
-	fs_info->balance_ctl = NULL;
-	init_waitqueue_head(&fs_info->balance_wait_q);
+	btrfs_init_balance(fs_info);
 	btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
 
 	sb->s_blocksize = 4096;

From f37938e0e22f90740cfbfe74ecd9aad5e8369b9d Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 1 Aug 2014 18:12:40 -0500
Subject: [PATCH 14/93] btrfs: factor btrfs_init_btree_inode() out of
 open_ctree()

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
[renamed to btrfs_init_btree_inode]
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/disk-io.c | 56 +++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 25 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b95b033b2494..6a64e666de05 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2169,6 +2169,36 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
 	init_waitqueue_head(&fs_info->balance_wait_q);
 }
 
+static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
+				   struct btrfs_root *tree_root)
+{
+	fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+	set_nlink(fs_info->btree_inode, 1);
+	/*
+	 * we set the i_size on the btree inode to the max possible int.
+	 * the real end of the address space is determined by all of
+	 * the devices in the system
+	 */
+	fs_info->btree_inode->i_size = OFFSET_MAX;
+	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+	fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
+
+	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
+	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
+			     fs_info->btree_inode->i_mapping);
+	BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
+	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
+
+	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
+
+	BTRFS_I(fs_info->btree_inode)->root = tree_root;
+	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+	       sizeof(struct btrfs_key));
+	set_bit(BTRFS_INODE_DUMMY,
+		&BTRFS_I(fs_info->btree_inode)->runtime_flags);
+	btrfs_insert_inode_hash(fs_info->btree_inode);
+}
+
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options)
@@ -2329,31 +2359,7 @@ int open_ctree(struct super_block *sb,
 	sb->s_blocksize_bits = blksize_bits(4096);
 	sb->s_bdi = &fs_info->bdi;
 
-	fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
-	set_nlink(fs_info->btree_inode, 1);
-	/*
-	 * we set the i_size on the btree inode to the max possible int.
-	 * the real end of the address space is determined by all of
-	 * the devices in the system
-	 */
-	fs_info->btree_inode->i_size = OFFSET_MAX;
-	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
-	fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
-
-	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
-	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
-			     fs_info->btree_inode->i_mapping);
-	BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
-	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
-
-	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
-
-	BTRFS_I(fs_info->btree_inode)->root = tree_root;
-	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
-	       sizeof(struct btrfs_key));
-	set_bit(BTRFS_INODE_DUMMY,
-		&BTRFS_I(fs_info->btree_inode)->runtime_flags);
-	btrfs_insert_inode_hash(fs_info->btree_inode);
+	btrfs_init_btree_inode(fs_info, tree_root);
 
 	spin_lock_init(&fs_info->block_group_cache_lock);
 	fs_info->block_group_cache_tree = RB_ROOT;

From ad6183680c0845ff8052a919b97c71b4fec8f33a Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 1 Aug 2014 18:12:41 -0500
Subject: [PATCH 15/93] btrfs: factor btrfs_init_dev_replace_locks() out of
 open_ctree()

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
[renamed to btrfs_init_dev_replace_locks]
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/disk-io.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6a64e666de05..2b905a76a45e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2199,6 +2199,16 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
 	btrfs_insert_inode_hash(fs_info->btree_inode);
 }
 
+static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
+{
+	fs_info->dev_replace.lock_owner = 0;
+	atomic_set(&fs_info->dev_replace.nesting_level, 0);
+	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
+	mutex_init(&fs_info->dev_replace.lock_management_lock);
+	mutex_init(&fs_info->dev_replace.lock);
+	init_waitqueue_head(&fs_info->replace_wait);
+}
+
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options)
@@ -2348,7 +2358,6 @@ int open_ctree(struct super_block *sb,
 	btrfs_init_delayed_root(fs_info->delayed_root);
 
 	btrfs_init_scrub(fs_info);
-	init_waitqueue_head(&fs_info->replace_wait);
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	fs_info->check_integrity_print_mask = 0;
 #endif
@@ -2384,11 +2393,8 @@ int open_ctree(struct super_block *sb,
 	init_rwsem(&fs_info->cleanup_work_sem);
 	init_rwsem(&fs_info->subvol_sem);
 	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
-	fs_info->dev_replace.lock_owner = 0;
-	atomic_set(&fs_info->dev_replace.nesting_level, 0);
-	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
-	mutex_init(&fs_info->dev_replace.lock_management_lock);
-	mutex_init(&fs_info->dev_replace.lock);
+
+	btrfs_init_dev_replace_locks(fs_info);
 
 	spin_lock_init(&fs_info->qgroup_lock);
 	mutex_init(&fs_info->qgroup_ioctl_lock);

From f9e92e40b517bfe94d9e961ff484ab885348f322 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 1 Aug 2014 18:12:42 -0500
Subject: [PATCH 16/93] btrfs: factor btrfs_init_qgroup() out of open_ctree()

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
[renamed to btrfs_init_qgroup]
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/disk-io.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2b905a76a45e..1d8db0bb0d4f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2209,6 +2209,20 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
 	init_waitqueue_head(&fs_info->replace_wait);
 }
 
+static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
+{
+	spin_lock_init(&fs_info->qgroup_lock);
+	mutex_init(&fs_info->qgroup_ioctl_lock);
+	fs_info->qgroup_tree = RB_ROOT;
+	fs_info->qgroup_op_tree = RB_ROOT;
+	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+	fs_info->qgroup_seq = 1;
+	fs_info->quota_enabled = 0;
+	fs_info->pending_quota_state = 0;
+	fs_info->qgroup_ulist = NULL;
+	mutex_init(&fs_info->qgroup_rescan_lock);
+}
+
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options)
@@ -2395,17 +2409,7 @@ int open_ctree(struct super_block *sb,
 	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
 
 	btrfs_init_dev_replace_locks(fs_info);
-
-	spin_lock_init(&fs_info->qgroup_lock);
-	mutex_init(&fs_info->qgroup_ioctl_lock);
-	fs_info->qgroup_tree = RB_ROOT;
-	fs_info->qgroup_op_tree = RB_ROOT;
-	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
-	fs_info->qgroup_seq = 1;
-	fs_info->quota_enabled = 0;
-	fs_info->pending_quota_state = 0;
-	fs_info->qgroup_ulist = NULL;
-	mutex_init(&fs_info->qgroup_rescan_lock);
+	btrfs_init_qgroup(fs_info);
 
 	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
 	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);

From 2a4581983f90d829d9aeaa4cb881031950f54937 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 16 Feb 2015 16:29:26 +0100
Subject: [PATCH 17/93] btrfs: factor btrfs_init_workqueues() out of
 open_ctree()

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
[renamed to btrfs_init_workqueues]
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/disk-io.c | 153 ++++++++++++++++++++++++---------------------
 1 file changed, 83 insertions(+), 70 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1d8db0bb0d4f..9ecd19d98f89 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2223,6 +2223,86 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
 	mutex_init(&fs_info->qgroup_rescan_lock);
 }
 
+static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
+		struct btrfs_fs_devices *fs_devices)
+{
+	int max_active = fs_info->thread_pool_size;
+	int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
+
+	fs_info->workers =
+		btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
+				      max_active, 16);
+
+	fs_info->delalloc_workers =
+		btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
+
+	fs_info->flush_workers =
+		btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
+
+	fs_info->caching_workers =
+		btrfs_alloc_workqueue("cache", flags, max_active, 0);
+
+	/*
+	 * a higher idle thresh on the submit workers makes it much more
+	 * likely that bios will be send down in a sane order to the
+	 * devices
+	 */
+	fs_info->submit_workers =
+		btrfs_alloc_workqueue("submit", flags,
+				      min_t(u64, fs_devices->num_devices,
+					    max_active), 64);
+
+	fs_info->fixup_workers =
+		btrfs_alloc_workqueue("fixup", flags, 1, 0);
+
+	/*
+	 * endios are largely parallel and should have a very
+	 * low idle thresh
+	 */
+	fs_info->endio_workers =
+		btrfs_alloc_workqueue("endio", flags, max_active, 4);
+	fs_info->endio_meta_workers =
+		btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+	fs_info->endio_meta_write_workers =
+		btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+	fs_info->endio_raid56_workers =
+		btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+	fs_info->endio_repair_workers =
+		btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
+	fs_info->rmw_workers =
+		btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+	fs_info->endio_write_workers =
+		btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+	fs_info->endio_freespace_worker =
+		btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+	fs_info->delayed_workers =
+		btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+	fs_info->readahead_workers =
+		btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+	fs_info->qgroup_rescan_workers =
+		btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+	fs_info->extent_workers =
+		btrfs_alloc_workqueue("extent-refs", flags,
+				      min_t(u64, fs_devices->num_devices,
+					    max_active), 8);
+
+	if (!(fs_info->workers && fs_info->delalloc_workers &&
+	      fs_info->submit_workers && fs_info->flush_workers &&
+	      fs_info->endio_workers && fs_info->endio_meta_workers &&
+	      fs_info->endio_meta_write_workers &&
+	      fs_info->endio_repair_workers &&
+	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
+	      fs_info->caching_workers && fs_info->readahead_workers &&
+	      fs_info->fixup_workers && fs_info->delayed_workers &&
+	      fs_info->extent_workers &&
+	      fs_info->qgroup_rescan_workers)) {
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options)
@@ -2249,7 +2329,6 @@ int open_ctree(struct super_block *sb,
 	int num_backups_tried = 0;
 	int backup_index = 0;
 	int max_active;
-	int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
 	bool create_uuid_tree;
 	bool check_uuid_tree;
 
@@ -2581,75 +2660,9 @@ int open_ctree(struct super_block *sb,
 
 	max_active = fs_info->thread_pool_size;
 
-	fs_info->workers =
-		btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
-				      max_active, 16);
-
-	fs_info->delalloc_workers =
-		btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
-
-	fs_info->flush_workers =
-		btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
-
-	fs_info->caching_workers =
-		btrfs_alloc_workqueue("cache", flags, max_active, 0);
-
-	/*
-	 * a higher idle thresh on the submit workers makes it much more
-	 * likely that bios will be send down in a sane order to the
-	 * devices
-	 */
-	fs_info->submit_workers =
-		btrfs_alloc_workqueue("submit", flags,
-				      min_t(u64, fs_devices->num_devices,
-					    max_active), 64);
-
-	fs_info->fixup_workers =
-		btrfs_alloc_workqueue("fixup", flags, 1, 0);
-
-	/*
-	 * endios are largely parallel and should have a very
-	 * low idle thresh
-	 */
-	fs_info->endio_workers =
-		btrfs_alloc_workqueue("endio", flags, max_active, 4);
-	fs_info->endio_meta_workers =
-		btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
-	fs_info->endio_meta_write_workers =
-		btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
-	fs_info->endio_raid56_workers =
-		btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
-	fs_info->endio_repair_workers =
-		btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
-	fs_info->rmw_workers =
-		btrfs_alloc_workqueue("rmw", flags, max_active, 2);
-	fs_info->endio_write_workers =
-		btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
-	fs_info->endio_freespace_worker =
-		btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
-	fs_info->delayed_workers =
-		btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
-	fs_info->readahead_workers =
-		btrfs_alloc_workqueue("readahead", flags, max_active, 2);
-	fs_info->qgroup_rescan_workers =
-		btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
-	fs_info->extent_workers =
-		btrfs_alloc_workqueue("extent-refs", flags,
-				      min_t(u64, fs_devices->num_devices,
-					    max_active), 8);
-
-	if (!(fs_info->workers && fs_info->delalloc_workers &&
-	      fs_info->submit_workers && fs_info->flush_workers &&
-	      fs_info->endio_workers && fs_info->endio_meta_workers &&
-	      fs_info->endio_meta_write_workers &&
-	      fs_info->endio_repair_workers &&
-	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
-	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
-	      fs_info->caching_workers && fs_info->readahead_workers &&
-	      fs_info->fixup_workers && fs_info->delayed_workers &&
-	      fs_info->extent_workers &&
-	      fs_info->qgroup_rescan_workers)) {
-		err = -ENOMEM;
+	ret = btrfs_init_workqueues(fs_info, fs_devices);
+	if (ret) {
+		err = ret;
 		goto fail_sb_buffer;
 	}
 

From 63443bf54a746fada8ef2829148a29f28f07f7af Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 1 Aug 2014 18:12:46 -0500
Subject: [PATCH 18/93] btrfs: factor btrfs_replay_log() out of open_ctree()

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/disk-io.c | 93 ++++++++++++++++++++++++++--------------------
 1 file changed, 53 insertions(+), 40 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9ecd19d98f89..954f0658dd0e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2303,6 +2303,57 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
+static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
+			    struct btrfs_fs_devices *fs_devices)
+{
+	int ret;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *log_tree_root;
+	struct btrfs_super_block *disk_super = fs_info->super_copy;
+	u64 bytenr = btrfs_super_log_root(disk_super);
+
+	if (fs_devices->rw_devices == 0) {
+		printk(KERN_WARNING "BTRFS: log replay required "
+		       "on RO media\n");
+		return -EIO;
+	}
+
+	log_tree_root = btrfs_alloc_root(fs_info);
+	if (!log_tree_root)
+		return -ENOMEM;
+
+	__setup_root(tree_root->nodesize, tree_root->sectorsize,
+			tree_root->stripesize, log_tree_root, fs_info,
+			BTRFS_TREE_LOG_OBJECTID);
+
+	log_tree_root->node = read_tree_block(tree_root, bytenr,
+			fs_info->generation + 1);
+	if (!log_tree_root->node ||
+	    !extent_buffer_uptodate(log_tree_root->node)) {
+		printk(KERN_ERR "BTRFS: failed to read log tree\n");
+		free_extent_buffer(log_tree_root->node);
+		kfree(log_tree_root);
+		return -EIO;
+	}
+	/* returns with log_tree_root freed on success */
+	ret = btrfs_recover_log_trees(log_tree_root);
+	if (ret) {
+		btrfs_error(tree_root->fs_info, ret,
+			    "Failed to recover log tree");
+		free_extent_buffer(log_tree_root->node);
+		kfree(log_tree_root);
+		return ret;
+	}
+
+	if (fs_info->sb->s_flags & MS_RDONLY) {
+		ret = btrfs_commit_super(tree_root);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options)
@@ -2323,7 +2374,6 @@ int open_ctree(struct super_block *sb,
 	struct btrfs_root *dev_root;
 	struct btrfs_root *quota_root;
 	struct btrfs_root *uuid_root;
-	struct btrfs_root *log_tree_root;
 	int ret;
 	int err = -EINVAL;
 	int num_backups_tried = 0;
@@ -2904,48 +2954,11 @@ int open_ctree(struct super_block *sb,
 
 	/* do not make disk changes in broken FS */
 	if (btrfs_super_log_root(disk_super) != 0) {
-		u64 bytenr = btrfs_super_log_root(disk_super);
-
-		if (fs_devices->rw_devices == 0) {
-			printk(KERN_WARNING "BTRFS: log replay required "
-			       "on RO media\n");
-			err = -EIO;
-			goto fail_qgroup;
-		}
-
-		log_tree_root = btrfs_alloc_root(fs_info);
-		if (!log_tree_root) {
-			err = -ENOMEM;
-			goto fail_qgroup;
-		}
-
-		__setup_root(nodesize, sectorsize, stripesize,
-			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
-
-		log_tree_root->node = read_tree_block(tree_root, bytenr,
-						      generation + 1);
-		if (!log_tree_root->node ||
-		    !extent_buffer_uptodate(log_tree_root->node)) {
-			printk(KERN_ERR "BTRFS: failed to read log tree\n");
-			free_extent_buffer(log_tree_root->node);
-			kfree(log_tree_root);
-			goto fail_qgroup;
-		}
-		/* returns with log_tree_root freed on success */
-		ret = btrfs_recover_log_trees(log_tree_root);
+		ret = btrfs_replay_log(fs_info, fs_devices);
 		if (ret) {
-			btrfs_error(tree_root->fs_info, ret,
-				    "Failed to recover log tree");
-			free_extent_buffer(log_tree_root->node);
-			kfree(log_tree_root);
+			err = ret;
 			goto fail_qgroup;
 		}
-
-		if (sb->s_flags & MS_RDONLY) {
-			ret = btrfs_commit_super(tree_root);
-			if (ret)
-				goto fail_qgroup;
-		}
 	}
 
 	ret = btrfs_find_orphan_roots(tree_root);

From 4bbcaa648d23470f04285c52a27d87c8fe906203 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 1 Aug 2014 18:12:45 -0500
Subject: [PATCH 19/93] btrfs: factor btrfs_read_roots() out of open_ctree()

Also, remove the two local variables create_uuid_tree
and check_uuid_tree; we can use the existence of
the uuid root and/or the RESCAN_UUID_TREE flag to
determine what action to take.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/disk-io.c | 131 +++++++++++++++++++++++----------------------
 1 file changed, 66 insertions(+), 65 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 954f0658dd0e..c7546d6512cc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2354,6 +2354,65 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
+static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
+			    struct btrfs_root *tree_root)
+{
+	struct btrfs_root *extent_root;
+	struct btrfs_root *dev_root;
+	struct btrfs_root *csum_root;
+	struct btrfs_root *quota_root;
+	struct btrfs_root *uuid_root;
+	struct btrfs_key location;
+	int ret;
+
+	location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+	location.type = BTRFS_ROOT_ITEM_KEY;
+	location.offset = 0;
+
+	extent_root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(extent_root))
+		return PTR_ERR(extent_root);
+	set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state);
+	fs_info->extent_root = extent_root;
+
+	location.objectid = BTRFS_DEV_TREE_OBJECTID;
+	dev_root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(dev_root))
+		return PTR_ERR(dev_root);
+	set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state);
+	fs_info->dev_root = dev_root;
+	btrfs_init_devices_late(fs_info);
+
+	location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+	csum_root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(csum_root))
+		return PTR_ERR(csum_root);
+	set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state);
+	fs_info->csum_root = csum_root;
+
+	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
+	quota_root = btrfs_read_tree_root(tree_root, &location);
+	if (!IS_ERR(quota_root)) {
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state);
+		fs_info->quota_enabled = 1;
+		fs_info->pending_quota_state = 1;
+		fs_info->quota_root = quota_root;
+	}
+
+	location.objectid = BTRFS_UUID_TREE_OBJECTID;
+	uuid_root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(uuid_root)) {
+		ret = PTR_ERR(uuid_root);
+		if (ret != -ENOENT)
+			return ret;
+	} else {
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state);
+		fs_info->uuid_root = uuid_root;
+	}
+
+	return 0;
+}
+
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options)
@@ -2368,19 +2427,12 @@ int open_ctree(struct super_block *sb,
 	struct btrfs_super_block *disk_super;
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_root *tree_root;
-	struct btrfs_root *extent_root;
-	struct btrfs_root *csum_root;
 	struct btrfs_root *chunk_root;
-	struct btrfs_root *dev_root;
-	struct btrfs_root *quota_root;
-	struct btrfs_root *uuid_root;
 	int ret;
 	int err = -EINVAL;
 	int num_backups_tried = 0;
 	int backup_index = 0;
 	int max_active;
-	bool create_uuid_tree;
-	bool check_uuid_tree;
 
 	tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
 	chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
@@ -2804,61 +2856,9 @@ int open_ctree(struct super_block *sb,
 	tree_root->commit_root = btrfs_root_node(tree_root);
 	btrfs_set_root_refs(&tree_root->root_item, 1);
 
-	location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
-	location.type = BTRFS_ROOT_ITEM_KEY;
-	location.offset = 0;
-
-	extent_root = btrfs_read_tree_root(tree_root, &location);
-	if (IS_ERR(extent_root)) {
-		ret = PTR_ERR(extent_root);
+	ret = btrfs_read_roots(fs_info, tree_root);
+	if (ret)
 		goto recovery_tree_root;
-	}
-	set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state);
-	fs_info->extent_root = extent_root;
-
-	location.objectid = BTRFS_DEV_TREE_OBJECTID;
-	dev_root = btrfs_read_tree_root(tree_root, &location);
-	if (IS_ERR(dev_root)) {
-		ret = PTR_ERR(dev_root);
-		goto recovery_tree_root;
-	}
-	set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state);
-	fs_info->dev_root = dev_root;
-	btrfs_init_devices_late(fs_info);
-
-	location.objectid = BTRFS_CSUM_TREE_OBJECTID;
-	csum_root = btrfs_read_tree_root(tree_root, &location);
-	if (IS_ERR(csum_root)) {
-		ret = PTR_ERR(csum_root);
-		goto recovery_tree_root;
-	}
-	set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state);
-	fs_info->csum_root = csum_root;
-
-	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
-	quota_root = btrfs_read_tree_root(tree_root, &location);
-	if (!IS_ERR(quota_root)) {
-		set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state);
-		fs_info->quota_enabled = 1;
-		fs_info->pending_quota_state = 1;
-		fs_info->quota_root = quota_root;
-	}
-
-	location.objectid = BTRFS_UUID_TREE_OBJECTID;
-	uuid_root = btrfs_read_tree_root(tree_root, &location);
-	if (IS_ERR(uuid_root)) {
-		ret = PTR_ERR(uuid_root);
-		if (ret != -ENOENT)
-			goto recovery_tree_root;
-		create_uuid_tree = true;
-		check_uuid_tree = false;
-	} else {
-		set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state);
-		fs_info->uuid_root = uuid_root;
-		create_uuid_tree = false;
-		check_uuid_tree =
-		    generation != btrfs_super_uuid_tree_generation(disk_super);
-	}
 
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
@@ -2896,7 +2896,7 @@ int open_ctree(struct super_block *sb,
 		goto fail_sysfs;
 	}
 
-	ret = btrfs_read_block_groups(extent_root);
+	ret = btrfs_read_block_groups(fs_info->extent_root);
 	if (ret) {
 		printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
 		goto fail_sysfs;
@@ -3019,7 +3019,7 @@ int open_ctree(struct super_block *sb,
 
 	btrfs_qgroup_rescan_resume(fs_info);
 
-	if (create_uuid_tree) {
+	if (!fs_info->uuid_root) {
 		pr_info("BTRFS: creating UUID tree\n");
 		ret = btrfs_create_uuid_tree(fs_info);
 		if (ret) {
@@ -3028,8 +3028,9 @@ int open_ctree(struct super_block *sb,
 			close_ctree(tree_root);
 			return ret;
 		}
-	} else if (check_uuid_tree ||
-		   btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) {
+	} else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) ||
+		   fs_info->generation !=
+				btrfs_super_uuid_tree_generation(disk_super)) {
 		pr_info("BTRFS: checking UUID tree\n");
 		ret = btrfs_check_uuid_tree(fs_info);
 		if (ret) {

From 6f0110581911623df08bf4b65fdef4548ebcda0a Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Mon, 16 Feb 2015 18:34:01 +0100
Subject: [PATCH 20/93] btrfs: use correct type for workqueue flags

Through all the local wrappers to alloc_workqueue, __alloc_workqueue_key
takes an unsigned int.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/async-thread.c | 4 ++--
 fs/btrfs/async-thread.h | 2 +-
 fs/btrfs/disk-io.c      | 2 +-
 fs/btrfs/scrub.c        | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 4dabeb893b7c..df9932b00d08 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -87,7 +87,7 @@ BTRFS_WORK_HELPER(scrubwrc_helper);
 BTRFS_WORK_HELPER(scrubnc_helper);
 
 static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
 			 int thresh)
 {
 	struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -132,7 +132,7 @@ static inline void
 __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
 
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
-					      int flags,
+					      unsigned int flags,
 					      int max_active,
 					      int thresh)
 {
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index e386c29ef1f6..ec2ee477f8ba 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -66,7 +66,7 @@ BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
 BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
 
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
-					      int flags,
+					      unsigned int flags,
 					      int max_active,
 					      int thresh);
 void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c7546d6512cc..29cb50b4981e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2227,7 +2227,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
 		struct btrfs_fs_devices *fs_devices)
 {
 	int max_active = fs_info->thread_pool_size;
-	int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
+	unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
 
 	fs_info->workers =
 		btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index db21f17df996..4af5f49a3105 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3561,7 +3561,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 						int is_dev_replace)
 {
 	int ret = 0;
-	int flags = WQ_FREEZABLE | WQ_UNBOUND;
+	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
 	int max_active = fs_info->thread_pool_size;
 
 	if (fs_info->scrub_workers_refcnt == 0) {

From a4f3d2c4efe2628329249b64fd5799468e025b9d Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Mon, 16 Feb 2015 18:44:34 +0100
Subject: [PATCH 21/93] btrfs: cleanup, reduce temporary variables in
 btrfs_read_roots

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/disk-io.c | 54 +++++++++++++++++++++-------------------------
 1 file changed, 25 insertions(+), 29 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 29cb50b4981e..f770e8b5cb86 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2357,11 +2357,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
 static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
 			    struct btrfs_root *tree_root)
 {
-	struct btrfs_root *extent_root;
-	struct btrfs_root *dev_root;
-	struct btrfs_root *csum_root;
-	struct btrfs_root *quota_root;
-	struct btrfs_root *uuid_root;
+	struct btrfs_root *root;
 	struct btrfs_key location;
 	int ret;
 
@@ -2369,45 +2365,45 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
 	location.type = BTRFS_ROOT_ITEM_KEY;
 	location.offset = 0;
 
-	extent_root = btrfs_read_tree_root(tree_root, &location);
-	if (IS_ERR(extent_root))
-		return PTR_ERR(extent_root);
-	set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state);
-	fs_info->extent_root = extent_root;
+	root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+	fs_info->extent_root = root;
 
 	location.objectid = BTRFS_DEV_TREE_OBJECTID;
-	dev_root = btrfs_read_tree_root(tree_root, &location);
-	if (IS_ERR(dev_root))
-		return PTR_ERR(dev_root);
-	set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state);
-	fs_info->dev_root = dev_root;
+	root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+	fs_info->dev_root = root;
 	btrfs_init_devices_late(fs_info);
 
 	location.objectid = BTRFS_CSUM_TREE_OBJECTID;
-	csum_root = btrfs_read_tree_root(tree_root, &location);
-	if (IS_ERR(csum_root))
-		return PTR_ERR(csum_root);
-	set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state);
-	fs_info->csum_root = csum_root;
+	root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+	fs_info->csum_root = root;
 
 	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
-	quota_root = btrfs_read_tree_root(tree_root, &location);
-	if (!IS_ERR(quota_root)) {
-		set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state);
+	root = btrfs_read_tree_root(tree_root, &location);
+	if (!IS_ERR(root)) {
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
 		fs_info->quota_enabled = 1;
 		fs_info->pending_quota_state = 1;
-		fs_info->quota_root = quota_root;
+		fs_info->quota_root = root;
 	}
 
 	location.objectid = BTRFS_UUID_TREE_OBJECTID;
-	uuid_root = btrfs_read_tree_root(tree_root, &location);
-	if (IS_ERR(uuid_root)) {
-		ret = PTR_ERR(uuid_root);
+	root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
 		if (ret != -ENOENT)
 			return ret;
 	} else {
-		set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state);
-		fs_info->uuid_root = uuid_root;
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+		fs_info->uuid_root = root;
 	}
 
 	return 0;

From 16068ec19f497bd80552332b4523150884888a7a Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 16 Jan 2015 17:21:12 +0100
Subject: [PATCH 22/93] btrfs: cleanup 64bit/32bit divs, compile time constants

Switch to div_u64 if the divisor is a numeric constant or sum of
sizeof()s. We can remove a few instances of do_div that has the hidden
semtantics of changing the 1st argument.

Small power-of-two divisors are converted to bitshifts, large values are
kept intact for clarity.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/dev-replace.c      |  6 +++---
 fs/btrfs/disk-io.c          |  2 +-
 fs/btrfs/extent-tree.c      | 15 +++++++--------
 fs/btrfs/free-space-cache.c |  8 ++++----
 fs/btrfs/math.h             |  6 ++----
 fs/btrfs/super.c            |  2 +-
 6 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 5ec03d999c37..0573848c7333 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -670,8 +670,8 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 		srcdev = dev_replace->srcdev;
-		args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
-			div64_u64(btrfs_device_get_total_bytes(srcdev), 1000));
+		args->status.progress_1000 = div_u64(dev_replace->cursor_left,
+			div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
 		break;
 	}
 	btrfs_dev_replace_unlock(dev_replace);
@@ -806,7 +806,7 @@ static int btrfs_dev_replace_kthread(void *data)
 		btrfs_dev_replace_status(fs_info, status_args);
 		progress = status_args->status.progress_1000;
 		kfree(status_args);
-		do_div(progress, 10);
+		progress = div_u64(progress, 10);
 		printk_in_rcu(KERN_INFO
 			"BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
 			dev_replace->srcdev->missing ? "<missing disk>" :
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 41b320e235d7..73b152138221 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2277,7 +2277,7 @@ int open_ctree(struct super_block *sb,
 	fs_info->free_chunk_space = 0;
 	fs_info->tree_mod_log = RB_ROOT;
 	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
-	fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64);
+	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
 	/* readahead state */
 	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
 	spin_lock_init(&fs_info->reada_lock);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 28ce5c8004d4..2cb32bc45bcc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2561,8 +2561,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		 */
 		spin_lock(&delayed_refs->lock);
 		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
-		avg = div64_u64(avg, 4);
-		fs_info->avg_delayed_ref_runtime = avg;
+		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
 		spin_unlock(&delayed_refs->lock);
 	}
 	return 0;
@@ -2624,7 +2623,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
 	 * We don't ever fill up leaves all the way so multiply by 2 just to be
 	 * closer to what we're really going to want to ouse.
 	 */
-	return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
 }
 
 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
@@ -3193,7 +3192,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
 	struct inode *inode = NULL;
 	u64 alloc_hint = 0;
 	int dcs = BTRFS_DC_ERROR;
-	int num_pages = 0;
+	u64 num_pages = 0;
 	int retries = 0;
 	int ret = 0;
 
@@ -3277,7 +3276,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
 	 * taking up quite a bit since it's not folded into the other space
 	 * cache.
 	 */
-	num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
+	num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
 	if (!num_pages)
 		num_pages = 1;
 
@@ -4770,10 +4769,10 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
 
 	num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
 		    csum_size * 2;
-	num_bytes += div64_u64(data_used + meta_used, 50);
+	num_bytes += div_u64(data_used + meta_used, 50);
 
 	if (num_bytes * 3 > meta_used)
-		num_bytes = div64_u64(meta_used, 3);
+		num_bytes = div_u64(meta_used, 3);
 
 	return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
 }
@@ -5039,7 +5038,7 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
 	else
 		BTRFS_I(inode)->csum_bytes -= num_bytes;
 	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
-	num_csums_per_leaf = (int)div64_u64(csum_size,
+	num_csums_per_leaf = (int)div_u64(csum_size,
 					    sizeof(struct btrfs_csum_item) +
 					    sizeof(struct btrfs_disk_key));
 	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a71978578fa7..43007c15dcba 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1537,7 +1537,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 		max_bytes = MAX_CACHE_BYTES_PER_GIG;
 	else
 		max_bytes = MAX_CACHE_BYTES_PER_GIG *
-			div64_u64(size, 1024 * 1024 * 1024);
+			div_u64(size, 1024 * 1024 * 1024);
 
 	/*
 	 * we want to account for 1 more bitmap than what we have so we can make
@@ -1552,14 +1552,14 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 	}
 
 	/*
-	 * we want the extent entry threshold to always be at most 1/2 the maxw
+	 * we want the extent entry threshold to always be at most 1/2 the max
 	 * bytes we can have, or whatever is less than that.
 	 */
 	extent_bytes = max_bytes - bitmap_bytes;
-	extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
+	extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
 
 	ctl->extents_thresh =
-		div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
+		div_u64(extent_bytes, sizeof(struct btrfs_free_space));
 }
 
 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
index b7816cefbd13..1b10a3cd1195 100644
--- a/fs/btrfs/math.h
+++ b/fs/btrfs/math.h
@@ -28,8 +28,7 @@ static inline u64 div_factor(u64 num, int factor)
 	if (factor == 10)
 		return num;
 	num *= factor;
-	do_div(num, 10);
-	return num;
+	return div_u64(num, 10);
 }
 
 static inline u64 div_factor_fine(u64 num, int factor)
@@ -37,8 +36,7 @@ static inline u64 div_factor_fine(u64 num, int factor)
 	if (factor == 100)
 		return num;
 	num *= factor;
-	do_div(num, 100);
-	return num;
+	return div_u64(num, 100);
 }
 
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 100a0442c413..530a0baa7c71 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1704,7 +1704,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 		avail_space = device->total_bytes - device->bytes_used;
 
 		/* align with stripe_len */
-		do_div(avail_space, BTRFS_STRIPE_LEN);
+		avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN);
 		avail_space *= BTRFS_STRIPE_LEN;
 
 		/*

From c7abe829c04dc8ce0ecd1c07bf3bc97d2bb9655e Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 16 Jan 2015 17:26:13 +0100
Subject: [PATCH 23/93] btrfs: cleanup 64bit/32bit divs, provably bounded
 values

The divisor is derived from nodesize or PAGE_SIZE, fits into 32bit type.
Get rid of a few more do_div instances.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/extent-tree.c      |  4 ++--
 fs/btrfs/free-space-cache.c | 10 +++++-----
 fs/btrfs/ioctl.c            |  2 +-
 fs/btrfs/scrub.c            |  8 +++-----
 fs/btrfs/volumes.c          | 26 +++++++++++++-------------
 5 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2cb32bc45bcc..4ccc3397c644 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5032,7 +5032,7 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
 	    BTRFS_I(inode)->csum_bytes == 0)
 		return 0;
 
-	old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+	old_csums = (int)div_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
 	if (reserve)
 		BTRFS_I(inode)->csum_bytes += num_bytes;
 	else
@@ -5041,7 +5041,7 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
 	num_csums_per_leaf = (int)div_u64(csum_size,
 					    sizeof(struct btrfs_csum_item) +
 					    sizeof(struct btrfs_disk_key));
-	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+	num_csums = (int)div_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
 	num_csums = num_csums + num_csums_per_leaf - 1;
 	num_csums = num_csums / num_csums_per_leaf;
 
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 43007c15dcba..f371aac91a90 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1298,11 +1298,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
 				   u64 offset)
 {
 	u64 bitmap_start;
-	u64 bytes_per_bitmap;
+	u32 bytes_per_bitmap;
 
 	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
 	bitmap_start = offset - ctl->start;
-	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
+	bitmap_start = div_u64(bitmap_start, bytes_per_bitmap);
 	bitmap_start *= bytes_per_bitmap;
 	bitmap_start += ctl->start;
 
@@ -1521,10 +1521,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 	u64 bitmap_bytes;
 	u64 extent_bytes;
 	u64 size = block_group->key.offset;
-	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
-	int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
+	u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
+	u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg);
 
-	max_bitmaps = max(max_bitmaps, 1);
+	max_bitmaps = max_t(u32, max_bitmaps, 1);
 
 	ASSERT(ctl->total_bitmaps <= max_bitmaps);
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d49fe8a0f6b5..5544a79dc2ba 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1564,7 +1564,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 		goto out_free;
 	}
 
-	do_div(new_size, root->sectorsize);
+	new_size = div_u64(new_size, root->sectorsize);
 	new_size *= root->sectorsize;
 
 	printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index db21f17df996..07be15d4ce7b 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2624,9 +2624,8 @@ static int get_raid56_logic_offset(u64 physical, int num,
 	for (i = 0; i < nr_data_stripes(map); i++) {
 		*offset = last_offset + i * map->stripe_len;
 
-		stripe_nr = *offset;
-		do_div(stripe_nr, map->stripe_len);
-		do_div(stripe_nr, nr_data_stripes(map));
+		stripe_nr = div_u64(*offset, map->stripe_len);
+		stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
 
 		/* Work out the disk rotation on this stripe-set */
 		rot = do_div(stripe_nr, map->num_stripes);
@@ -2995,10 +2994,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	int extent_mirror_num;
 	int stop_loop = 0;
 
-	nstripes = length;
 	physical = map->stripes[num].physical;
 	offset = 0;
-	do_div(nstripes, map->stripe_len);
+	nstripes = div_u64(length, map->stripe_len);
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 		offset = map->stripe_len * num;
 		increment = map->stripe_len * map->num_stripes;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8222f6f74147..36c9c1a89232 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3022,7 +3022,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
 
 		stripe_offset = btrfs_stripe_offset(leaf, stripe);
 		stripe_length = btrfs_chunk_length(leaf, chunk);
-		do_div(stripe_length, factor);
+		stripe_length = div_u64(stripe_length, factor);
 
 		if (stripe_offset < bargs->pend &&
 		    stripe_offset + stripe_length > bargs->pstart)
@@ -4400,8 +4400,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	 */
 	if (stripe_size * data_stripes > max_chunk_size) {
 		u64 mask = (1ULL << 24) - 1;
-		stripe_size = max_chunk_size;
-		do_div(stripe_size, data_stripes);
+
+		stripe_size = div_u64(max_chunk_size, data_stripes);
 
 		/* bump the answer up to a 16MB boundary */
 		stripe_size = (stripe_size + mask) & ~mask;
@@ -4413,10 +4413,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			stripe_size = devices_info[ndevs-1].max_avail;
 	}
 
-	do_div(stripe_size, dev_stripes);
+	stripe_size = div_u64(stripe_size, dev_stripes);
 
 	/* align to BTRFS_STRIPE_LEN */
-	do_div(stripe_size, raid_stripe_len);
+	stripe_size = div_u64(stripe_size, raid_stripe_len);
 	stripe_size *= raid_stripe_len;
 
 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -5136,7 +5136,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	stripe_index = 0;
 	stripe_nr_orig = stripe_nr;
 	stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
-	do_div(stripe_nr_end, map->stripe_len);
+	stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len);
 	stripe_end_offset = stripe_nr_end * map->stripe_len -
 			    (offset + *length);
 
@@ -5198,8 +5198,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		    ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
 		     mirror_num > 1)) {
 			/* push stripe_nr back to the start of the full stripe */
-			stripe_nr = raid56_full_stripe_start;
-			do_div(stripe_nr, stripe_len * nr_data_stripes(map));
+			stripe_nr = div_u64(raid56_full_stripe_start,
+					stripe_len * nr_data_stripes(map));
 
 			/* RAID[56] write or recovery. Return all stripes */
 			num_stripes = map->num_stripes;
@@ -5535,11 +5535,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	rmap_len = map->stripe_len;
 
 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
-		do_div(length, map->num_stripes / map->sub_stripes);
+		length = div_u64(length, map->num_stripes / map->sub_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
-		do_div(length, map->num_stripes);
+		length = div_u64(length, map->num_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		do_div(length, nr_data_stripes(map));
+		length = div_u64(length, nr_data_stripes(map));
 		rmap_len = map->stripe_len * nr_data_stripes(map);
 	}
 
@@ -5554,11 +5554,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 			continue;
 
 		stripe_nr = physical - map->stripes[i].physical;
-		do_div(stripe_nr, map->stripe_len);
+		stripe_nr = div_u64(stripe_nr, map->stripe_len);
 
 		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 			stripe_nr = stripe_nr * map->num_stripes + i;
-			do_div(stripe_nr, map->sub_stripes);
+			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 			stripe_nr = stripe_nr * map->num_stripes + i;
 		} /* else if RAID[56], multiply by nr_data_stripes().

From 35b850f16d9adbb833775105ec942fbc818ee6a0 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 20 Feb 2015 18:43:47 +0100
Subject: [PATCH 24/93] btrfs: replace remaining do_div calls with div_u64
 variants

Switch to div_u64_rem that does type checking and has more obvious
semantics than do_div.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/extent-tree.c      |  2 +-
 fs/btrfs/free-space-cache.c |  2 +-
 fs/btrfs/scrub.c            |  4 ++--
 fs/btrfs/volumes.c          | 31 ++++++++++++++++---------------
 4 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4ccc3397c644..5840afe5e5f9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8670,7 +8670,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 		min_free <<= 1;
 	} else if (index == BTRFS_RAID_RAID0) {
 		dev_min = fs_devices->rw_devices;
-		do_div(min_free, dev_min);
+		min_free = div64_u64(min_free, dev_min);
 	}
 
 	/* We need to do this so that we can look at pending chunks */
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f371aac91a90..740beed2aec5 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1673,7 +1673,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
 		 */
 		if (*bytes >= align) {
 			tmp = entry->offset - ctl->start + align - 1;
-			do_div(tmp, align);
+			tmp = div64_u64(tmp, align);
 			tmp = tmp * align + ctl->start;
 			align_off = tmp - entry->offset;
 		} else {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 07be15d4ce7b..4cdf739c7a47 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2329,7 +2329,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
 	}
 
 	start -= sparity->logic_start;
-	offset = (int)do_div(start, sparity->stripe_len);
+	start = div_u64_rem(start, sparity->stripe_len, &offset);
 	offset /= sectorsize;
 	nsectors = (int)len / sectorsize;
 
@@ -2628,7 +2628,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
 		stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
 
 		/* Work out the disk rotation on this stripe-set */
-		rot = do_div(stripe_nr, map->num_stripes);
+		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
 		/* calculate which stripe this data locates */
 		rot += i;
 		stripe_index = rot % map->num_stripes;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 36c9c1a89232..e79eee9c127e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4995,7 +4995,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	 * stripe_nr counts the total number of stripes we have to stride
 	 * to get to this block
 	 */
-	do_div(stripe_nr, stripe_len);
+	stripe_nr = div64_u64(stripe_nr, stripe_len);
 
 	stripe_offset = stripe_nr * stripe_len;
 	BUG_ON(offset < stripe_offset);
@@ -5011,7 +5011,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		/* allow a write of a full stripe, but make sure we don't
 		 * allow straddling of stripes
 		 */
-		do_div(raid56_full_stripe_start, full_stripe_len);
+		raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
+				full_stripe_len);
 		raid56_full_stripe_start *= full_stripe_len;
 	}
 
@@ -5144,7 +5145,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		if (rw & REQ_DISCARD)
 			num_stripes = min_t(u64, map->num_stripes,
 					    stripe_nr_end - stripe_nr_orig);
-		stripe_index = do_div(stripe_nr, map->num_stripes);
+		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+				&stripe_index);
 		if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
 			mirror_num = 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
@@ -5172,7 +5174,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
 
-		stripe_index = do_div(stripe_nr, factor);
+		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
 		stripe_index *= map->sub_stripes;
 
 		if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
@@ -5209,32 +5211,32 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			stripe_index = 0;
 			stripe_offset = 0;
 		} else {
-			u64 tmp;
-
 			/*
 			 * Mirror #0 or #1 means the original data block.
 			 * Mirror #2 is RAID5 parity block.
 			 * Mirror #3 is RAID6 Q block.
 			 */
-			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+			stripe_nr = div_u64_rem(stripe_nr,
+					nr_data_stripes(map), &stripe_index);
 			if (mirror_num > 1)
 				stripe_index = nr_data_stripes(map) +
 						mirror_num - 2;
 
 			/* We distribute the parity blocks across stripes */
-			tmp = stripe_nr + stripe_index;
-			stripe_index = do_div(tmp, map->num_stripes);
+			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
+					&stripe_index);
 			if (!(rw & (REQ_WRITE | REQ_DISCARD |
 				    REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
 				mirror_num = 1;
 		}
 	} else {
 		/*
-		 * after this do_div call, stripe_nr is the number of stripes
-		 * on this device we have to walk to find the data, and
-		 * stripe_index is the number of our device in the stripe array
+		 * after this, stripe_nr is the number of stripes on this
+		 * device we have to walk to find the data, and stripe_index is
+		 * the number of our device in the stripe array
 		 */
-		stripe_index = do_div(stripe_nr, map->num_stripes);
+		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+				&stripe_index);
 		mirror_num = stripe_index + 1;
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
@@ -5269,8 +5271,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 				 sizeof(int) * tgtdev_indexes);
 
 		/* Work out the disk rotation on this stripe-set */
-		tmp = stripe_nr;
-		rot = do_div(tmp, num_stripes);
+		div_u64_rem(stripe_nr, num_stripes, &rot);
 
 		/* Fill in the logical address of each stripe */
 		tmp = stripe_nr * nr_data_stripes(map);

From 29cf342b8f1056486557015aba4538804a872dc1 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 20 Feb 2015 18:42:11 +0100
Subject: [PATCH 25/93] btrfs: cleanup, use correct type in div_u64_rem

div_u64_rem expects u32 for divisior and reminder.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/scrub.c   |  6 +++---
 fs/btrfs/volumes.c | 11 ++++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4cdf739c7a47..5c3ca68b1fb4 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2319,7 +2319,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
 				       unsigned long *bitmap,
 				       u64 start, u64 len)
 {
-	int offset;
+	u32 offset;
 	int nsectors;
 	int sectorsize = sparity->sctx->dev_root->sectorsize;
 
@@ -2612,8 +2612,8 @@ static int get_raid56_logic_offset(u64 physical, int num,
 	int j = 0;
 	u64 stripe_nr;
 	u64 last_offset;
-	int stripe_index;
-	int rot;
+	u32 stripe_index;
+	u32 rot;
 
 	last_offset = (physical - map->stripes[num].physical) *
 		      nr_data_stripes(map);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e79eee9c127e..4b8917ee1fdf 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4954,7 +4954,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	u64 stripe_nr_orig;
 	u64 stripe_nr_end;
 	u64 stripe_len;
-	int stripe_index;
+	u32 stripe_index;
 	int i;
 	int ret = 0;
 	int num_stripes;
@@ -5172,7 +5172,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
-		int factor = map->num_stripes / map->sub_stripes;
+		u32 factor = map->num_stripes / map->sub_stripes;
 
 		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
 		stripe_index *= map->sub_stripes;
@@ -5263,7 +5263,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	    need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
 	    mirror_num > 1)) {
 		u64 tmp;
-		int i, rot;
+		int i;
+		unsigned rot;
 
 		bbio->raid_map = (u64 *)((void *)bbio->stripes +
 				 sizeof(struct btrfs_bio_stripe) *
@@ -5286,8 +5287,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	}
 
 	if (rw & REQ_DISCARD) {
-		int factor = 0;
-		int sub_stripes = 0;
+		u32 factor = 0;
+		u32 sub_stripes = 0;
 		u64 stripes_per_dev = 0;
 		u32 remaining_stripes = 0;
 		u32 last_stripe = 0;

From 853d8ec4b288ce52b49e12beeef0f3659ac423ce Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Thu, 8 Jan 2015 15:15:19 +0100
Subject: [PATCH 26/93] btrfs: need_resched not needed with cond_resched

Cleanup, no special reason to do

if (need_resched())
        cond_resched();

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/volumes.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8222f6f74147..50bb7b8b17e8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,8 +366,8 @@ static noinline void run_scheduled_bios(struct btrfs_device *device)
 		btrfsic_submit_bio(cur->bi_rw, cur);
 		num_run++;
 		batch_run++;
-		if (need_resched())
-			cond_resched();
+
+		cond_resched();
 
 		/*
 		 * we made progress, there is more work to do and the bdi
@@ -400,8 +400,7 @@ static noinline void run_scheduled_bios(struct btrfs_device *device)
 				 * against it before looping
 				 */
 				last_waited = ioc->last_waited;
-				if (need_resched())
-					cond_resched();
+				cond_resched();
 				continue;
 			}
 			spin_lock(&device->io_lock);

From 351810c1d2aafa288af61844d877941d516fb031 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Thu, 8 Jan 2015 15:20:54 +0100
Subject: [PATCH 27/93] btrfs: use cond_resched_lock where possible

Clean the opencoded variant, cond_resched_lock also checks the lock for
contention so it might help in some cases that were not covered by
simple need_resched().

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/file.c             |  6 +-----
 fs/btrfs/free-space-cache.c | 14 ++++----------
 fs/btrfs/transaction.c      |  7 ++-----
 3 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4090259569b..1e34bc00249f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -273,11 +273,7 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
 		defrag = rb_entry(node, struct inode_defrag, rb_node);
 		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 
-		if (need_resched()) {
-			spin_unlock(&fs_info->defrag_inodes_lock);
-			cond_resched();
-			spin_lock(&fs_info->defrag_inodes_lock);
-		}
+		cond_resched_lock(&fs_info->defrag_inodes_lock);
 
 		node = rb_first(&fs_info->defrag_inodes);
 	}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a71978578fa7..96611fed552f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2402,11 +2402,8 @@ static void __btrfs_remove_free_space_cache_locked(
 		} else {
 			free_bitmap(ctl, info);
 		}
-		if (need_resched()) {
-			spin_unlock(&ctl->tree_lock);
-			cond_resched();
-			spin_lock(&ctl->tree_lock);
-		}
+
+		cond_resched_lock(&ctl->tree_lock);
 	}
 }
 
@@ -2431,11 +2428,8 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
 
 		WARN_ON(cluster->block_group != block_group);
 		__btrfs_return_cluster_to_free_space(block_group, cluster);
-		if (need_resched()) {
-			spin_unlock(&ctl->tree_lock);
-			cond_resched();
-			spin_lock(&ctl->tree_lock);
-		}
+
+		cond_resched_lock(&ctl->tree_lock);
 	}
 	__btrfs_remove_free_space_cache_locked(ctl);
 	spin_unlock(&ctl->tree_lock);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 038fcf6051e0..efc5ebffa7ea 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -93,11 +93,8 @@ static void clear_btree_io_tree(struct extent_io_tree *tree)
 		 */
 		ASSERT(!waitqueue_active(&state->wq));
 		free_extent_state(state);
-		if (need_resched()) {
-			spin_unlock(&tree->lock);
-			cond_resched();
-			spin_lock(&tree->lock);
-		}
+
+		cond_resched_lock(&tree->lock);
 	}
 	spin_unlock(&tree->lock);
 }

From f8c269d7223f6b63cc5936eb191bc3b170d24342 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 16 Jan 2015 17:21:12 +0100
Subject: [PATCH 28/93] btrfs: cleanup 64bit/32bit divs, compile time constants

Switch to div_u64 if the divisor is a numeric constant or sum of
sizeof()s. We can remove a few instances of do_div that has the hidden
semtantics of changing the 1st argument.

Small power-of-two divisors are converted to bitshifts, large values are
kept intact for clarity.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/dev-replace.c      |  6 +++---
 fs/btrfs/disk-io.c          |  2 +-
 fs/btrfs/extent-tree.c      | 15 +++++++--------
 fs/btrfs/free-space-cache.c |  8 ++++----
 fs/btrfs/math.h             |  6 ++----
 fs/btrfs/super.c            |  2 +-
 6 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 5ec03d999c37..0573848c7333 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -670,8 +670,8 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 		srcdev = dev_replace->srcdev;
-		args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
-			div64_u64(btrfs_device_get_total_bytes(srcdev), 1000));
+		args->status.progress_1000 = div_u64(dev_replace->cursor_left,
+			div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
 		break;
 	}
 	btrfs_dev_replace_unlock(dev_replace);
@@ -806,7 +806,7 @@ static int btrfs_dev_replace_kthread(void *data)
 		btrfs_dev_replace_status(fs_info, status_args);
 		progress = status_args->status.progress_1000;
 		kfree(status_args);
-		do_div(progress, 10);
+		progress = div_u64(progress, 10);
 		printk_in_rcu(KERN_INFO
 			"BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
 			dev_replace->srcdev->missing ? "<missing disk>" :
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 41b320e235d7..73b152138221 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2277,7 +2277,7 @@ int open_ctree(struct super_block *sb,
 	fs_info->free_chunk_space = 0;
 	fs_info->tree_mod_log = RB_ROOT;
 	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
-	fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64);
+	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
 	/* readahead state */
 	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
 	spin_lock_init(&fs_info->reada_lock);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 28ce5c8004d4..2cb32bc45bcc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2561,8 +2561,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		 */
 		spin_lock(&delayed_refs->lock);
 		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
-		avg = div64_u64(avg, 4);
-		fs_info->avg_delayed_ref_runtime = avg;
+		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
 		spin_unlock(&delayed_refs->lock);
 	}
 	return 0;
@@ -2624,7 +2623,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
 	 * We don't ever fill up leaves all the way so multiply by 2 just to be
 	 * closer to what we're really going to want to ouse.
 	 */
-	return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
 }
 
 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
@@ -3193,7 +3192,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
 	struct inode *inode = NULL;
 	u64 alloc_hint = 0;
 	int dcs = BTRFS_DC_ERROR;
-	int num_pages = 0;
+	u64 num_pages = 0;
 	int retries = 0;
 	int ret = 0;
 
@@ -3277,7 +3276,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
 	 * taking up quite a bit since it's not folded into the other space
 	 * cache.
 	 */
-	num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
+	num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
 	if (!num_pages)
 		num_pages = 1;
 
@@ -4770,10 +4769,10 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
 
 	num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
 		    csum_size * 2;
-	num_bytes += div64_u64(data_used + meta_used, 50);
+	num_bytes += div_u64(data_used + meta_used, 50);
 
 	if (num_bytes * 3 > meta_used)
-		num_bytes = div64_u64(meta_used, 3);
+		num_bytes = div_u64(meta_used, 3);
 
 	return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
 }
@@ -5039,7 +5038,7 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
 	else
 		BTRFS_I(inode)->csum_bytes -= num_bytes;
 	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
-	num_csums_per_leaf = (int)div64_u64(csum_size,
+	num_csums_per_leaf = (int)div_u64(csum_size,
 					    sizeof(struct btrfs_csum_item) +
 					    sizeof(struct btrfs_disk_key));
 	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 96611fed552f..cb84923561e4 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1537,7 +1537,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 		max_bytes = MAX_CACHE_BYTES_PER_GIG;
 	else
 		max_bytes = MAX_CACHE_BYTES_PER_GIG *
-			div64_u64(size, 1024 * 1024 * 1024);
+			div_u64(size, 1024 * 1024 * 1024);
 
 	/*
 	 * we want to account for 1 more bitmap than what we have so we can make
@@ -1552,14 +1552,14 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 	}
 
 	/*
-	 * we want the extent entry threshold to always be at most 1/2 the maxw
+	 * we want the extent entry threshold to always be at most 1/2 the max
 	 * bytes we can have, or whatever is less than that.
 	 */
 	extent_bytes = max_bytes - bitmap_bytes;
-	extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
+	extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
 
 	ctl->extents_thresh =
-		div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
+		div_u64(extent_bytes, sizeof(struct btrfs_free_space));
 }
 
 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
index b7816cefbd13..1b10a3cd1195 100644
--- a/fs/btrfs/math.h
+++ b/fs/btrfs/math.h
@@ -28,8 +28,7 @@ static inline u64 div_factor(u64 num, int factor)
 	if (factor == 10)
 		return num;
 	num *= factor;
-	do_div(num, 10);
-	return num;
+	return div_u64(num, 10);
 }
 
 static inline u64 div_factor_fine(u64 num, int factor)
@@ -37,8 +36,7 @@ static inline u64 div_factor_fine(u64 num, int factor)
 	if (factor == 100)
 		return num;
 	num *= factor;
-	do_div(num, 100);
-	return num;
+	return div_u64(num, 100);
 }
 
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 100a0442c413..530a0baa7c71 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1704,7 +1704,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 		avail_space = device->total_bytes - device->bytes_used;
 
 		/* align with stripe_len */
-		do_div(avail_space, BTRFS_STRIPE_LEN);
+		avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN);
 		avail_space *= BTRFS_STRIPE_LEN;
 
 		/*

From 31e818fe7375d60de9953051f7bd1615cebc3681 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 20 Feb 2015 18:00:26 +0100
Subject: [PATCH 29/93] btrfs: cleanup, use kmalloc_array/kcalloc array helpers

Convert kmalloc(nr * size, ..) to kmalloc_array that does additional
overflow checks, the zeroing variant is kcalloc.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/check-integrity.c  | 4 ++--
 fs/btrfs/compression.c      | 2 +-
 fs/btrfs/ctree.c            | 9 ++++-----
 fs/btrfs/disk-io.c          | 2 +-
 fs/btrfs/file-item.c        | 4 ++--
 fs/btrfs/file.c             | 2 +-
 fs/btrfs/free-space-cache.c | 2 +-
 fs/btrfs/inode.c            | 2 +-
 fs/btrfs/raid56.c           | 3 +--
 fs/btrfs/scrub.c            | 5 ++---
 fs/btrfs/volumes.c          | 4 ++--
 11 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index d897ef803b3b..089d6fac01a7 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2990,8 +2990,8 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
 			       (unsigned long long)bio->bi_iter.bi_sector,
 			       dev_bytenr, bio->bi_bdev);
 
-		mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
-				       GFP_NOFS);
+		mapped_datav = kmalloc_array(bio->bi_vcnt,
+					     sizeof(*mapped_datav), GFP_NOFS);
 		if (!mapped_datav)
 			goto leave;
 		cur_bytenr = dev_bytenr;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e9df8862012c..ecacb7a2d422 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -622,7 +622,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	cb->orig_bio = bio;
 
 	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
-	cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
+	cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
 				       GFP_NOFS);
 	if (!cb->compressed_pages)
 		goto fail1;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 993642199326..8595fdd50a22 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -578,7 +578,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 	if (!tree_mod_need_log(fs_info, eb))
 		return 0;
 
-	tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
+	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
 	if (!tm_list)
 		return -ENOMEM;
 
@@ -677,7 +677,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 
 	if (log_removal && btrfs_header_level(old_root) > 0) {
 		nritems = btrfs_header_nritems(old_root);
-		tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
+		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
 				  flags);
 		if (!tm_list) {
 			ret = -ENOMEM;
@@ -814,7 +814,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
 		return 0;
 
-	tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
+	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
 			  GFP_NOFS);
 	if (!tm_list)
 		return -ENOMEM;
@@ -905,8 +905,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 		return 0;
 
 	nritems = btrfs_header_nritems(eb);
-	tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
-			  GFP_NOFS);
+	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
 	if (!tm_list)
 		return -ENOMEM;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 73b152138221..14e4a3e0f9b2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -302,7 +302,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 		offset += cur_len;
 	}
 	if (csum_size > sizeof(inline_result)) {
-		result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
+		result = kzalloc(csum_size, GFP_NOFS);
 		if (!result)
 			return 1;
 	} else {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 84a2d1868271..3a4a7924fa38 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -185,8 +185,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 	nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
 	if (!dst) {
 		if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
-			btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size,
-							    GFP_NOFS);
+			btrfs_bio->csum_allocated = kmalloc_array(nblocks,
+					csum_size, GFP_NOFS);
 			if (!btrfs_bio->csum_allocated) {
 				btrfs_free_path(path);
 				return -ENOMEM;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1e34bc00249f..e74abb3018d4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1481,7 +1481,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 			PAGE_CACHE_SIZE / (sizeof(struct page *)));
 	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
 	nrptrs = max(nrptrs, 8);
-	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
+	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
 	if (!pages)
 		return -ENOMEM;
 
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cb84923561e4..6e5d0abec6f9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -298,7 +298,7 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
 
 	memset(io_ctl, 0, sizeof(struct io_ctl));
 
-	io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
+	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
 	if (!io_ctl->pages)
 		return -ENOMEM;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8564d8ce03de..21e4affafa3e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -463,7 +463,7 @@ static noinline void compress_file_range(struct inode *inode,
 	 */
 	if (inode_need_compress(inode)) {
 		WARN_ON(pages);
-		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
 		if (!pages) {
 			/* just bail out to the uncompressed code */
 			goto cont;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 5264858ed768..b4634c30981e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1807,8 +1807,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 	int err;
 	int i;
 
-	pointers = kzalloc(rbio->real_stripes * sizeof(void *),
-			   GFP_NOFS);
+	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
 	if (!pointers) {
 		err = -ENOMEM;
 		goto cleanup_io;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index db21f17df996..5a7d63cd9c25 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -964,9 +964,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	 * the statistics.
 	 */
 
-	sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
-				     sizeof(*sblocks_for_recheck),
-				     GFP_NOFS);
+	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
+				      sizeof(*sblocks_for_recheck), GFP_NOFS);
 	if (!sblocks_for_recheck) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.malloc_errors++;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 50bb7b8b17e8..5b77982ab6b1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4288,7 +4288,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
 			     max_chunk_size);
 
-	devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
 			       GFP_NOFS);
 	if (!devices_info)
 		return -ENOMEM;
@@ -5542,7 +5542,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 		rmap_len = map->stripe_len * nr_data_stripes(map);
 	}
 
-	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
 	BUG_ON(!buf); /* -ENOMEM */
 
 	for (i = 0; i < map->num_stripes; i++) {

From f64c7b12f86c638f13e19de08eeb8cf888dff8f6 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Tue, 24 Feb 2015 19:07:26 +0100
Subject: [PATCH 30/93] btrfs: remove shadowing variables in
 __btrfs_buffered_write

There are lockstart and lockend defined in the function and not used
after their duplicate definition scope ends, it's safe to reuse them.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e74abb3018d4..a3c2bd77c74b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1631,8 +1631,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 			btrfs_end_write_no_snapshoting(root);
 
 		if (only_release_metadata && copied > 0) {
-			u64 lockstart = round_down(pos, root->sectorsize);
-			u64 lockend = lockstart +
+			lockstart = round_down(pos, root->sectorsize);
+			lockend = lockstart +
 				(dirty_pages << PAGE_CACHE_SHIFT) - 1;
 
 			set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,

From 3284da7b7b585e6e8e98f374a51d234d14c7a0a2 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Wed, 25 Feb 2015 15:47:32 +0100
Subject: [PATCH 31/93] btrfs: use explicit initializer for seq_elem

Using {} as initializer for struct seq_elem does not properly initialize
the list_head member, but it currently works because it gets set through
btrfs_get_tree_mod_seq if 'seq' is 0.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/backref.c | 4 ++--
 fs/btrfs/ctree.h   | 2 ++
 fs/btrfs/ioctl.c   | 2 +-
 fs/btrfs/qgroup.c  | 6 +++---
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f55721ff9385..9de772ee0031 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1206,7 +1206,7 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
 	struct ulist *roots = NULL;
 	struct ulist_iterator uiter;
 	struct ulist_node *node;
-	struct seq_list elem = {};
+	struct seq_list elem = SEQ_LIST_INIT(elem);
 	int ret = 0;
 
 	tmp = ulist_alloc(GFP_NOFS);
@@ -1610,7 +1610,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	struct ulist *roots = NULL;
 	struct ulist_node *ref_node = NULL;
 	struct ulist_node *root_node = NULL;
-	struct seq_list tree_mod_seq_elem = {};
+	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
 	struct ulist_iterator ref_uiter;
 	struct ulist_iterator root_uiter;
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b3dd55f52f71..36e009ebab47 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1328,6 +1328,8 @@ struct seq_list {
 	u64 seq;
 };
 
+#define SEQ_LIST_INIT(name)	{ .list = LIST_HEAD_INIT((name).list), .seq = 0 }
+
 enum btrfs_orphan_cleanup_state {
 	ORPHAN_CLEANUP_STARTED	= 1,
 	ORPHAN_CLEANUP_DONE	= 2,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d49fe8a0f6b5..0a9fe214deeb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3039,7 +3039,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
 static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		     u64 disko)
 {
-	struct seq_list tree_mod_seq_elem = {};
+	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
 	struct ulist *roots;
 	struct ulist_iterator uiter;
 	struct ulist_node *root_node = NULL;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 97159a8e91d4..7f7eb399a347 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1845,7 +1845,7 @@ static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
 	struct ulist *roots = NULL;
 	struct ulist *qgroups, *tmp;
 	struct btrfs_qgroup *qgroup;
-	struct seq_list elem = {};
+	struct seq_list elem = SEQ_LIST_INIT(elem);
 	u64 seq;
 	int old_roots = 0;
 	int new_roots = 0;
@@ -1967,7 +1967,7 @@ static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
 	int err;
 	struct btrfs_qgroup *qg;
 	u64 root_obj = 0;
-	struct seq_list elem = {};
+	struct seq_list elem = SEQ_LIST_INIT(elem);
 
 	parents = ulist_alloc(GFP_NOFS);
 	if (!parents)
@@ -2522,7 +2522,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 {
 	struct btrfs_key found;
 	struct ulist *roots = NULL;
-	struct seq_list tree_mod_seq_elem = {};
+	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
 	u64 num_bytes;
 	u64 seq;
 	int new_roots;

From b8b93addde1e0192b045da8995e296fc1e40c80f Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 16 Jan 2015 17:26:13 +0100
Subject: [PATCH 32/93] btrfs: cleanup 64bit/32bit divs, provably bounded
 values

The divisor is derived from nodesize or PAGE_SIZE, fits into 32bit type.
Get rid of a few more do_div instances.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/extent-tree.c      |  4 ++--
 fs/btrfs/free-space-cache.c | 10 +++++-----
 fs/btrfs/ioctl.c            |  2 +-
 fs/btrfs/scrub.c            |  8 +++-----
 fs/btrfs/volumes.c          | 26 +++++++++++++-------------
 5 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2cb32bc45bcc..4ccc3397c644 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5032,7 +5032,7 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
 	    BTRFS_I(inode)->csum_bytes == 0)
 		return 0;
 
-	old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+	old_csums = (int)div_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
 	if (reserve)
 		BTRFS_I(inode)->csum_bytes += num_bytes;
 	else
@@ -5041,7 +5041,7 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
 	num_csums_per_leaf = (int)div_u64(csum_size,
 					    sizeof(struct btrfs_csum_item) +
 					    sizeof(struct btrfs_disk_key));
-	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+	num_csums = (int)div_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
 	num_csums = num_csums + num_csums_per_leaf - 1;
 	num_csums = num_csums / num_csums_per_leaf;
 
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6e5d0abec6f9..13c9b46d4727 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1298,11 +1298,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
 				   u64 offset)
 {
 	u64 bitmap_start;
-	u64 bytes_per_bitmap;
+	u32 bytes_per_bitmap;
 
 	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
 	bitmap_start = offset - ctl->start;
-	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
+	bitmap_start = div_u64(bitmap_start, bytes_per_bitmap);
 	bitmap_start *= bytes_per_bitmap;
 	bitmap_start += ctl->start;
 
@@ -1521,10 +1521,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 	u64 bitmap_bytes;
 	u64 extent_bytes;
 	u64 size = block_group->key.offset;
-	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
-	int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
+	u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
+	u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg);
 
-	max_bitmaps = max(max_bitmaps, 1);
+	max_bitmaps = max_t(u32, max_bitmaps, 1);
 
 	ASSERT(ctl->total_bitmaps <= max_bitmaps);
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0a9fe214deeb..4a5524c2713a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1564,7 +1564,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 		goto out_free;
 	}
 
-	do_div(new_size, root->sectorsize);
+	new_size = div_u64(new_size, root->sectorsize);
 	new_size *= root->sectorsize;
 
 	printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 5a7d63cd9c25..293262163daf 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2623,9 +2623,8 @@ static int get_raid56_logic_offset(u64 physical, int num,
 	for (i = 0; i < nr_data_stripes(map); i++) {
 		*offset = last_offset + i * map->stripe_len;
 
-		stripe_nr = *offset;
-		do_div(stripe_nr, map->stripe_len);
-		do_div(stripe_nr, nr_data_stripes(map));
+		stripe_nr = div_u64(*offset, map->stripe_len);
+		stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
 
 		/* Work out the disk rotation on this stripe-set */
 		rot = do_div(stripe_nr, map->num_stripes);
@@ -2994,10 +2993,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	int extent_mirror_num;
 	int stop_loop = 0;
 
-	nstripes = length;
 	physical = map->stripes[num].physical;
 	offset = 0;
-	do_div(nstripes, map->stripe_len);
+	nstripes = div_u64(length, map->stripe_len);
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 		offset = map->stripe_len * num;
 		increment = map->stripe_len * map->num_stripes;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5b77982ab6b1..d712c46c0a6b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3021,7 +3021,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
 
 		stripe_offset = btrfs_stripe_offset(leaf, stripe);
 		stripe_length = btrfs_chunk_length(leaf, chunk);
-		do_div(stripe_length, factor);
+		stripe_length = div_u64(stripe_length, factor);
 
 		if (stripe_offset < bargs->pend &&
 		    stripe_offset + stripe_length > bargs->pstart)
@@ -4399,8 +4399,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	 */
 	if (stripe_size * data_stripes > max_chunk_size) {
 		u64 mask = (1ULL << 24) - 1;
-		stripe_size = max_chunk_size;
-		do_div(stripe_size, data_stripes);
+
+		stripe_size = div_u64(max_chunk_size, data_stripes);
 
 		/* bump the answer up to a 16MB boundary */
 		stripe_size = (stripe_size + mask) & ~mask;
@@ -4412,10 +4412,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			stripe_size = devices_info[ndevs-1].max_avail;
 	}
 
-	do_div(stripe_size, dev_stripes);
+	stripe_size = div_u64(stripe_size, dev_stripes);
 
 	/* align to BTRFS_STRIPE_LEN */
-	do_div(stripe_size, raid_stripe_len);
+	stripe_size = div_u64(stripe_size, raid_stripe_len);
 	stripe_size *= raid_stripe_len;
 
 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -5135,7 +5135,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	stripe_index = 0;
 	stripe_nr_orig = stripe_nr;
 	stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
-	do_div(stripe_nr_end, map->stripe_len);
+	stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len);
 	stripe_end_offset = stripe_nr_end * map->stripe_len -
 			    (offset + *length);
 
@@ -5197,8 +5197,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		    ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
 		     mirror_num > 1)) {
 			/* push stripe_nr back to the start of the full stripe */
-			stripe_nr = raid56_full_stripe_start;
-			do_div(stripe_nr, stripe_len * nr_data_stripes(map));
+			stripe_nr = div_u64(raid56_full_stripe_start,
+					stripe_len * nr_data_stripes(map));
 
 			/* RAID[56] write or recovery. Return all stripes */
 			num_stripes = map->num_stripes;
@@ -5534,11 +5534,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	rmap_len = map->stripe_len;
 
 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
-		do_div(length, map->num_stripes / map->sub_stripes);
+		length = div_u64(length, map->num_stripes / map->sub_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
-		do_div(length, map->num_stripes);
+		length = div_u64(length, map->num_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		do_div(length, nr_data_stripes(map));
+		length = div_u64(length, nr_data_stripes(map));
 		rmap_len = map->stripe_len * nr_data_stripes(map);
 	}
 
@@ -5553,11 +5553,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 			continue;
 
 		stripe_nr = physical - map->stripes[i].physical;
-		do_div(stripe_nr, map->stripe_len);
+		stripe_nr = div_u64(stripe_nr, map->stripe_len);
 
 		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 			stripe_nr = stripe_nr * map->num_stripes + i;
-			do_div(stripe_nr, map->sub_stripes);
+			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 			stripe_nr = stripe_nr * map->num_stripes + i;
 		} /* else if RAID[56], multiply by nr_data_stripes().

From 47c5713f4737e460a3b2535abb8ae2e2afe2d2d0 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 20 Feb 2015 18:43:47 +0100
Subject: [PATCH 33/93] btrfs: replace remaining do_div calls with div_u64
 variants

Switch to div_u64_rem that does type checking and has more obvious
semantics than do_div.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/extent-tree.c      |  2 +-
 fs/btrfs/free-space-cache.c |  2 +-
 fs/btrfs/scrub.c            |  4 ++--
 fs/btrfs/volumes.c          | 31 ++++++++++++++++---------------
 4 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4ccc3397c644..5840afe5e5f9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8670,7 +8670,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 		min_free <<= 1;
 	} else if (index == BTRFS_RAID_RAID0) {
 		dev_min = fs_devices->rw_devices;
-		do_div(min_free, dev_min);
+		min_free = div64_u64(min_free, dev_min);
 	}
 
 	/* We need to do this so that we can look at pending chunks */
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 13c9b46d4727..764528a4f6fd 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1673,7 +1673,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
 		 */
 		if (*bytes >= align) {
 			tmp = entry->offset - ctl->start + align - 1;
-			do_div(tmp, align);
+			tmp = div64_u64(tmp, align);
 			tmp = tmp * align + ctl->start;
 			align_off = tmp - entry->offset;
 		} else {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 293262163daf..c21581e0be7a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2328,7 +2328,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
 	}
 
 	start -= sparity->logic_start;
-	offset = (int)do_div(start, sparity->stripe_len);
+	start = div_u64_rem(start, sparity->stripe_len, &offset);
 	offset /= sectorsize;
 	nsectors = (int)len / sectorsize;
 
@@ -2627,7 +2627,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
 		stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
 
 		/* Work out the disk rotation on this stripe-set */
-		rot = do_div(stripe_nr, map->num_stripes);
+		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
 		/* calculate which stripe this data locates */
 		rot += i;
 		stripe_index = rot % map->num_stripes;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d712c46c0a6b..bf38ed09810a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4994,7 +4994,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	 * stripe_nr counts the total number of stripes we have to stride
 	 * to get to this block
 	 */
-	do_div(stripe_nr, stripe_len);
+	stripe_nr = div64_u64(stripe_nr, stripe_len);
 
 	stripe_offset = stripe_nr * stripe_len;
 	BUG_ON(offset < stripe_offset);
@@ -5010,7 +5010,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		/* allow a write of a full stripe, but make sure we don't
 		 * allow straddling of stripes
 		 */
-		do_div(raid56_full_stripe_start, full_stripe_len);
+		raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
+				full_stripe_len);
 		raid56_full_stripe_start *= full_stripe_len;
 	}
 
@@ -5143,7 +5144,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		if (rw & REQ_DISCARD)
 			num_stripes = min_t(u64, map->num_stripes,
 					    stripe_nr_end - stripe_nr_orig);
-		stripe_index = do_div(stripe_nr, map->num_stripes);
+		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+				&stripe_index);
 		if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
 			mirror_num = 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
@@ -5171,7 +5173,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
 
-		stripe_index = do_div(stripe_nr, factor);
+		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
 		stripe_index *= map->sub_stripes;
 
 		if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
@@ -5208,32 +5210,32 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			stripe_index = 0;
 			stripe_offset = 0;
 		} else {
-			u64 tmp;
-
 			/*
 			 * Mirror #0 or #1 means the original data block.
 			 * Mirror #2 is RAID5 parity block.
 			 * Mirror #3 is RAID6 Q block.
 			 */
-			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+			stripe_nr = div_u64_rem(stripe_nr,
+					nr_data_stripes(map), &stripe_index);
 			if (mirror_num > 1)
 				stripe_index = nr_data_stripes(map) +
 						mirror_num - 2;
 
 			/* We distribute the parity blocks across stripes */
-			tmp = stripe_nr + stripe_index;
-			stripe_index = do_div(tmp, map->num_stripes);
+			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
+					&stripe_index);
 			if (!(rw & (REQ_WRITE | REQ_DISCARD |
 				    REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
 				mirror_num = 1;
 		}
 	} else {
 		/*
-		 * after this do_div call, stripe_nr is the number of stripes
-		 * on this device we have to walk to find the data, and
-		 * stripe_index is the number of our device in the stripe array
+		 * after this, stripe_nr is the number of stripes on this
+		 * device we have to walk to find the data, and stripe_index is
+		 * the number of our device in the stripe array
 		 */
-		stripe_index = do_div(stripe_nr, map->num_stripes);
+		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+				&stripe_index);
 		mirror_num = stripe_index + 1;
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
@@ -5268,8 +5270,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 				 sizeof(int) * tgtdev_indexes);
 
 		/* Work out the disk rotation on this stripe-set */
-		tmp = stripe_nr;
-		rot = do_div(tmp, num_stripes);
+		div_u64_rem(stripe_nr, num_stripes, &rot);
 
 		/* Fill in the logical address of each stripe */
 		tmp = stripe_nr * nr_data_stripes(map);

From 9d644a623ec48e28ca3887e616456aba63fd0558 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 20 Feb 2015 18:42:11 +0100
Subject: [PATCH 34/93] btrfs: cleanup, use correct type in div_u64_rem

div_u64_rem expects u32 for divisior and reminder.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/scrub.c   |  6 +++---
 fs/btrfs/volumes.c | 11 ++++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index c21581e0be7a..0d1414659bc1 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2318,7 +2318,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
 				       unsigned long *bitmap,
 				       u64 start, u64 len)
 {
-	int offset;
+	u32 offset;
 	int nsectors;
 	int sectorsize = sparity->sctx->dev_root->sectorsize;
 
@@ -2611,8 +2611,8 @@ static int get_raid56_logic_offset(u64 physical, int num,
 	int j = 0;
 	u64 stripe_nr;
 	u64 last_offset;
-	int stripe_index;
-	int rot;
+	u32 stripe_index;
+	u32 rot;
 
 	last_offset = (physical - map->stripes[num].physical) *
 		      nr_data_stripes(map);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bf38ed09810a..3df83539f532 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4953,7 +4953,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	u64 stripe_nr_orig;
 	u64 stripe_nr_end;
 	u64 stripe_len;
-	int stripe_index;
+	u32 stripe_index;
 	int i;
 	int ret = 0;
 	int num_stripes;
@@ -5171,7 +5171,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
-		int factor = map->num_stripes / map->sub_stripes;
+		u32 factor = map->num_stripes / map->sub_stripes;
 
 		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
 		stripe_index *= map->sub_stripes;
@@ -5262,7 +5262,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	    need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
 	    mirror_num > 1)) {
 		u64 tmp;
-		int i, rot;
+		int i;
+		unsigned rot;
 
 		bbio->raid_map = (u64 *)((void *)bbio->stripes +
 				 sizeof(struct btrfs_bio_stripe) *
@@ -5285,8 +5286,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	}
 
 	if (rw & REQ_DISCARD) {
-		int factor = 0;
-		int sub_stripes = 0;
+		u32 factor = 0;
+		u32 sub_stripes = 0;
 		u64 stripes_per_dev = 0;
 		u32 remaining_stripes = 0;
 		u32 last_stripe = 0;

From 093adbcedf123f366e5eef0c4ccd815920f725f3 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Tue, 24 Feb 2015 19:40:41 +0100
Subject: [PATCH 35/93] btrfs: switch helper macros to static inlines in
 sysfs.h

The conversion macros use nested container_of that leads to a warning

fs/btrfs/sysfs.c: In function 'btrfs_feature_visible':
fs/btrfs/sysfs.c:183:8: warning: declaration of '__mptr' shadows a previous local
fs/btrfs/sysfs.c:183:8: warning: shadowed declaration is here

Use of functions will add proper type checking.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.h | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f7dd298b3cf6..3a4bbed723fd 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -61,11 +61,23 @@ static struct btrfs_feature_attr btrfs_attr_##_name = {			     \
 	BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
 
 /* convert from attribute */
-#define to_btrfs_feature_attr(a) \
-			container_of(a, struct btrfs_feature_attr, kobj_attr)
-#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr)
-#define attr_to_btrfs_feature_attr(a) \
-			to_btrfs_feature_attr(attr_to_btrfs_attr(a))
+static inline struct btrfs_feature_attr *
+to_btrfs_feature_attr(struct kobj_attribute *a)
+{
+	return container_of(a, struct btrfs_feature_attr, kobj_attr);
+}
+
+static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr)
+{
+	return container_of(attr, struct kobj_attribute, attr);
+}
+
+static inline struct btrfs_feature_attr *
+attr_to_btrfs_feature_attr(struct attribute *attr)
+{
+	return to_btrfs_feature_attr(attr_to_btrfs_attr(attr));
+}
+
 char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
 extern const char * const btrfs_feature_set_names[3];
 extern struct kobj_type space_info_ktype;

From 258ece02126a67af263746e1ae5f8ddf0d492e14 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Tue, 24 Feb 2015 19:45:15 +0100
Subject: [PATCH 36/93] btrfs: remove shadowing variables in __btrfs_map_block

1) We can safely use the function's 'i'. Fixes warning

fs/btrfs/volumes.c:5257:7: warning: declaration of 'i' shadows a previous local
fs/btrfs/volumes.c:4951:6: warning: shadowed declaration is here

2) A local variable duplicates name of an argument, we can use the value
directly. Fixes warning

fs/btrfs/volumes.c:5433:8: warning: declaration of 'length' shadows a parameter
fs/btrfs/volumes.c:4935:27: warning: shadowed declaration is here

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/volumes.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3df83539f532..64ec2fd624da 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5262,7 +5262,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	    need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
 	    mirror_num > 1)) {
 		u64 tmp;
-		int i;
 		unsigned rot;
 
 		bbio->raid_map = (u64 *)((void *)bbio->stripes +
@@ -5438,9 +5437,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			}
 		}
 		if (found) {
-			u64 length = map->stripe_len;
-
-			if (physical_of_found + length <=
+			if (physical_of_found + map->stripe_len <=
 			    dev_replace->cursor_left) {
 				struct btrfs_bio_stripe *tgtdev_stripe =
 					bbio->stripes + num_stripes;

From 4f764e5153616fccaed38e7524d6f5f89a49a060 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 23 Feb 2015 19:53:35 +0000
Subject: [PATCH 37/93] Btrfs: remove deleted xattrs on fsync log replay

If we deleted xattrs from a file and fsynced the file, after a log replay
the xattrs would remain associated to the file. This was an unexpected
behaviour and differs from what other filesystems do, such as for example
xfs and ext3/4.

Fix this by, on fsync log replay, check if every xattr in the fs/subvol
tree (that belongs to a logged inode) has a matching xattr in the log,
and if it does not, delete it from the fs/subvol tree. This is a similar
approach to what we do for dentries when we replay a directory from the
fsync log.

This issue is trivial to reproduce, and the following excerpt from my
test for xfstests triggers the issue:

  _crash_and_mount()
  {
       # Simulate a crash/power loss.
       _load_flakey_table $FLAKEY_DROP_WRITES
       _unmount_flakey
       _load_flakey_table $FLAKEY_ALLOW_WRITES
       _mount_flakey
  }

  rm -f $seqres.full

  _scratch_mkfs >> $seqres.full 2>&1
  _init_flakey
  _mount_flakey

  # Create out test file and add 3 xattrs to it.
  touch $SCRATCH_MNT/foobar
  $SETFATTR_PROG -n user.attr1 -v val1 $SCRATCH_MNT/foobar
  $SETFATTR_PROG -n user.attr2 -v val2 $SCRATCH_MNT/foobar
  $SETFATTR_PROG -n user.attr3 -v val3 $SCRATCH_MNT/foobar

  # Make sure everything is durably persisted.
  sync

  # Now delete the second xattr and fsync the inode.
  $SETFATTR_PROG -x user.attr2 $SCRATCH_MNT/foobar
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foobar

  _crash_and_mount

  # After the fsync log is replayed, the file should have only 2 xattrs, the ones
  # named user.attr1 and user.attr3. The btrfs fsync log replay bug left the file
  # with the 3 xattrs that we had before deleting the second one and fsyncing the
  # file.
  echo "xattr names and values after first fsync log replay:"
  $GETFATTR_PROG --absolute-names --dump $SCRATCH_MNT/foobar | _filter_scratch

  # Now write some data to our file, fsync it, remove the first xattr, add a new
  # hard link to our file and commit the fsync log by fsyncing some other new
  # file. This is to verify that after log replay our first xattr does not exist
  # anymore.
  echo "hello world!" >> $SCRATCH_MNT/foobar
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foobar
  $SETFATTR_PROG -x user.attr1 $SCRATCH_MNT/foobar
  ln $SCRATCH_MNT/foobar $SCRATCH_MNT/foobar_link
  touch $SCRATCH_MNT/qwerty
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/qwerty

  _crash_and_mount

  # Now only the xattr with name user.attr3 should be set in our file.
  echo "xattr names and values after second fsync log replay:"
  $GETFATTR_PROG --absolute-names --dump $SCRATCH_MNT/foobar | _filter_scratch

  status=0
  exit

The expected golden output, which is produced with this patch applied or
when testing against xfs or ext3/4, is:

  xattr names and values after first fsync log replay:
  # file: SCRATCH_MNT/foobar
  user.attr1="val1"
  user.attr3="val3"

  xattr names and values after second fsync log replay:
  # file: SCRATCH_MNT/foobar
  user.attr3="val3"

Without this patch applied, the output is:

  xattr names and values after first fsync log replay:
  # file: SCRATCH_MNT/foobar
  user.attr1="val1"
  user.attr2="val2"
  user.attr3="val3"

  xattr names and values after second fsync log replay:
  # file: SCRATCH_MNT/foobar
  user.attr1="val1"
  user.attr2="val2"
  user.attr3="val3"

A patch with a test case for xfstests follows soon.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/tree-log.c | 123 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 109 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 066e754b1294..6c95159302dd 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1951,6 +1951,104 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct btrfs_root *log,
+			      struct btrfs_path *path,
+			      const u64 ino)
+{
+	struct btrfs_key search_key;
+	struct btrfs_path *log_path;
+	int i;
+	int nritems;
+	int ret;
+
+	log_path = btrfs_alloc_path();
+	if (!log_path)
+		return -ENOMEM;
+
+	search_key.objectid = ino;
+	search_key.type = BTRFS_XATTR_ITEM_KEY;
+	search_key.offset = 0;
+again:
+	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+process_leaf:
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	for (i = path->slots[0]; i < nritems; i++) {
+		struct btrfs_key key;
+		struct btrfs_dir_item *di;
+		struct btrfs_dir_item *log_di;
+		u32 total_size;
+		u32 cur;
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
+			ret = 0;
+			goto out;
+		}
+
+		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
+		total_size = btrfs_item_size_nr(path->nodes[0], i);
+		cur = 0;
+		while (cur < total_size) {
+			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
+			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
+			u32 this_len = sizeof(*di) + name_len + data_len;
+			char *name;
+
+			name = kmalloc(name_len, GFP_NOFS);
+			if (!name) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			read_extent_buffer(path->nodes[0], name,
+					   (unsigned long)(di + 1), name_len);
+
+			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
+						    name, name_len, 0);
+			btrfs_release_path(log_path);
+			if (!log_di) {
+				/* Doesn't exist in log tree, so delete it. */
+				btrfs_release_path(path);
+				di = btrfs_lookup_xattr(trans, root, path, ino,
+							name, name_len, -1);
+				kfree(name);
+				if (IS_ERR(di)) {
+					ret = PTR_ERR(di);
+					goto out;
+				}
+				ASSERT(di);
+				ret = btrfs_delete_one_dir_name(trans, root,
+								path, di);
+				if (ret)
+					goto out;
+				btrfs_release_path(path);
+				search_key = key;
+				goto again;
+			}
+			kfree(name);
+			if (IS_ERR(log_di)) {
+				ret = PTR_ERR(log_di);
+				goto out;
+			}
+			cur += this_len;
+			di = (struct btrfs_dir_item *)((char *)di + this_len);
+		}
+	}
+	ret = btrfs_next_leaf(root, path);
+	if (ret > 0)
+		ret = 0;
+	else if (ret == 0)
+		goto process_leaf;
+out:
+	btrfs_free_path(log_path);
+	btrfs_release_path(path);
+	return ret;
+}
+
+
 /*
  * deletion replay happens before we copy any new directory items
  * out of the log or out of backreferences from inodes.  It
@@ -2104,6 +2202,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 
 			inode_item = btrfs_item_ptr(eb, i,
 					    struct btrfs_inode_item);
+			ret = replay_xattr_deletes(wc->trans, root, log,
+						   path, key.objectid);
+			if (ret)
+				break;
 			mode = btrfs_inode_mode(eb, inode_item);
 			if (S_ISDIR(mode)) {
 				ret = replay_dir_deletes(wc->trans,
@@ -4072,10 +4174,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	if (S_ISDIR(inode->i_mode)) {
 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
 
-		if (inode_only == LOG_INODE_EXISTS) {
-			max_key_type = BTRFS_INODE_EXTREF_KEY;
-			max_key.type = max_key_type;
-		}
+		if (inode_only == LOG_INODE_EXISTS)
+			max_key_type = BTRFS_XATTR_ITEM_KEY;
 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
 	} else {
 		if (inode_only == LOG_INODE_EXISTS) {
@@ -4100,7 +4200,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			     &BTRFS_I(inode)->runtime_flags)) {
 			if (inode_only == LOG_INODE_EXISTS) {
-				max_key.type = BTRFS_INODE_EXTREF_KEY;
+				max_key.type = BTRFS_XATTR_ITEM_KEY;
 				ret = drop_objectid_items(trans, log, path, ino,
 							  max_key.type);
 			} else {
@@ -4111,17 +4211,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 				ret = btrfs_truncate_inode_items(trans, log,
 								 inode, 0, 0);
 			}
-		} else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
-				    &BTRFS_I(inode)->runtime_flags) ||
+		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+					      &BTRFS_I(inode)->runtime_flags) ||
 			   inode_only == LOG_INODE_EXISTS) {
-			if (inode_only == LOG_INODE_ALL) {
-				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
-					  &BTRFS_I(inode)->runtime_flags);
+			if (inode_only == LOG_INODE_ALL)
 				fast_search = true;
-				max_key.type = BTRFS_XATTR_ITEM_KEY;
-			} else {
-				max_key.type = BTRFS_INODE_EXTREF_KEY;
-			}
+			max_key.type = BTRFS_XATTR_ITEM_KEY;
 			ret = drop_objectid_items(trans, log, path, ino,
 						  max_key.type);
 		} else {

From 5cc2b17e80cf5770f2e585c2d90fd8af1b901258 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 2 Mar 2015 20:53:52 +0000
Subject: [PATCH 38/93] Btrfs: send, add missing check for dead clone root

After we locked the root's root item, a concurrent snapshot deletion
call might have set the dead flag on it. So check if the dead flag
is set and abort if it is, just like we do for the parent root.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/send.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d6033f540cc7..6ec28f13659e 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5855,7 +5855,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 			clone_sources_to_rollback = i + 1;
 			spin_lock(&clone_root->root_item_lock);
 			clone_root->send_in_progress++;
-			if (!btrfs_root_readonly(clone_root)) {
+			if (!btrfs_root_readonly(clone_root) ||
+			    btrfs_root_dead(clone_root)) {
 				spin_unlock(&clone_root->root_item_lock);
 				srcu_read_unlock(&fs_info->subvol_srcu, index);
 				ret = -EPERM;

From 2f1f465ae6da244099af55c066e5355abd8ff620 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 2 Mar 2015 20:53:53 +0000
Subject: [PATCH 39/93] Btrfs: send, don't leave without decrementing clone
 root's send_progress

If the clone root was not readonly or the dead flag was set on it, we were
leaving without decrementing the root's send_progress counter (and before
we just incremented it). If a concurrent snapshot deletion was in progress
and ended up being aborted, it would be impossible to later attempt to
delete again the snapshot, since the root's send_in_progress counter could
never go back to 0.

We were also setting clone_sources_to_rollback to i + 1 too early - if we
bailed out because the clone root we got is not readonly or flagged as dead
we ended up later derreferencing a null pointer because we didn't assign
the clone root to sctx->clone_roots[i].root:

		for (i = 0; sctx && i < clone_sources_to_rollback; i++)
			btrfs_root_dec_send_in_progress(
					sctx->clone_roots[i].root);

So just don't increment the send_in_progress counter if the root is readonly
or flagged as dead.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/send.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6ec28f13659e..571de5a08fe7 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5852,9 +5852,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 				ret = PTR_ERR(clone_root);
 				goto out;
 			}
-			clone_sources_to_rollback = i + 1;
 			spin_lock(&clone_root->root_item_lock);
-			clone_root->send_in_progress++;
 			if (!btrfs_root_readonly(clone_root) ||
 			    btrfs_root_dead(clone_root)) {
 				spin_unlock(&clone_root->root_item_lock);
@@ -5862,10 +5860,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 				ret = -EPERM;
 				goto out;
 			}
+			clone_root->send_in_progress++;
 			spin_unlock(&clone_root->root_item_lock);
 			srcu_read_unlock(&fs_info->subvol_srcu, index);
 
 			sctx->clone_roots[i].root = clone_root;
+			clone_sources_to_rollback = i + 1;
 		}
 		vfree(clone_sources_tmp);
 		clone_sources_tmp = NULL;

From 8996a48c0a8ed01c55f52e6794625c95a9977ee6 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 12 Mar 2015 15:16:20 +0000
Subject: [PATCH 40/93] Btrfs: incremental send, clear name from cache after
 orphanization

If a directory's reference ends up being orphanized, because the inode
currently being processed has a new path that matches that directory's
path, make sure we evict the name of the directory from the name cache.
This is because there might be descendent inodes (either directories or
regular files) that will be orphanized later too, and therefore the
orphan name of the ancestor must be used, otherwise we send issue rename
operations with a wrong path in the send stream.

Reproducer:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ mkdir -p /mnt/data/n1/n2/p1/p2
  $ mkdir /mnt/data/n4
  $ mkdir -p /mnt/data/p1/p2

  $ btrfs subvolume snapshot -r /mnt /mnt/snap1

  $ mv /mnt/data/p1/p2 /mnt/data
  $ mv /mnt/data/n1/n2/p1/p2 /mnt/data/p1
  $ mv /mnt/data/p2 /mnt/data/n1/n2/p1
  $ mv /mnt/data/n1/n2 /mnt/data/p1
  $ mv /mnt/data/p1 /mnt/data/n4
  $ mv /mnt/data/n4/p1/n2/p1 /mnt/data

  $ btrfs subvolume snapshot -r /mnt /mnt/snap2

  $ btrfs send /mnt/snap1 -f /tmp/1.send
  $ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/2.send

  $ mkfs.btrfs -f /dev/sdc
  $ mount /dev/sdc /mnt2
  $ btrfs receive /mnt2 -f /tmp/1.send
  $ btrfs receive /mnt2 -f /tmp/2.send
  ERROR: rename data/p1/p2 -> data/n4/p1/p2 failed. no such file or directory

Directories data/p1 (inode 263) and data/p1/p2 (inode 264) in the parent
snapshot are both orphanized during the incremental send, and as soon as
data/p1 is orphanized, we must make sure that when orphanizing data/p1/p2
we use a source path of o263-6-o/p2 for the rename operation instead of
the old path data/p1/p2 (the one before the orphanization of inode 263).

A test case for xfstests follows soon.

Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/send.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 571de5a08fe7..00cb924884f5 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3610,10 +3610,27 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 			if (ret < 0)
 				goto out;
 			if (ret) {
+				struct name_cache_entry *nce;
+
 				ret = orphanize_inode(sctx, ow_inode, ow_gen,
 						cur->full_path);
 				if (ret < 0)
 					goto out;
+				/*
+				 * Make sure we clear our orphanized inode's
+				 * name from the name cache. This is because the
+				 * inode ow_inode might be an ancestor of some
+				 * other inode that will be orphanized as well
+				 * later and has an inode number greater than
+				 * sctx->send_progress. We need to prevent
+				 * future name lookups from using the old name
+				 * and get instead the orphan name.
+				 */
+				nce = name_cache_search(sctx, ow_inode, ow_gen);
+				if (nce) {
+					name_cache_delete(sctx, nce);
+					kfree(nce);
+				}
 			} else {
 				ret = send_unlink(sctx, cur->full_path);
 				if (ret < 0)

From 5f806c3ae2ff6263a10a6901f97abb74dac03d36 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 12 Mar 2015 16:04:50 +0000
Subject: [PATCH 41/93] Btrfs: incremental send, remove dead code

The logic to detect path loops when attempting to apply a pending
directory rename, introduced in commit
f959492fc15b (Btrfs: send, fix more issues related to directory renames)
is no longer needed, and the respective fstests test case for that commit,
btrfs/045, now passes without this code (as well as all the other test
cases for send/receive).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/send.c | 59 -------------------------------------------------
 1 file changed, 59 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 00cb924884f5..a1216f9b4917 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3067,48 +3067,6 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
 	return NULL;
 }
 
-static int path_loop(struct send_ctx *sctx, struct fs_path *name,
-		     u64 ino, u64 gen, u64 *ancestor_ino)
-{
-	int ret = 0;
-	u64 parent_inode = 0;
-	u64 parent_gen = 0;
-	u64 start_ino = ino;
-
-	*ancestor_ino = 0;
-	while (ino != BTRFS_FIRST_FREE_OBJECTID) {
-		fs_path_reset(name);
-
-		if (is_waiting_for_rm(sctx, ino))
-			break;
-		if (is_waiting_for_move(sctx, ino)) {
-			if (*ancestor_ino == 0)
-				*ancestor_ino = ino;
-			ret = get_first_ref(sctx->parent_root, ino,
-					    &parent_inode, &parent_gen, name);
-		} else {
-			ret = __get_cur_name_and_parent(sctx, ino, gen,
-							&parent_inode,
-							&parent_gen, name);
-			if (ret > 0) {
-				ret = 0;
-				break;
-			}
-		}
-		if (ret < 0)
-			break;
-		if (parent_inode == start_ino) {
-			ret = 1;
-			if (*ancestor_ino == 0)
-				*ancestor_ino = ino;
-			break;
-		}
-		ino = parent_inode;
-		gen = parent_gen;
-	}
-	return ret;
-}
-
 static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 {
 	struct fs_path *from_path = NULL;
@@ -3120,7 +3078,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 	struct waiting_dir_move *dm = NULL;
 	u64 rmdir_ino = 0;
 	int ret;
-	u64 ancestor = 0;
 
 	name = fs_path_alloc();
 	from_path = fs_path_alloc();
@@ -3152,22 +3109,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 		goto out;
 
 	sctx->send_progress = sctx->cur_ino + 1;
-	ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
-	if (ret) {
-		LIST_HEAD(deleted_refs);
-		ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
-		ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
-					   &pm->update_refs, &deleted_refs,
-					   pm->is_orphan);
-		if (ret < 0)
-			goto out;
-		if (rmdir_ino) {
-			dm = get_waiting_dir_move(sctx, pm->ino);
-			ASSERT(dm);
-			dm->rmdir_ino = rmdir_ino;
-		}
-		goto out;
-	}
 	fs_path_reset(name);
 	to_path = name;
 	name = NULL;

From 3d850dd44889d3aa67d0b8007c2cdd259bff7da4 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 12 Mar 2015 23:23:13 +0000
Subject: [PATCH 42/93] Btrfs: add missing inode item update in fallocate()

If we fallocate(), without the keep size flag, into an area already covered
by an extent previously fallocated, we were updating the inode's i_size but
we weren't updating the inode item in the fs/subvol tree. A following umount
+ mount would result in a loss of the inode's size (and an fsync would miss
too the fact that the inode changed).

Reproducer:

  $ mkfs.btrfs -f /dev/sdd
  $ mount /dev/sdd /mnt
  $ fallocate -n -l 1M /mnt/foobar
  $ fallocate -l 512K /mnt/foobar
  $ umount /mnt
  $ mount /dev/sdd /mnt
  $ od -t x1 /mnt/foobar
  0000000

The expected result is:

  $ od -t x1 /mnt/foobar
  0000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
  *
  2000000

A test case for fstests follows soon.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/file.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7d4bb3b6fbc2..150db5e50c2d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2669,23 +2669,34 @@ static long btrfs_fallocate(struct file *file, int mode,
 							1 << inode->i_blkbits,
 							offset + len,
 							&alloc_hint);
-
-			if (ret < 0) {
-				free_extent_map(em);
-				break;
-			}
 		} else if (actual_end > inode->i_size &&
 			   !(mode & FALLOC_FL_KEEP_SIZE)) {
+			struct btrfs_trans_handle *trans;
+
 			/*
 			 * We didn't need to allocate any more space, but we
 			 * still extended the size of the file so we need to
-			 * update i_size.
+			 * update i_size and the inode item.
 			 */
-			inode->i_ctime = CURRENT_TIME;
-			i_size_write(inode, actual_end);
-			btrfs_ordered_update_i_size(inode, actual_end, NULL);
+			trans = btrfs_start_transaction(root, 1);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+			} else {
+				inode->i_ctime = CURRENT_TIME;
+				i_size_write(inode, actual_end);
+				btrfs_ordered_update_i_size(inode, actual_end,
+							    NULL);
+				ret = btrfs_update_inode(trans, root, inode);
+				if (ret)
+					btrfs_end_transaction(trans, root);
+				else
+					ret = btrfs_end_transaction(trans,
+								    root);
+			}
 		}
 		free_extent_map(em);
+		if (ret < 0)
+			break;
 
 		cur_offset = last_byte;
 		if (cur_offset >= alloc_end) {

From bf69196045a8c5c42b10493a26ed45c33014371e Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Sat, 14 Mar 2015 07:03:27 +0000
Subject: [PATCH 43/93] Btrfs: change the insertion criteria for the qgroup
 operations rbtree

After looking at Liu Bo's recent patch (titled
"Btrfs: fix comp_oper to get right order") I realized the search made by
qgroup_oper_exists() was buggy because its rbtree navigation comparison
function, comp_oper_exist(), only looks at the fields bytenr and ref_root
of a tree node, ignoring the seq field completely. This was wrong because
when we insert a node into the rbtree we use comp_oper(), which takes a
decision based first on bytenr, then on seq and then on the ref_root field.
That means qgroup_oper_exists() could miss the fact that at least one
operation with given bytenr and ref_root exists.

Consider the following simple example of a 3 nodes qgroup operations
rbtree (created using comp_oper before this patch), where each node's key
is a tuple with the shape (bytenr, seq, ref_root, op):

                          [ (4096, 2, 20, op X) ]
                         /                       \
                        /                         \
   [ (4096, 1, 5, op Y) ]                         [ (4096, 3, 10, op Z) ]

qgroup_oper_exists() when called to search for an existing operation for
bytenr 4096 and ref root 10 wouldn't find anything because it would go to
the left subtree instead of the right subtree, since comp_oper_exits()
ignores the seq field completely.

Fix this by changing the insertion navigation function to use the ref_root
field right after using the bytenr field and before using the seq field,
so that qgroup_oper_exists() / comp_oper_exist() work as expected.

This patch applies on top of the patch mentioned above from Liu.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 33f31375259a..37188c6b408d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1256,14 +1256,14 @@ static int comp_oper(struct btrfs_qgroup_operation *oper1,
 		return -1;
 	if (oper1->bytenr > oper2->bytenr)
 		return 1;
-	if (oper1->seq < oper2->seq)
-		return -1;
-	if (oper1->seq > oper2->seq)
-		return 1;
 	if (oper1->ref_root < oper2->ref_root)
 		return -1;
 	if (oper1->ref_root > oper2->ref_root)
 		return 1;
+	if (oper1->seq < oper2->seq)
+		return -1;
+	if (oper1->seq > oper2->seq)
+		return 1;
 	if (oper1->type < oper2->type)
 		return -1;
 	if (oper1->type > oper2->type)

From 2f2ff0ee5e4303e727cfd7abd4133d1a8ee68394 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 20 Mar 2015 17:19:46 +0000
Subject: [PATCH 44/93] Btrfs: fix metadata inconsistencies after directory
 fsync

We can get into inconsistency between inodes and directory entries
after fsyncing a directory. The issue is that while a directory gets
the new dentries persisted in the fsync log and replayed at mount time,
the link count of the inode that directory entries point to doesn't
get updated, staying with an incorrect link count (smaller then the
correct value). This later leads to stale file handle errors when
accessing (including attempt to delete) some of the links if all the
other ones are removed, which also implies impossibility to delete the
parent directories, since the dentries can not be removed.

Another issue is that (unlike ext3/4, xfs, f2fs, reiserfs, nilfs2),
when fsyncing a directory, new files aren't logged (their metadata and
dentries) nor any child directories. So this patch fixes this issue too,
since it has the same resolution as the incorrect inode link count issue
mentioned before.

This is very easy to reproduce, and the following excerpt from my test
case for xfstests shows how:

  _scratch_mkfs >> $seqres.full 2>&1
  _init_flakey
  _mount_flakey

  # Create our main test file and directory.
  $XFS_IO_PROG -f -c "pwrite -S 0xaa 0 8K" $SCRATCH_MNT/foo | _filter_xfs_io
  mkdir $SCRATCH_MNT/mydir

  # Make sure all metadata and data are durably persisted.
  sync

  # Add a hard link to 'foo' inside our test directory and fsync only the
  # directory. The btrfs fsync implementation had a bug that caused the new
  # directory entry to be visible after the fsync log replay but, the inode
  # of our file remained with a link count of 1.
  ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/foo_2

  # Add a few more links and new files.
  # This is just to verify nothing breaks or gives incorrect results after the
  # fsync log is replayed.
  ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/foo_3
  $XFS_IO_PROG -f -c "pwrite -S 0xff 0 64K" $SCRATCH_MNT/hello | _filter_xfs_io
  ln $SCRATCH_MNT/hello $SCRATCH_MNT/mydir/hello_2

  # Add some subdirectories and new files and links to them. This is to verify
  # that after fsyncing our top level directory 'mydir', all the subdirectories
  # and their files/links are registered in the fsync log and exist after the
  # fsync log is replayed.
  mkdir -p $SCRATCH_MNT/mydir/x/y/z
  ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/x/y/foo_y_link
  ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/x/y/z/foo_z_link
  touch $SCRATCH_MNT/mydir/x/y/z/qwerty

  # Now fsync only our top directory.
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/mydir

  # And fsync now our new file named 'hello', just to verify later that it has
  # the expected content and that the previous fsync on the directory 'mydir' had
  # no bad influence on this fsync.
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/hello

  # Simulate a crash/power loss.
  _load_flakey_table $FLAKEY_DROP_WRITES
  _unmount_flakey

  _load_flakey_table $FLAKEY_ALLOW_WRITES
  _mount_flakey

  # Verify the content of our file 'foo' remains the same as before, 8192 bytes,
  # all with the value 0xaa.
  echo "File 'foo' content after log replay:"
  od -t x1 $SCRATCH_MNT/foo

  # Remove the first name of our inode. Because of the directory fsync bug, the
  # inode's link count was 1 instead of 5, so removing the 'foo' name ended up
  # deleting the inode and the other names became stale directory entries (still
  # visible to applications). Attempting to remove or access the remaining
  # dentries pointing to that inode resulted in stale file handle errors and
  # made it impossible to remove the parent directories since it was impossible
  # for them to become empty.
  echo "file 'foo' link count after log replay: $(stat -c %h $SCRATCH_MNT/foo)"
  rm -f $SCRATCH_MNT/foo

  # Now verify that all files, links and directories created before fsyncing our
  # directory exist after the fsync log was replayed.
  [ -f $SCRATCH_MNT/mydir/foo_2 ] || echo "Link mydir/foo_2 is missing"
  [ -f $SCRATCH_MNT/mydir/foo_3 ] || echo "Link mydir/foo_3 is missing"
  [ -f $SCRATCH_MNT/hello ] || echo "File hello is missing"
  [ -f $SCRATCH_MNT/mydir/hello_2 ] || echo "Link mydir/hello_2 is missing"
  [ -f $SCRATCH_MNT/mydir/x/y/foo_y_link ] || \
      echo "Link mydir/x/y/foo_y_link is missing"
  [ -f $SCRATCH_MNT/mydir/x/y/z/foo_z_link ] || \
      echo "Link mydir/x/y/z/foo_z_link is missing"
  [ -f $SCRATCH_MNT/mydir/x/y/z/qwerty ] || \
      echo "File mydir/x/y/z/qwerty is missing"

  # We expect our file here to have a size of 64Kb and all the bytes having the
  # value 0xff.
  echo "file 'hello' content after log replay:"
  od -t x1 $SCRATCH_MNT/hello

  # Now remove all files/links, under our test directory 'mydir', and verify we
  # can remove all the directories.
  rm -f $SCRATCH_MNT/mydir/x/y/z/*
  rmdir $SCRATCH_MNT/mydir/x/y/z
  rm -f $SCRATCH_MNT/mydir/x/y/*
  rmdir $SCRATCH_MNT/mydir/x/y
  rmdir $SCRATCH_MNT/mydir/x
  rm -f $SCRATCH_MNT/mydir/*
  rmdir $SCRATCH_MNT/mydir

  # An fsck, run by the fstests framework everytime a test finishes, also detected
  # the inconsistency and printed the following error message:
  #
  # root 5 inode 257 errors 2001, no inode item, link count wrong
  #    unresolved ref dir 258 index 2 namelen 5 name foo_2 filetype 1 errors 4, no inode ref
  #    unresolved ref dir 258 index 3 namelen 5 name foo_3 filetype 1 errors 4, no inode ref

  status=0
  exit

The expected golden output for the test is:

  wrote 8192/8192 bytes at offset 0
  XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
  wrote 65536/65536 bytes at offset 0
  XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
  File 'foo' content after log replay:
  0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
  *
  0020000
  file 'foo' link count after log replay: 5
  file 'hello' content after log replay:
  0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
  *
  0200000

Which is the output after this patch and when running the test against
ext3/4, xfs, f2fs, reiserfs or nilfs2. Without this patch, the test's
output is:

  wrote 8192/8192 bytes at offset 0
  XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
  wrote 65536/65536 bytes at offset 0
  XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
  File 'foo' content after log replay:
  0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
  *
  0020000
  file 'foo' link count after log replay: 1
  Link mydir/foo_2 is missing
  Link mydir/foo_3 is missing
  Link mydir/x/y/foo_y_link is missing
  Link mydir/x/y/z/foo_z_link is missing
  File mydir/x/y/z/qwerty is missing
  file 'hello' content after log replay:
  0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
  *
  0200000
  rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/x/y/z': No such file or directory
  rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/x/y': No such file or directory
  rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/x': No such file or directory
  rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/foo_2': Stale file handle
  rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/foo_3': Stale file handle
  rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir': Directory not empty

Fsck, without this fix, also complains about the wrong link count:

  root 5 inode 257 errors 2001, no inode item, link count wrong
      unresolved ref dir 258 index 2 namelen 5 name foo_2 filetype 1 errors 4, no inode ref
      unresolved ref dir 258 index 3 namelen 5 name foo_3 filetype 1 errors 4, no inode ref

So fix this by logging the inodes that the dentries point to when
fsyncing a directory.

A test case for xfstests follows.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/btrfs_inode.h |  14 ++-
 fs/btrfs/file.c        |   2 +
 fs/btrfs/transaction.h |   2 +
 fs/btrfs/tree-log.c    | 243 +++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/tree-log.h    |   2 +
 5 files changed, 253 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index de5e4f2adfea..0ef5cc13fae2 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,7 +66,11 @@ struct btrfs_inode {
 	 */
 	struct btrfs_key location;
 
-	/* Lock for counters */
+	/*
+	 * Lock for counters and all fields used to determine if the inode is in
+	 * the log or not (last_trans, last_sub_trans, last_log_commit,
+	 * logged_trans).
+	 */
 	spinlock_t lock;
 
 	/* the extent_tree has caches of all the extent mappings to disk */
@@ -250,6 +254,9 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
 
 static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 {
+	int ret = 0;
+
+	spin_lock(&BTRFS_I(inode)->lock);
 	if (BTRFS_I(inode)->logged_trans == generation &&
 	    BTRFS_I(inode)->last_sub_trans <=
 	    BTRFS_I(inode)->last_log_commit &&
@@ -263,9 +270,10 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 		 */
 		smp_mb();
 		if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
-			return 1;
+			ret = 1;
 	}
-	return 0;
+	spin_unlock(&BTRFS_I(inode)->lock);
+	return ret;
 }
 
 #define BTRFS_DIO_ORIG_BIO_SUBMITTED	0x1
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 150db5e50c2d..fd105c172c8b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1811,7 +1811,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	 * otherwise subsequent syncs to a file that's been synced in this
 	 * transaction will appear to have already occured.
 	 */
+	spin_lock(&BTRFS_I(inode)->lock);
 	BTRFS_I(inode)->last_sub_trans = root->log_transid;
+	spin_unlock(&BTRFS_I(inode)->lock);
 	if (num_written > 0) {
 		err = generic_write_sync(file, pos, num_written);
 		if (err < 0)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 937050a2b68e..96b189b8898a 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -136,9 +136,11 @@ struct btrfs_pending_snapshot {
 static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 					      struct inode *inode)
 {
+	spin_lock(&BTRFS_I(inode)->lock);
 	BTRFS_I(inode)->last_trans = trans->transaction->transid;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+	spin_unlock(&BTRFS_I(inode)->lock);
 }
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6c95159302dd..016c90fc85db 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -492,11 +492,19 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 
 		if (btrfs_inode_generation(eb, src_item) == 0) {
 			struct extent_buffer *dst_eb = path->nodes[0];
+			const u64 ino_size = btrfs_inode_size(eb, src_item);
 
+			/*
+			 * For regular files an ino_size == 0 is used only when
+			 * logging that an inode exists, as part of a directory
+			 * fsync, and the inode wasn't fsynced before. In this
+			 * case don't set the size of the inode in the fs/subvol
+			 * tree, otherwise we would be throwing valid data away.
+			 */
 			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
-			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
+			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
+			    ino_size != 0) {
 				struct btrfs_map_token token;
-				u64 ino_size = btrfs_inode_size(eb, src_item);
 
 				btrfs_init_map_token(&token);
 				btrfs_set_token_inode_size(dst_eb, dst_item,
@@ -3124,6 +3132,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct inode *inode,
 			  struct btrfs_path *path,
 			  struct btrfs_path *dst_path, int key_type,
+			  struct btrfs_log_ctx *ctx,
 			  u64 min_offset, u64 *last_offset_ret)
 {
 	struct btrfs_key min_key;
@@ -3208,6 +3217,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 		src = path->nodes[0];
 		nritems = btrfs_header_nritems(src);
 		for (i = path->slots[0]; i < nritems; i++) {
+			struct btrfs_dir_item *di;
+
 			btrfs_item_key_to_cpu(src, &min_key, i);
 
 			if (min_key.objectid != ino || min_key.type != key_type)
@@ -3218,6 +3229,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 				err = ret;
 				goto done;
 			}
+
+			/*
+			 * We must make sure that when we log a directory entry,
+			 * the corresponding inode, after log replay, has a
+			 * matching link count. For example:
+			 *
+			 * touch foo
+			 * mkdir mydir
+			 * sync
+			 * ln foo mydir/bar
+			 * xfs_io -c "fsync" mydir
+			 * <crash>
+			 * <mount fs and log replay>
+			 *
+			 * Would result in a fsync log that when replayed, our
+			 * file inode would have a link count of 1, but we get
+			 * two directory entries pointing to the same inode.
+			 * After removing one of the names, it would not be
+			 * possible to remove the other name, which resulted
+			 * always in stale file handle errors, and would not
+			 * be possible to rmdir the parent directory, since
+			 * its i_size could never decrement to the value
+			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
+			 */
+			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+			btrfs_dir_item_key_to_cpu(src, di, &tmp);
+			if (ctx &&
+			    (btrfs_dir_transid(src, di) == trans->transid ||
+			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
+			    tmp.type != BTRFS_ROOT_ITEM_KEY)
+				ctx->log_new_dentries = true;
 		}
 		path->slots[0] = nritems;
 
@@ -3279,7 +3321,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct inode *inode,
 			  struct btrfs_path *path,
-			  struct btrfs_path *dst_path)
+			  struct btrfs_path *dst_path,
+			  struct btrfs_log_ctx *ctx)
 {
 	u64 min_key;
 	u64 max_key;
@@ -3291,7 +3334,7 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
 	max_key = 0;
 	while (1) {
 		ret = log_dir_items(trans, root, inode, path,
-				    dst_path, key_type, min_key,
+				    dst_path, key_type, ctx, min_key,
 				    &max_key);
 		if (ret)
 			return ret;
@@ -4067,7 +4110,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
 	if (ret < 0) {
 		return ret;
 	} else if (ret > 0) {
-		*size_ret = i_size_read(inode);
+		*size_ret = 0;
 	} else {
 		struct btrfs_inode_item *item;
 
@@ -4374,15 +4417,18 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	}
 
 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
-		ret = log_directory_changes(trans, root, inode, path, dst_path);
+		ret = log_directory_changes(trans, root, inode, path, dst_path,
+					    ctx);
 		if (ret) {
 			err = ret;
 			goto out_unlock;
 		}
 	}
 
+	spin_lock(&BTRFS_I(inode)->lock);
 	BTRFS_I(inode)->logged_trans = trans->transid;
 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
+	spin_unlock(&BTRFS_I(inode)->lock);
 out_unlock:
 	if (unlikely(err))
 		btrfs_put_logged_extents(&logged_list);
@@ -4469,6 +4515,181 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+struct btrfs_dir_list {
+	u64 ino;
+	struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory. See log_dir_items() for
+ * details about the why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not lock their i_mutex, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ *        CPU0                                        CPU1
+ *        ----                                        ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ *                                            lock(sb_internal#2);
+ *                                            lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not locking i_mutex of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ *    that while logging the inode new references (names) are added or removed
+ *    from the inode, leaving the logged inode item with a link count that does
+ *    not match the number of logged inode reference items. This is fine because
+ *    at log replay time we compute the real number of links and correct the
+ *    link count in the inode item (see replay_one_buffer() and
+ *    link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
+ *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
+ *    has a size that doesn't match the sum of the lengths of all the logged
+ *    names. This does not result in a problem because if a dir_item key is
+ *    logged but its matching dir_index key is not logged, at log replay time we
+ *    don't use it to replay the respective name (see replay_one_name()). On the
+ *    other hand if only the dir_index key ends up being logged, the respective
+ *    name is added to the fs/subvol tree with both the dir_item and dir_index
+ *    keys created (see replay_one_name()).
+ *    The directory's inode item with a wrong i_size is not a problem as well,
+ *    since we don't use it at log replay time to set the i_size in the inode
+ *    item of the fs/subvol tree (see overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct inode *start_inode,
+				struct btrfs_log_ctx *ctx)
+{
+	struct btrfs_root *log = root->log_root;
+	struct btrfs_path *path;
+	LIST_HEAD(dir_list);
+	struct btrfs_dir_list *dir_elem;
+	int ret = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+	if (!dir_elem) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+	dir_elem->ino = btrfs_ino(start_inode);
+	list_add_tail(&dir_elem->list, &dir_list);
+
+	while (!list_empty(&dir_list)) {
+		struct extent_buffer *leaf;
+		struct btrfs_key min_key;
+		int nritems;
+		int i;
+
+		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
+					    list);
+		if (ret)
+			goto next_dir_inode;
+
+		min_key.objectid = dir_elem->ino;
+		min_key.type = BTRFS_DIR_ITEM_KEY;
+		min_key.offset = 0;
+again:
+		btrfs_release_path(path);
+		ret = btrfs_search_forward(log, &min_key, path, trans->transid);
+		if (ret < 0) {
+			goto next_dir_inode;
+		} else if (ret > 0) {
+			ret = 0;
+			goto next_dir_inode;
+		}
+
+process_leaf:
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		for (i = path->slots[0]; i < nritems; i++) {
+			struct btrfs_dir_item *di;
+			struct btrfs_key di_key;
+			struct inode *di_inode;
+			struct btrfs_dir_list *new_dir_elem;
+			int log_mode = LOG_INODE_EXISTS;
+			int type;
+
+			btrfs_item_key_to_cpu(leaf, &min_key, i);
+			if (min_key.objectid != dir_elem->ino ||
+			    min_key.type != BTRFS_DIR_ITEM_KEY)
+				goto next_dir_inode;
+
+			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+			type = btrfs_dir_type(leaf, di);
+			if (btrfs_dir_transid(leaf, di) < trans->transid &&
+			    type != BTRFS_FT_DIR)
+				continue;
+			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+				continue;
+
+			di_inode = btrfs_iget(root->fs_info->sb, &di_key,
+					      root, NULL);
+			if (IS_ERR(di_inode)) {
+				ret = PTR_ERR(di_inode);
+				goto next_dir_inode;
+			}
+
+			if (btrfs_inode_in_log(di_inode, trans->transid)) {
+				iput(di_inode);
+				continue;
+			}
+
+			ctx->log_new_dentries = false;
+			if (type == BTRFS_FT_DIR)
+				log_mode = LOG_INODE_ALL;
+			btrfs_release_path(path);
+			ret = btrfs_log_inode(trans, root, di_inode,
+					      log_mode, 0, LLONG_MAX, ctx);
+			iput(di_inode);
+			if (ret)
+				goto next_dir_inode;
+			if (ctx->log_new_dentries) {
+				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
+						       GFP_NOFS);
+				if (!new_dir_elem) {
+					ret = -ENOMEM;
+					goto next_dir_inode;
+				}
+				new_dir_elem->ino = di_key.objectid;
+				list_add_tail(&new_dir_elem->list, &dir_list);
+			}
+			break;
+		}
+		if (i == nritems) {
+			ret = btrfs_next_leaf(log, path);
+			if (ret < 0) {
+				goto next_dir_inode;
+			} else if (ret > 0) {
+				ret = 0;
+				goto next_dir_inode;
+			}
+			goto process_leaf;
+		}
+		if (min_key.offset < (u64)-1) {
+			min_key.offset++;
+			goto again;
+		}
+next_dir_inode:
+		list_del(&dir_elem->list);
+		kfree(dir_elem);
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+
 /*
  * helper function around btrfs_log_inode to make sure newly created
  * parent directories also end up in the log.  A minimal inode and backref
@@ -4491,6 +4712,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	const struct dentry * const first_parent = parent;
 	const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
 				 last_committed);
+	bool log_dentries = false;
+	struct inode *orig_inode = inode;
 
 	sb = inode->i_sb;
 
@@ -4546,6 +4769,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		goto end_trans;
 	}
 
+	if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
+		log_dentries = true;
+
 	while (1) {
 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
 			break;
@@ -4582,7 +4808,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		dput(old_parent);
 		old_parent = parent;
 	}
-	ret = 0;
+	if (log_dentries)
+		ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
+	else
+		ret = 0;
 end_trans:
 	dput(old_parent);
 	if (ret < 0) {
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 154990c26dcb..6916a781ea02 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -29,6 +29,7 @@ struct btrfs_log_ctx {
 	int log_ret;
 	int log_transid;
 	int io_err;
+	bool log_new_dentries;
 	struct list_head list;
 };
 
@@ -37,6 +38,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
 	ctx->log_ret = 0;
 	ctx->log_transid = 0;
 	ctx->io_err = 0;
+	ctx->log_new_dentries = false;
 	INIT_LIST_HEAD(&ctx->list);
 }
 

From dcc82f4783ad91d4ab654f89f37ae9291cdc846a Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 23 Mar 2015 14:07:40 +0000
Subject: [PATCH 45/93] Btrfs: fix log tree corruption when fs mounted with -o
 discard

While committing a transaction we free the log roots before we write the
new super block. Freeing the log roots implies marking the disk location
of every node/leaf (metadata extent) as pinned before the new super block
is written. This is to prevent the disk location of log metadata extents
from being reused before the new super block is written, otherwise we
would have a corrupted log tree if before the new super block is written
a crash/reboot happens and the location of any log tree metadata extent
ended up being reused and rewritten.

Even though we pinned the log tree's metadata extents, we were issuing a
discard against them if the fs was mounted with the -o discard option,
resulting in corruption of the log tree if a crash/reboot happened before
writing the new super block - the next time the fs was mounted, during
the log replay process we would find nodes/leafs of the log btree with
a content full of zeroes, causing the process to fail and require the
use of the tool btrfs-zero-log to wipeout the log tree (and all data
previously fsynced becoming lost forever).

Fix this by not doing a discard when pinning an extent. The discard will
be done later when it's safe (after the new super block is committed) at
extent-tree.c:btrfs_finish_extent_commit().

Fixes: e688b7252f78 (Btrfs: fix extent pinning bugs in the tree log)
CC: <stable@vger.kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d8ce3cfe7ae0..62fa6049e5b6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6955,12 +6955,11 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
 		return -ENOSPC;
 	}
 
-	if (btrfs_test_opt(root, DISCARD))
-		ret = btrfs_discard_extent(root, start, len, NULL);
-
 	if (pin)
 		pin_down_extent(root, cache, start, len, 1);
 	else {
+		if (btrfs_test_opt(root, DISCARD))
+			ret = btrfs_discard_extent(root, start, len, NULL);
 		btrfs_add_free_space(cache, start, len);
 		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
 	}

From 3c3b04d10ff1811a27f86684ccd2f5ba6983211d Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Wed, 25 Mar 2015 19:26:41 +0100
Subject: [PATCH 46/93] btrfs: don't accept bare namespace as a valid xattr

Due to insufficient check in btrfs_is_valid_xattr, this unexpectedly
works:

 $ touch file
 $ setfattr -n user. -v 1 file
 $ getfattr -d file
user.="1"

ie. the missing attribute name after the namespace.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=94291
Reported-by: William Douglas <william.douglas@intel.com>
CC: <stable@vger.kernel.org> # 2.6.29+
Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/xattr.c | 53 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 883b93623bc5..45ea704be030 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -364,22 +364,42 @@ const struct xattr_handler *btrfs_xattr_handlers[] = {
 /*
  * Check if the attribute is in a supported namespace.
  *
- * This applied after the check for the synthetic attributes in the system
+ * This is applied after the check for the synthetic attributes in the system
  * namespace.
  */
-static bool btrfs_is_valid_xattr(const char *name)
+static int btrfs_is_valid_xattr(const char *name)
 {
-	return !strncmp(name, XATTR_SECURITY_PREFIX,
-			XATTR_SECURITY_PREFIX_LEN) ||
-	       !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
-	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
-	       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) ||
-		!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN);
+	int len = strlen(name);
+	int prefixlen = 0;
+
+	if (!strncmp(name, XATTR_SECURITY_PREFIX,
+			XATTR_SECURITY_PREFIX_LEN))
+		prefixlen = XATTR_SECURITY_PREFIX_LEN;
+	else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		prefixlen = XATTR_SYSTEM_PREFIX_LEN;
+	else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+		prefixlen = XATTR_TRUSTED_PREFIX_LEN;
+	else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+		prefixlen = XATTR_USER_PREFIX_LEN;
+	else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
+		prefixlen = XATTR_BTRFS_PREFIX_LEN;
+	else
+		return -EOPNOTSUPP;
+
+	/*
+	 * The name cannot consist of just prefix
+	 */
+	if (len <= prefixlen)
+		return -EINVAL;
+
+	return 0;
 }
 
 ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
 		       void *buffer, size_t size)
 {
+	int ret;
+
 	/*
 	 * If this is a request for a synthetic attribute in the system.*
 	 * namespace use the generic infrastructure to resolve a handler
@@ -388,8 +408,9 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
 	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
 		return generic_getxattr(dentry, name, buffer, size);
 
-	if (!btrfs_is_valid_xattr(name))
-		return -EOPNOTSUPP;
+	ret = btrfs_is_valid_xattr(name);
+	if (ret)
+		return ret;
 	return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
 }
 
@@ -397,6 +418,7 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		   size_t size, int flags)
 {
 	struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+	int ret;
 
 	/*
 	 * The permission on security.* and system.* is not checked
@@ -413,8 +435,9 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
 		return generic_setxattr(dentry, name, value, size, flags);
 
-	if (!btrfs_is_valid_xattr(name))
-		return -EOPNOTSUPP;
+	ret = btrfs_is_valid_xattr(name);
+	if (ret)
+		return ret;
 
 	if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
 		return btrfs_set_prop(dentry->d_inode, name,
@@ -430,6 +453,7 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 int btrfs_removexattr(struct dentry *dentry, const char *name)
 {
 	struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+	int ret;
 
 	/*
 	 * The permission on security.* and system.* is not checked
@@ -446,8 +470,9 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
 	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
 		return generic_removexattr(dentry, name);
 
-	if (!btrfs_is_valid_xattr(name))
-		return -EOPNOTSUPP;
+	ret = btrfs_is_valid_xattr(name);
+	if (ret)
+		return ret;
 
 	if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
 		return btrfs_set_prop(dentry->d_inode, name,

From 26e726afe01c1c82072cf23a5ed89ce25f39d9f2 Mon Sep 17 00:00:00 2001
From: Chengyu Song <csong84@gatech.edu>
Date: Tue, 24 Mar 2015 18:12:56 -0400
Subject: [PATCH 47/93] btrfs: incorrect handling for fiemap_fill_next_extent
 return

fiemap_fill_next_extent returns 0 on success, -errno on error, 1 if this was
the last extent that will fit in user array. If 1 is returned, the return
value may eventually returned to user space, which should not happen, according
to manpage of ioctl.

Signed-off-by: Chengyu Song <csong84@gatech.edu>
Reviewed-by: David Sterba <dsterba@suse.cz>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent_io.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d688cfe5d496..782f3bc4651d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4514,8 +4514,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		}
 		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
 					      em_len, flags);
-		if (ret)
+		if (ret) {
+			if (ret == 1)
+				ret = 0;
 			goto out_free;
+		}
 	}
 out_free:
 	free_extent_map(em);

From d8620958296e4fa61afde421f1de16a5c2234b28 Mon Sep 17 00:00:00 2001
From: Tom Van Braeckel <tomvanbraeckel@gmail.com>
Date: Tue, 24 Mar 2015 16:35:49 +0100
Subject: [PATCH 48/93] btrfs: explicitly set control file's private_data

The private_data member of the Btrfs control device file
(/dev/btrfs-control) is used to hold the current transaction and needs
to be initialized to NULL to signify that no transaction is in progress.

We explicitly set the control file's private_data to NULL to be
independent of whatever value the misc subsystem initializes it to.

Backstory:
----------

The misc subsystem (which is used by /dev/btrfs-control) initializes
a file's private_data to point to the misc device when a driver has
registered a custom open file operation and initializes it to NULL
when a custom open file operation has *not* been provided.

This subtle quirk is confusing, to the point where kernel code registers
*empty* file open operations to have private_data point to the misc
device structure.

And it leads to bugs, where the addition or removal of a custom open
file operation surprisingly changes the initial contents of a file's
private_data structure.

To simplify things in the misc subsystem, a patch [1] has been proposed
to *always* set private_data to point to the misc device instead of
only doing this when a custom open file operation has been registered.

But before we can fix this in the misc subsystem itself, we need to
modify the (few) drivers that rely on this very subtle behavior.

[1] https://lkml.org/lkml/2014/12/4/939

Signed-off-by: Martin Kepplinger <martink@posteo.de>
Signed-off-by: Tom Van Braeckel <tomvanbraeckel@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/super.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index db7d7b803a61..eb67ca44ad51 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1908,6 +1908,17 @@ static struct file_system_type btrfs_fs_type = {
 };
 MODULE_ALIAS_FS("btrfs");
 
+static int btrfs_control_open(struct inode *inode, struct file *file)
+{
+	/*
+	 * The control file's private_data is used to hold the
+	 * transaction when it is started and is used to keep
+	 * track of whether a transaction is already in progress.
+	 */
+	file->private_data = NULL;
+	return 0;
+}
+
 /*
  * used by btrfsctl to scan devices when no FS is mounted
  */
@@ -2009,6 +2020,7 @@ static const struct super_operations btrfs_super_ops = {
 };
 
 static const struct file_operations btrfs_ctl_fops = {
+	.open = btrfs_control_open,
 	.unlocked_ioctl	 = btrfs_control_ioctl,
 	.compat_ioctl = btrfs_control_ioctl,
 	.owner	 = THIS_MODULE,

From 727b9784b6085c99c2f836bf4fcc2848dc9cf904 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Fri, 20 Mar 2015 14:02:09 -0400
Subject: [PATCH 49/93] btrfs: cleanup orphans while looking up default
 subvolume

Orphans in the fs tree are cleaned up via open_ctree and subvolume
orphans are cleaned via btrfs_lookup_dentry -- except when a default
subvolume is in use.  The name for the default subvolume uses a manual
lookup that doesn't trigger orphan cleanup and needs to trigger it
manually as well. This doesn't apply to the remount case since the
subvolumes are cleaned up by walking the root radix tree.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/super.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index eb67ca44ad51..f2c9f9db3b19 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -901,6 +901,15 @@ static struct dentry *get_default_root(struct super_block *sb,
 	if (IS_ERR(new_root))
 		return ERR_CAST(new_root);
 
+	if (!(sb->s_flags & MS_RDONLY)) {
+		int ret;
+		down_read(&fs_info->cleanup_work_sem);
+		ret = btrfs_orphan_cleanup(new_root);
+		up_read(&fs_info->cleanup_work_sem);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
 	dir_id = btrfs_root_dirid(&new_root->root_item);
 setup_root:
 	location.objectid = dir_id;

From e56a951e01bf55f49533c47ad2ce61dbd613a3f3 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 17 Mar 2015 14:34:16 +0800
Subject: [PATCH 50/93] Btrfs: Remove the check for old-style mkfs

This was used to make sure that a fresh btrfs from an older mkfs.btrfs,
but it also allows us to mount a buggy btrfs if this btrfs has the right
superblock head part but has something wrong with chunk tree part[1], and
after that we can hit BUG_ON()s set in the code to prevent something
impossible.

Since David has released "Btrfs progs v3.19-rc2", just remove the check,
if anyone who wants to make a fresh btrfs, please use the latest one.

[1]: http://www.spinics.net/lists/linux-btrfs/msg42358.html

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Omar Sandoval <osandov@osandov.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/disk-io.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 23c49ab2de4c..568cc4e3d80e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -419,12 +419,6 @@ static int btrfs_check_super_csum(char *raw_disk_sb)
 
 		if (memcmp(raw_disk_sb, result, csum_size))
 			ret = 1;
-
-		if (ret && btrfs_super_generation(disk_sb) < 10) {
-			printk(KERN_WARNING
-				"BTRFS: super block crcs don't match, older mkfs detected\n");
-			ret = 0;
-		}
 	}
 
 	if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {

From dd82525956744c3a5e6b7275cd468b6180cc5b72 Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Wed, 1 Apr 2015 08:36:05 -0700
Subject: [PATCH 51/93] Btrfs: free and unlock our path before
 btrfs_free_and_pin_reserved_extent()

The error handling path for alloc_reserved_tree_block is calling
btrfs_free_and_pin_reserved_extent with a spinning tree lock held.  This
might sleep as we allocate extent_state objects:

 BUG: sleeping function called from invalid context at mm/slub.c:1268
 in_atomic(): 1, irqs_disabled(): 0, pid: 11093, name: kworker/u4:7
 5 locks held by kworker/u4:7/11093:
  #0:  ("%s-%s""btrfs", name){++++.+}, at: [<ffffffff81091d51>] process_one_work+0x151/0x520
  #1:  ((&work->normal_work)){+.+.+.}, at: [<ffffffff81091d51>] process_one_work+0x151/0x520
  #2:  (sb_internal){++++.+}, at: [<ffffffffa003a70e>] start_transaction+0x43e/0x590 [btrfs]
  #3:  (&head_ref->mutex){+.+...}, at: [<ffffffffa0089f8c>] btrfs_delayed_ref_lock+0x4c/0x240 [btrfs]
  #4:  (btrfs-extent-00){++++..}, at: [<ffffffffa007697b>] btrfs_clear_lock_blocking_rw+0x9b/0x150 [btrfs]
 CPU: 0 PID: 11093 Comm: kworker/u4:7 Tainted: G        W 4.0.0-rc6-default+ #246
 Hardware name: Intel Corporation Santa Rosa platform/Matanzas, BIOS TSRSCRB1.86C.0047.B00.0610170821 10/17/06
 Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs]
  00000000000004f4 ffff88006dd17848 ffffffff81ab0e3b ffff88006dd17848
  ffff88007a944760 ffff88006dd17868 ffffffff8109d516 ffff88006dd17898
  0000000000000000 ffff88006dd17898 ffffffff8109d5b2 ffffffff81aba2bb
 Call Trace:
  [<ffffffff81ab0e3b>] dump_stack+0x4f/0x6c
  [<ffffffff8109d516>] ___might_sleep+0xf6/0x140
  [<ffffffff8109d5b2>] __might_sleep+0x52/0x90
  [<ffffffff81aba2bb>] ? ftrace_call+0x5/0x34
  [<ffffffff81196363>] kmem_cache_alloc+0x163/0x1b0
  [<ffffffffa0056f31>] ? alloc_extent_state+0x31/0x150 [btrfs]
  [<ffffffffa0056f20>] ? alloc_extent_state+0x20/0x150 [btrfs]
  [<ffffffffa0056f31>] alloc_extent_state+0x31/0x150 [btrfs]
  [<ffffffffa005805b>] __set_extent_bit+0x37b/0x5d0 [btrfs]
  [<ffffffff81aba2bb>] ? ftrace_call+0x5/0x34
  [<ffffffffa005888d>] ? set_extent_bit+0xd/0x30 [btrfs]
  [<ffffffffa00588a3>] set_extent_bit+0x23/0x30 [btrfs]
  [<ffffffffa0058e80>] set_extent_dirty+0x20/0x30 [btrfs]
  [<ffffffffa00195ba>] pin_down_extent+0xaa/0x170 [btrfs]
  [<ffffffffa001d8ef>] __btrfs_free_reserved_extent+0xcf/0x160 [btrfs]
  [<ffffffffa0023856>] btrfs_free_and_pin_reserved_extent+0x16/0x20 [btrfs]
  [<ffffffffa002482a>] __btrfs_run_delayed_refs+0xfca/0x1290 [btrfs]
  [<ffffffffa0026eae>] btrfs_run_delayed_refs+0x6e/0x2e0 [btrfs]
  [<ffffffffa0027378>] delayed_ref_async_start+0x48/0xb0 [btrfs]
  [<ffffffffa006c883>] normal_work_helper+0x83/0x350 [btrfs]
  [<ffffffffa006cd79>] ? btrfs_extent_refs_helper+0x9/0x20 [btrfs]
  [<ffffffffa006cd82>] btrfs_extent_refs_helper+0x12/0x20 [btrfs]
  [<ffffffff81091dcb>] process_one_work+0x1cb/0x520
  [<ffffffff81091d51>] ? process_one_work+0x151/0x520
  [<ffffffff811c7abf>] ? seq_read+0x3f/0x400
  [<ffffffff8109260b>] worker_thread+0x5b/0x4e0
  [<ffffffff81097be2>] ? __kthread_parkme+0x12/0xa0
  [<ffffffff810925b0>] ? rescuer_thread+0x450/0x450
  [<ffffffff81098686>] kthread+0xf6/0x120
  [<ffffffff81098590>] ? flush_kthread_worker+0x1b0/0x1b0
  [<ffffffff81ab8088>] ret_from_fork+0x58/0x90
  [<ffffffff81098590>] ? flush_kthread_worker+0x1b0/0x1b0
 ------------[ cut here ]------------

This changes things to free the path first, which will also unlock the
extent buffer.

Signed-off-by: Chris Mason <clm@fb.com>
Reported-by: Dave Sterba <dsterba@suse.cz>
Tested-by: Dave Sterba <dsterba@suse.cz>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 62fa6049e5b6..41e5812c131f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7093,9 +7093,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
 				      ins, size);
 	if (ret) {
+		btrfs_free_path(path);
 		btrfs_free_and_pin_reserved_extent(root, ins->objectid,
 						   root->nodesize);
-		btrfs_free_path(path);
 		return ret;
 	}
 

From 4a3d1caf8a2c16c55424a0768eade54ee0922341 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Fri, 13 Mar 2015 01:58:46 -0700
Subject: [PATCH 52/93] fs: btrfs: Add missing include file

Building alpha:allmodconfig fails with

fs/btrfs/inode.c: In function 'check_direct_IO':
fs/btrfs/inode.c:8050:2: error: implicit declaration of function 'iov_iter_alignment'

due to a missing include file.

Fixes: 3737c63e1fb0 ("fs: move struct kiocb to fs.h")
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bb088bd8f7f0..97bc1ffc9c7b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -43,6 +43,7 @@
 #include <linux/btrfs.h>
 #include <linux/blkdev.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/uio.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"

From 28ed1345a50491d78e1454ad4005dc5d3557a69e Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Wed, 17 Dec 2014 09:41:04 -0800
Subject: [PATCH 53/93] btrfs: actively run the delayed refs while deleting
 large files

When we are deleting large files with large extents, we are building up
a huge set of delayed refs for processing.  Truncate isn't checking
often enough to see if we need to back off and process those, or let
a commit proceed.

The end result is long stalls after the rm, and very long commit times.
During the commits, other processes back up waiting to start new
transactions and we get into trouble.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/free-space-cache.c |  2 ++
 fs/btrfs/inode.c            | 45 +++++++++++++++++++++++++++++++++++--
 fs/btrfs/transaction.c      |  2 +-
 fs/btrfs/tree-log.c         |  8 +++++--
 4 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 764528a4f6fd..c51482031edf 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -235,6 +235,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	/*
 	 * We don't need an orphan item because truncating the free space cache
 	 * will never be split across transactions.
+	 * We don't need to check for -EAGAIN because we're a free space
+	 * cache inode
 	 */
 	ret = btrfs_truncate_inode_items(trans, root, inode,
 					 0, BTRFS_EXTENT_DATA_KEY);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 97bc1ffc9c7b..e3fe137fb826 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4197,10 +4197,20 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	int extent_type = -1;
 	int ret;
 	int err = 0;
+	int be_nice = 0;
 	u64 ino = btrfs_ino(inode);
+	u64 bytes_deleted = 0;
 
 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
 
+	/*
+	 * for non-free space inodes and ref cows, we want to back off from
+	 * time to time
+	 */
+	if (!btrfs_is_free_space_inode(inode) &&
+	    test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+		be_nice = 1;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -4230,6 +4240,19 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	key.type = (u8)-1;
 
 search_again:
+	/*
+	 * with a 16K leaf size and 128MB extents, you can actually queue
+	 * up a huge file in a single leaf.  Most of the time that
+	 * bytes_deleted is > 0, it will be huge by the time we get here
+	 */
+	if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+		if (btrfs_should_end_transaction(trans, root)) {
+			err = -EAGAIN;
+			goto error;
+		}
+	}
+
+
 	path->leave_spinning = 1;
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0) {
@@ -4376,11 +4399,18 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 		     root == root->fs_info->tree_root)) {
 			btrfs_set_path_blocking(path);
+			bytes_deleted += extent_num_bytes;
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes, 0,
 						btrfs_header_owner(leaf),
 						ino, extent_offset, 0);
 			BUG_ON(ret);
+			if (be_nice && pending_del_nr &&
+			    (pending_del_nr % 16 == 0) &&
+			    bytes_deleted > 1024 * 1024) {
+				btrfs_async_run_delayed_refs(root,
+					trans->delayed_ref_updates * 2, 0);
+			}
 		}
 
 		if (found_type == BTRFS_INODE_ITEM_KEY)
@@ -4416,7 +4446,18 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	if (last_size != (u64)-1 &&
 	    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 		btrfs_ordered_update_i_size(inode, last_size, NULL);
+
 	btrfs_free_path(path);
+
+	if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+		unsigned long updates = trans->delayed_ref_updates;
+		if (updates) {
+			trans->delayed_ref_updates = 0;
+			ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+			if (ret && !err)
+				err = ret;
+		}
+	}
 	return err;
 }
 
@@ -5013,7 +5054,7 @@ void btrfs_evict_inode(struct inode *inode)
 		trans->block_rsv = rsv;
 
 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
-		if (ret != -ENOSPC)
+		if (ret != -ENOSPC && ret != -EAGAIN)
 			break;
 
 		trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -8582,7 +8623,7 @@ static int btrfs_truncate(struct inode *inode)
 		ret = btrfs_truncate_inode_items(trans, root, inode,
 						 inode->i_size,
 						 BTRFS_EXTENT_DATA_KEY);
-		if (ret != -ENOSPC) {
+		if (ret != -ENOSPC && ret != -EAGAIN) {
 			err = ret;
 			break;
 		}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 91c303ac40b6..ba831ee41891 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -718,7 +718,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
 	updates = trans->delayed_ref_updates;
 	trans->delayed_ref_updates = 0;
 	if (updates) {
-		err = btrfs_run_delayed_refs(trans, root, updates);
+		err = btrfs_run_delayed_refs(trans, root, updates * 2);
 		if (err) /* Error code will also eval true */
 			return err;
 	}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 016c90fc85db..a089b5944efc 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4251,8 +4251,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 					  &BTRFS_I(inode)->runtime_flags);
 				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
 					  &BTRFS_I(inode)->runtime_flags);
-				ret = btrfs_truncate_inode_items(trans, log,
-								 inode, 0, 0);
+				while(1) {
+					ret = btrfs_truncate_inode_items(trans,
+							 log, inode, 0, 0);
+					if (ret != -EAGAIN)
+						break;
+				}
 			}
 		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
 					      &BTRFS_I(inode)->runtime_flags) ||

From 1262133b8d6f10f5ca7621cd4cf65ddf6254126a Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 3 Feb 2015 07:50:16 -0800
Subject: [PATCH 54/93] Btrfs: account for crcs in delayed ref processing

As we delete large extents, we end up doing huge amounts of COW in order
to delete the corresponding crcs.  This adds accounting so that we keep
track of that space and flushing of delayed refs so that we don't build
up too much delayed crc work.

This helps limit the delayed work that must be done at commit time and
tries to avoid ENOSPC aborts because the crcs eat all the global
reserves.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/delayed-ref.c | 22 ++++++++++++++++++--
 fs/btrfs/delayed-ref.h | 10 +++++++++
 fs/btrfs/extent-tree.c | 46 ++++++++++++++++++++++++++++--------------
 fs/btrfs/inode.c       | 25 ++++++++++++++++-------
 fs/btrfs/transaction.c |  4 ++++
 5 files changed, 83 insertions(+), 24 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 6d16bea94e1c..8f8ed7d20bac 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -489,11 +489,13 @@ update_existing_ref(struct btrfs_trans_handle *trans,
  * existing and update must have the same bytenr
  */
 static noinline void
-update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
+update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
+			 struct btrfs_delayed_ref_node *existing,
 			 struct btrfs_delayed_ref_node *update)
 {
 	struct btrfs_delayed_ref_head *existing_ref;
 	struct btrfs_delayed_ref_head *ref;
+	int old_ref_mod;
 
 	existing_ref = btrfs_delayed_node_to_head(existing);
 	ref = btrfs_delayed_node_to_head(update);
@@ -541,7 +543,20 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 	 * only need the lock for this case cause we could be processing it
 	 * currently, for refs we just added we know we're a-ok.
 	 */
+	old_ref_mod = existing_ref->total_ref_mod;
 	existing->ref_mod += update->ref_mod;
+	existing_ref->total_ref_mod += update->ref_mod;
+
+	/*
+	 * If we are going to from a positive ref mod to a negative or vice
+	 * versa we need to make sure to adjust pending_csums accordingly.
+	 */
+	if (existing_ref->is_data) {
+		if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0)
+			delayed_refs->pending_csums -= existing->num_bytes;
+		if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0)
+			delayed_refs->pending_csums += existing->num_bytes;
+	}
 	spin_unlock(&existing_ref->lock);
 }
 
@@ -605,6 +620,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	head_ref->is_data = is_data;
 	head_ref->ref_root = RB_ROOT;
 	head_ref->processing = 0;
+	head_ref->total_ref_mod = count_mod;
 
 	spin_lock_init(&head_ref->lock);
 	mutex_init(&head_ref->mutex);
@@ -614,7 +630,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	existing = htree_insert(&delayed_refs->href_root,
 				&head_ref->href_node);
 	if (existing) {
-		update_existing_head_ref(&existing->node, ref);
+		update_existing_head_ref(delayed_refs, &existing->node, ref);
 		/*
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
@@ -622,6 +638,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
 		head_ref = existing;
 	} else {
+		if (is_data && count_mod < 0)
+			delayed_refs->pending_csums += num_bytes;
 		delayed_refs->num_heads++;
 		delayed_refs->num_heads_ready++;
 		atomic_inc(&delayed_refs->num_entries);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index a764e2340d48..5eb0892396d0 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -88,6 +88,14 @@ struct btrfs_delayed_ref_head {
 	struct rb_node href_node;
 
 	struct btrfs_delayed_extent_op *extent_op;
+
+	/*
+	 * This is used to track the final ref_mod from all the refs associated
+	 * with this head ref, this is not adjusted as delayed refs are run,
+	 * this is meant to track if we need to do the csum accounting or not.
+	 */
+	int total_ref_mod;
+
 	/*
 	 * when a new extent is allocated, it is just reserved in memory
 	 * The actual extent isn't inserted into the extent allocation tree
@@ -138,6 +146,8 @@ struct btrfs_delayed_ref_root {
 	/* total number of head nodes ready for processing */
 	unsigned long num_heads_ready;
 
+	u64 pending_csums;
+
 	/*
 	 * set when the tree is flushing before a transaction commit,
 	 * used by the throttling code to decide if new updates need
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 41e5812c131f..a6f88eb57b39 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2538,6 +2538,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		 * list before we release it.
 		 */
 		if (btrfs_delayed_ref_is_head(ref)) {
+			if (locked_ref->is_data &&
+			    locked_ref->total_ref_mod < 0) {
+				spin_lock(&delayed_refs->lock);
+				delayed_refs->pending_csums -= ref->num_bytes;
+				spin_unlock(&delayed_refs->lock);
+			}
 			btrfs_delayed_ref_unlock(locked_ref);
 			locked_ref = NULL;
 		}
@@ -2626,11 +2632,31 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
 	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
 }
 
+/*
+ * Takes the number of bytes to be csumm'ed and figures out how many leaves it
+ * would require to store the csums for that many bytes.
+ */
+static u64 csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
+{
+	u64 csum_size;
+	u64 num_csums_per_leaf;
+	u64 num_csums;
+
+	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+	num_csums_per_leaf = div64_u64(csum_size,
+			(u64)btrfs_super_csum_size(root->fs_info->super_copy));
+	num_csums = div64_u64(csum_bytes, root->sectorsize);
+	num_csums += num_csums_per_leaf - 1;
+	num_csums = div64_u64(num_csums, num_csums_per_leaf);
+	return num_csums;
+}
+
 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root)
 {
 	struct btrfs_block_rsv *global_rsv;
 	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
+	u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
 	u64 num_bytes;
 	int ret = 0;
 
@@ -2639,6 +2665,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 	if (num_heads > 1)
 		num_bytes += (num_heads - 1) * root->nodesize;
 	num_bytes <<= 1;
+	num_bytes += csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
 	global_rsv = &root->fs_info->global_block_rsv;
 
 	/*
@@ -5065,30 +5092,19 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
 				   int reserve)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 csum_size;
-	int num_csums_per_leaf;
-	int num_csums;
-	int old_csums;
+	u64 old_csums, num_csums;
 
 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
 	    BTRFS_I(inode)->csum_bytes == 0)
 		return 0;
 
-	old_csums = (int)div_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+	old_csums = csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
+
 	if (reserve)
 		BTRFS_I(inode)->csum_bytes += num_bytes;
 	else
 		BTRFS_I(inode)->csum_bytes -= num_bytes;
-	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
-	num_csums_per_leaf = (int)div_u64(csum_size,
-					    sizeof(struct btrfs_csum_item) +
-					    sizeof(struct btrfs_disk_key));
-	num_csums = (int)div_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
-	num_csums = num_csums + num_csums_per_leaf - 1;
-	num_csums = num_csums / num_csums_per_leaf;
-
-	old_csums = old_csums + num_csums_per_leaf - 1;
-	old_csums = old_csums / num_csums_per_leaf;
+	num_csums = csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
 
 	/* No change, no need to reserve more */
 	if (old_csums == num_csums)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e3fe137fb826..cec23cf812ee 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4197,9 +4197,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	int extent_type = -1;
 	int ret;
 	int err = 0;
-	int be_nice = 0;
 	u64 ino = btrfs_ino(inode);
 	u64 bytes_deleted = 0;
+	bool be_nice = 0;
+	bool should_throttle = 0;
 
 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
 
@@ -4405,19 +4406,20 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 						btrfs_header_owner(leaf),
 						ino, extent_offset, 0);
 			BUG_ON(ret);
-			if (be_nice && pending_del_nr &&
-			    (pending_del_nr % 16 == 0) &&
-			    bytes_deleted > 1024 * 1024) {
+			if (btrfs_should_throttle_delayed_refs(trans, root))
 				btrfs_async_run_delayed_refs(root,
 					trans->delayed_ref_updates * 2, 0);
-			}
 		}
 
 		if (found_type == BTRFS_INODE_ITEM_KEY)
 			break;
 
+		should_throttle =
+			btrfs_should_throttle_delayed_refs(trans, root);
+
 		if (path->slots[0] == 0 ||
-		    path->slots[0] != pending_del_slot) {
+		    path->slots[0] != pending_del_slot ||
+		    (be_nice && should_throttle)) {
 			if (pending_del_nr) {
 				ret = btrfs_del_items(trans, root, path,
 						pending_del_slot,
@@ -4430,6 +4432,15 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 				pending_del_nr = 0;
 			}
 			btrfs_release_path(path);
+			if (be_nice && should_throttle) {
+				unsigned long updates = trans->delayed_ref_updates;
+				if (updates) {
+					trans->delayed_ref_updates = 0;
+					ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+					if (ret && !err)
+						err = ret;
+				}
+			}
 			goto search_again;
 		} else {
 			path->slots[0]--;
@@ -4449,7 +4460,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
 	btrfs_free_path(path);
 
-	if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+	if (be_nice && btrfs_should_throttle_delayed_refs(trans, root)) {
 		unsigned long updates = trans->delayed_ref_updates;
 		if (updates) {
 			trans->delayed_ref_updates = 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index ba831ee41891..8b9eea8f2406 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -64,6 +64,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 	if (atomic_dec_and_test(&transaction->use_count)) {
 		BUG_ON(!list_empty(&transaction->list));
 		WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
+		if (transaction->delayed_refs.pending_csums)
+			printk(KERN_ERR "pending csums is %llu\n",
+			       transaction->delayed_refs.pending_csums);
 		while (!list_empty(&transaction->pending_chunks)) {
 			struct extent_map *em;
 
@@ -223,6 +226,7 @@ static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
 	cur_trans->delayed_refs.href_root = RB_ROOT;
 	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
 	cur_trans->delayed_refs.num_heads_ready = 0;
+	cur_trans->delayed_refs.pending_csums = 0;
 	cur_trans->delayed_refs.num_heads = 0;
 	cur_trans->delayed_refs.flushing = 0;
 	cur_trans->delayed_refs.run_delayed_start = 0;

From 28f75a0e6cdfbce8115487ecbc0968a2c4e01806 Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Wed, 4 Feb 2015 06:59:29 -0800
Subject: [PATCH 55/93] Btrfs: refill block reserves during truncate

When truncate starts, it allocates some space in the block reserves so
that we'll have enough to update metadata along the way.

For very large files, we can easily go through all of that space as we
loop through the extents.  This changes truncate to refill the space
reservation as it progresses through the file.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  3 +++
 fs/btrfs/extent-tree.c |  9 ++++-----
 fs/btrfs/inode.c       | 45 ++++++++++++++++++++++++++++++++++++------
 3 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 95944b81ed5c..6bf16d5134c5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3297,6 +3297,9 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
 }
 
 /* extent-tree.c */
+
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes);
+
 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 						 unsigned num_items)
 {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a6f88eb57b39..75f4bed6e6db 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2636,7 +2636,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
  * would require to store the csums for that many bytes.
  */
-static u64 csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
 {
 	u64 csum_size;
 	u64 num_csums_per_leaf;
@@ -2665,7 +2665,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 	if (num_heads > 1)
 		num_bytes += (num_heads - 1) * root->nodesize;
 	num_bytes <<= 1;
-	num_bytes += csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
+	num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
 	global_rsv = &root->fs_info->global_block_rsv;
 
 	/*
@@ -5098,13 +5098,12 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
 	    BTRFS_I(inode)->csum_bytes == 0)
 		return 0;
 
-	old_csums = csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
-
+	old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
 	if (reserve)
 		BTRFS_I(inode)->csum_bytes += num_bytes;
 	else
 		BTRFS_I(inode)->csum_bytes -= num_bytes;
-	num_csums = csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
+	num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
 
 	/* No change, no need to reserve more */
 	if (old_csums == num_csums)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cec23cf812ee..88537c52e114 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4163,6 +4163,21 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	return err;
 }
 
+static int truncate_space_check(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 bytes_deleted)
+{
+	int ret;
+
+	bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
+	ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
+				  bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
+	if (!ret)
+		trans->bytes_reserved += bytes_deleted;
+	return ret;
+
+}
+
 /*
  * this can truncate away extent items, csum items and directory items.
  * It starts at a high offset and removes keys until it can't find
@@ -4201,6 +4216,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	u64 bytes_deleted = 0;
 	bool be_nice = 0;
 	bool should_throttle = 0;
+	bool should_end = 0;
 
 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
 
@@ -4396,6 +4412,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 		} else {
 			break;
 		}
+		should_throttle = 0;
+
 		if (found_extent &&
 		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 		     root == root->fs_info->tree_root)) {
@@ -4409,17 +4427,24 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			if (btrfs_should_throttle_delayed_refs(trans, root))
 				btrfs_async_run_delayed_refs(root,
 					trans->delayed_ref_updates * 2, 0);
+			if (be_nice) {
+				if (truncate_space_check(trans, root,
+							 extent_num_bytes)) {
+					should_end = 1;
+				}
+				if (btrfs_should_throttle_delayed_refs(trans,
+								       root)) {
+					should_throttle = 1;
+				}
+			}
 		}
 
 		if (found_type == BTRFS_INODE_ITEM_KEY)
 			break;
 
-		should_throttle =
-			btrfs_should_throttle_delayed_refs(trans, root);
-
 		if (path->slots[0] == 0 ||
 		    path->slots[0] != pending_del_slot ||
-		    (be_nice && should_throttle)) {
+		    should_throttle || should_end) {
 			if (pending_del_nr) {
 				ret = btrfs_del_items(trans, root, path,
 						pending_del_slot,
@@ -4432,7 +4457,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 				pending_del_nr = 0;
 			}
 			btrfs_release_path(path);
-			if (be_nice && should_throttle) {
+			if (should_throttle) {
 				unsigned long updates = trans->delayed_ref_updates;
 				if (updates) {
 					trans->delayed_ref_updates = 0;
@@ -4441,6 +4466,14 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 						err = ret;
 				}
 			}
+			/*
+			 * if we failed to refill our space rsv, bail out
+			 * and let the transaction restart
+			 */
+			if (should_end) {
+				err = -EAGAIN;
+				goto error;
+			}
 			goto search_again;
 		} else {
 			path->slots[0]--;
@@ -4460,7 +4493,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
 	btrfs_free_path(path);
 
-	if (be_nice && btrfs_should_throttle_delayed_refs(trans, root)) {
+	if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
 		unsigned long updates = trans->delayed_ref_updates;
 		if (updates) {
 			trans->delayed_ref_updates = 0;

From cb723e491955ac11a1591ae25cada7c3b1470609 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Wed, 18 Feb 2015 08:06:57 -0800
Subject: [PATCH 56/93] Btrfs: reserve space for block groups

This changes our delayed refs calculations to include the space needed
to write back dirty block groups.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 12 +++++++++---
 fs/btrfs/transaction.c |  1 +
 fs/btrfs/transaction.h |  1 +
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 75f4bed6e6db..ae8db3ba467c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2657,7 +2657,8 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 	struct btrfs_block_rsv *global_rsv;
 	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
 	u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
-	u64 num_bytes;
+	u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
+	u64 num_bytes, num_dirty_bgs_bytes;
 	int ret = 0;
 
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
@@ -2666,17 +2667,21 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 		num_bytes += (num_heads - 1) * root->nodesize;
 	num_bytes <<= 1;
 	num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
+	num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
+							     num_dirty_bgs);
 	global_rsv = &root->fs_info->global_block_rsv;
 
 	/*
 	 * If we can't allocate any more chunks lets make sure we have _lots_ of
 	 * wiggle room since running delayed refs can create more delayed refs.
 	 */
-	if (global_rsv->space_info->full)
+	if (global_rsv->space_info->full) {
+		num_dirty_bgs_bytes <<= 1;
 		num_bytes <<= 1;
+	}
 
 	spin_lock(&global_rsv->lock);
-	if (global_rsv->reserved <= num_bytes)
+	if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
 		ret = 1;
 	spin_unlock(&global_rsv->lock);
 	return ret;
@@ -5408,6 +5413,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		if (list_empty(&cache->dirty_list)) {
 			list_add_tail(&cache->dirty_list,
 				      &trans->transaction->dirty_bgs);
+				trans->transaction->num_dirty_bgs++;
 			btrfs_get_block_group(cache);
 		}
 		spin_unlock(&trans->transaction->dirty_bgs_lock);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8b9eea8f2406..234d6063bbf3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -251,6 +251,7 @@ static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
 	INIT_LIST_HEAD(&cur_trans->switch_commits);
 	INIT_LIST_HEAD(&cur_trans->pending_ordered);
 	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+	cur_trans->num_dirty_bgs = 0;
 	spin_lock_init(&cur_trans->dirty_bgs_lock);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
 	extent_io_tree_init(&cur_trans->dirty_pages,
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 96b189b8898a..4cb0ae264534 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -64,6 +64,7 @@ struct btrfs_transaction {
 	struct list_head pending_ordered;
 	struct list_head switch_commits;
 	struct list_head dirty_bgs;
+	u64 num_dirty_bgs;
 	spinlock_t dirty_bgs_lock;
 	struct btrfs_delayed_ref_root delayed_refs;
 	int aborted;

From 365c5313776730acf433d54226f0216a1d075e9a Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Wed, 18 Feb 2015 13:58:15 -0800
Subject: [PATCH 57/93] Btrfs: don't commit the transaction in the async space
 flushing

We're triggering a huge number of commits from
btrfs_async_reclaim_metadata_space.  These aren't really requried,
because everyone calling the async reclaim code is going to end up
triggering a commit on their own.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ae8db3ba467c..3d4b3d680fea 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4329,8 +4329,13 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
 					struct btrfs_fs_info *fs_info, u64 used)
 {
-	return (used >= div_factor_fine(space_info->total_bytes, 98) &&
-		!btrfs_fs_closing(fs_info) &&
+	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+
+	/* If we're just plain full then async reclaim just slows us down. */
+	if (space_info->bytes_used >= thresh)
+		return 0;
+
+	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
 		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
 }
 
@@ -4385,10 +4390,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 		if (!btrfs_need_do_async_reclaim(space_info, fs_info,
 						 flush_state))
 			return;
-	} while (flush_state <= COMMIT_TRANS);
-
-	if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
-		queue_work(system_unbound_wq, work);
+	} while (flush_state < COMMIT_TRANS);
 }
 
 void btrfs_init_async_reclaim_work(struct work_struct *work)

From 3bce876fd58a745b8a1bc0bd8325c3e5b4cebeb0 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 24 Feb 2015 12:35:51 -0800
Subject: [PATCH 58/93] Btrfs: don't steal from the global reserve if we don't
 have the space

btrfs_evict_inode() needs to be more careful about stealing from the
global_rsv.  We dont' want to end up aborting commit with ENOSPC just
because the evict_inode code was too greedy.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 88537c52e114..141df0ce6f93 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5010,6 +5010,7 @@ void btrfs_evict_inode(struct inode *inode)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv, *global_rsv;
+	int steal_from_global = 0;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 	int ret;
 
@@ -5077,9 +5078,20 @@ void btrfs_evict_inode(struct inode *inode)
 		 * hard as possible to get this to work.
 		 */
 		if (ret)
-			ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
+			steal_from_global++;
+		else
+			steal_from_global = 0;
+		ret = 0;
 
-		if (ret) {
+		/*
+		 * steal_from_global == 0: we reserved stuff, hooray!
+		 * steal_from_global == 1: we didn't reserve stuff, boo!
+		 * steal_from_global == 2: we've committed, still not a lot of
+		 * room but maybe we'll have room in the global reserve this
+		 * time.
+		 * steal_from_global == 3: abandon all hope!
+		 */
+		if (steal_from_global > 2) {
 			btrfs_warn(root->fs_info,
 				"Could not get space for a delete, will truncate on mount %d",
 				ret);
@@ -5095,6 +5107,36 @@ void btrfs_evict_inode(struct inode *inode)
 			goto no_delete;
 		}
 
+		/*
+		 * We can't just steal from the global reserve, we need tomake
+		 * sure there is room to do it, if not we need to commit and try
+		 * again.
+		 */
+		if (steal_from_global) {
+			if (!btrfs_check_space_for_delayed_refs(trans, root))
+				ret = btrfs_block_rsv_migrate(global_rsv, rsv,
+							      min_size);
+			else
+				ret = -ENOSPC;
+		}
+
+		/*
+		 * Couldn't steal from the global reserve, we have too much
+		 * pending stuff built up, commit the transaction and try it
+		 * again.
+		 */
+		if (ret) {
+			ret = btrfs_commit_transaction(trans, root);
+			if (ret) {
+				btrfs_orphan_del(NULL, inode);
+				btrfs_free_block_rsv(root, rsv);
+				goto no_delete;
+			}
+			continue;
+		} else {
+			steal_from_global = 0;
+		}
+
 		trans->block_rsv = rsv;
 
 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);

From 4c6d1d85ad89fd8e32dc9204b7f944854399bda9 Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Mon, 6 Apr 2015 13:17:20 -0700
Subject: [PATCH 59/93] btrfs: move struct io_ctl into ctree.h and rename it

We'll need to put the io_ctl into the block_group cache struct, so
name it struct btrfs_io_ctl and move it into ctree.h

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h            | 11 ++++++++
 fs/btrfs/free-space-cache.c | 55 +++++++++++++++----------------------
 2 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6bf16d5134c5..e305ccd731fe 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1256,6 +1256,17 @@ struct btrfs_caching_control {
 	atomic_t count;
 };
 
+struct btrfs_io_ctl {
+	void *cur, *orig;
+	struct page *page;
+	struct page **pages;
+	struct btrfs_root *root;
+	unsigned long size;
+	int index;
+	int num_pages;
+	unsigned check_crcs:1;
+};
+
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index c51482031edf..47c2adb3ddf3 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -271,18 +271,7 @@ static int readahead_cache(struct inode *inode)
 	return 0;
 }
 
-struct io_ctl {
-	void *cur, *orig;
-	struct page *page;
-	struct page **pages;
-	struct btrfs_root *root;
-	unsigned long size;
-	int index;
-	int num_pages;
-	unsigned check_crcs:1;
-};
-
-static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
+static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
 		       struct btrfs_root *root, int write)
 {
 	int num_pages;
@@ -298,7 +287,7 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
 	    (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
 		return -ENOSPC;
 
-	memset(io_ctl, 0, sizeof(struct io_ctl));
+	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
 
 	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
 	if (!io_ctl->pages)
@@ -311,12 +300,12 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
 	return 0;
 }
 
-static void io_ctl_free(struct io_ctl *io_ctl)
+static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
 {
 	kfree(io_ctl->pages);
 }
 
-static void io_ctl_unmap_page(struct io_ctl *io_ctl)
+static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
 {
 	if (io_ctl->cur) {
 		kunmap(io_ctl->page);
@@ -325,7 +314,7 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)
 	}
 }
 
-static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
+static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
 {
 	ASSERT(io_ctl->index < io_ctl->num_pages);
 	io_ctl->page = io_ctl->pages[io_ctl->index++];
@@ -336,7 +325,7 @@ static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
 		memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
 }
 
-static void io_ctl_drop_pages(struct io_ctl *io_ctl)
+static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
 {
 	int i;
 
@@ -351,7 +340,7 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
 	}
 }
 
-static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
+static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
 				int uptodate)
 {
 	struct page *page;
@@ -385,7 +374,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
 	return 0;
 }
 
-static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
+static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
 {
 	__le64 *val;
 
@@ -408,7 +397,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
 	io_ctl->cur += sizeof(u64);
 }
 
-static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
+static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
 {
 	__le64 *gen;
 
@@ -437,7 +426,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
 	return 0;
 }
 
-static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
+static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
 {
 	u32 *tmp;
 	u32 crc = ~(u32)0;
@@ -461,7 +450,7 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
 	kunmap(io_ctl->pages[0]);
 }
 
-static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
+static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
 {
 	u32 *tmp, val;
 	u32 crc = ~(u32)0;
@@ -494,7 +483,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
 	return 0;
 }
 
-static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
+static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
 			    void *bitmap)
 {
 	struct btrfs_free_space_entry *entry;
@@ -524,7 +513,7 @@ static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
 	return 0;
 }
 
-static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
+static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
 {
 	if (!io_ctl->cur)
 		return -ENOSPC;
@@ -547,7 +536,7 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
 	return 0;
 }
 
-static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
+static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
 {
 	/*
 	 * If we're not on the boundary we know we've modified the page and we
@@ -564,7 +553,7 @@ static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
 	}
 }
 
-static int io_ctl_read_entry(struct io_ctl *io_ctl,
+static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
 			    struct btrfs_free_space *entry, u8 *type)
 {
 	struct btrfs_free_space_entry *e;
@@ -591,7 +580,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
 	return 0;
 }
 
-static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
+static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
 			      struct btrfs_free_space *entry)
 {
 	int ret;
@@ -650,7 +639,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 {
 	struct btrfs_free_space_header *header;
 	struct extent_buffer *leaf;
-	struct io_ctl io_ctl;
+	struct btrfs_io_ctl io_ctl;
 	struct btrfs_key key;
 	struct btrfs_free_space *e, *n;
 	LIST_HEAD(bitmaps);
@@ -879,7 +868,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
 }
 
 static noinline_for_stack
-int write_cache_extent_entries(struct io_ctl *io_ctl,
+int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
 			      struct btrfs_free_space_ctl *ctl,
 			      struct btrfs_block_group_cache *block_group,
 			      int *entries, int *bitmaps,
@@ -1002,7 +991,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
 static noinline_for_stack int
 write_pinned_extent_entries(struct btrfs_root *root,
 			    struct btrfs_block_group_cache *block_group,
-			    struct io_ctl *io_ctl,
+			    struct btrfs_io_ctl *io_ctl,
 			    int *entries)
 {
 	u64 start, extent_start, extent_end, len;
@@ -1052,7 +1041,7 @@ write_pinned_extent_entries(struct btrfs_root *root,
 }
 
 static noinline_for_stack int
-write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list)
+write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
 {
 	struct list_head *pos, *n;
 	int ret;
@@ -1086,7 +1075,7 @@ static int flush_dirty_cache(struct inode *inode)
 
 static void noinline_for_stack
 cleanup_write_cache_enospc(struct inode *inode,
-			   struct io_ctl *io_ctl,
+			   struct btrfs_io_ctl *io_ctl,
 			   struct extent_state **cached_state,
 			   struct list_head *bitmap_list)
 {
@@ -1123,7 +1112,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 				   struct btrfs_path *path, u64 offset)
 {
 	struct extent_state *cached_state = NULL;
-	struct io_ctl io_ctl;
+	struct btrfs_io_ctl io_ctl;
 	LIST_HEAD(bitmap_list);
 	int entries = 0;
 	int bitmaps = 0;

From c9dc4c6578502c2085705347375b82089aad18d0 Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Sat, 4 Apr 2015 17:14:42 -0700
Subject: [PATCH 60/93] Btrfs: two stage dirty block group writeout

Block group cache writeout is currently waiting on the pages for each
block group cache before moving on to writing the next one.  This commit
switches things around to send down all the caches and then wait on them
in batches.

The end result is much faster, since we're keeping the disk pipeline
full.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h            |   6 ++
 fs/btrfs/extent-tree.c      |  57 ++++++++++++++--
 fs/btrfs/free-space-cache.c | 131 ++++++++++++++++++++++++++++--------
 fs/btrfs/free-space-cache.h |   8 ++-
 4 files changed, 170 insertions(+), 32 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e305ccd731fe..1df0d9db5332 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1261,9 +1261,12 @@ struct btrfs_io_ctl {
 	struct page *page;
 	struct page **pages;
 	struct btrfs_root *root;
+	struct inode *inode;
 	unsigned long size;
 	int index;
 	int num_pages;
+	int entries;
+	int bitmaps;
 	unsigned check_crcs:1;
 };
 
@@ -1332,6 +1335,9 @@ struct btrfs_block_group_cache {
 
 	/* For dirty block groups */
 	struct list_head dirty_list;
+	struct list_head io_list;
+
+	struct btrfs_io_ctl io_ctl;
 };
 
 /* delayed seq elem */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d4b3d680fea..40c95135d037 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3388,7 +3388,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	int ret = 0;
+	int should_put;
 	struct btrfs_path *path;
+	LIST_HEAD(io);
+	int num_started = 0;
+	int num_waited = 0;
 
 	if (list_empty(&cur_trans->dirty_bgs))
 		return 0;
@@ -3407,16 +3411,60 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		cache = list_first_entry(&cur_trans->dirty_bgs,
 					 struct btrfs_block_group_cache,
 					 dirty_list);
+
+		/*
+		 * this can happen if cache_save_setup re-dirties a block
+		 * group that is already under IO.  Just wait for it to
+		 * finish and then do it all again
+		 */
+		if (!list_empty(&cache->io_list)) {
+			list_del_init(&cache->io_list);
+			btrfs_wait_cache_io(root, trans, cache,
+					    &cache->io_ctl, path,
+					    cache->key.objectid);
+			btrfs_put_block_group(cache);
+			num_waited++;
+		}
+
 		list_del_init(&cache->dirty_list);
+		should_put = 1;
+
 		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
 			cache_save_setup(cache, trans, path);
+
 		if (!ret)
-			ret = btrfs_run_delayed_refs(trans, root,
-						     (unsigned long) -1);
-		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
-			btrfs_write_out_cache(root, trans, cache, path);
+			ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
+
+		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
+			cache->io_ctl.inode = NULL;
+			ret = btrfs_write_out_cache(root, trans, cache, path);
+			if (ret == 0 && cache->io_ctl.inode) {
+				num_started++;
+				should_put = 0;
+				list_add_tail(&cache->io_list, &io);
+			} else {
+				/*
+				 * if we failed to write the cache, the
+				 * generation will be bad and life goes on
+				 */
+				ret = 0;
+			}
+		}
 		if (!ret)
 			ret = write_one_cache_group(trans, root, path, cache);
+
+		/* if its not on the io list, we need to put the block group */
+		if (should_put)
+			btrfs_put_block_group(cache);
+	}
+
+	while (!list_empty(&io)) {
+		cache = list_first_entry(&io, struct btrfs_block_group_cache,
+					 io_list);
+		list_del_init(&cache->io_list);
+		num_waited++;
+		btrfs_wait_cache_io(root, trans, cache,
+				    &cache->io_ctl, path, cache->key.objectid);
 		btrfs_put_block_group(cache);
 	}
 
@@ -9013,6 +9061,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
 	INIT_LIST_HEAD(&cache->bg_list);
 	INIT_LIST_HEAD(&cache->ro_list);
 	INIT_LIST_HEAD(&cache->dirty_list);
+	INIT_LIST_HEAD(&cache->io_list);
 	btrfs_init_free_space_ctl(cache);
 	atomic_set(&cache->trimming, 0);
 
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 47c2adb3ddf3..6886ae063483 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -170,13 +170,13 @@ static int __create_free_space_inode(struct btrfs_root *root,
 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 	key.offset = offset;
 	key.type = 0;
-
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      sizeof(struct btrfs_free_space_header));
 	if (ret < 0) {
 		btrfs_release_path(path);
 		return ret;
 	}
+
 	leaf = path->nodes[0];
 	header = btrfs_item_ptr(leaf, path->slots[0],
 				struct btrfs_free_space_header);
@@ -296,6 +296,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
 	io_ctl->num_pages = num_pages;
 	io_ctl->root = root;
 	io_ctl->check_crcs = check_crcs;
+	io_ctl->inode = inode;
 
 	return 0;
 }
@@ -303,6 +304,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
 {
 	kfree(io_ctl->pages);
+	io_ctl->pages = NULL;
 }
 
 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
@@ -1092,6 +1094,61 @@ cleanup_write_cache_enospc(struct inode *inode,
 			     GFP_NOFS);
 }
 
+int btrfs_wait_cache_io(struct btrfs_root *root,
+			struct btrfs_trans_handle *trans,
+			struct btrfs_block_group_cache *block_group,
+			struct btrfs_io_ctl *io_ctl,
+			struct btrfs_path *path, u64 offset)
+{
+	int ret;
+	struct inode *inode = io_ctl->inode;
+
+	root = root->fs_info->tree_root;
+
+	/* Flush the dirty pages in the cache file. */
+	ret = flush_dirty_cache(inode);
+	if (ret)
+		goto out;
+
+	/* Update the cache item to tell everyone this cache file is valid. */
+	ret = update_cache_item(trans, root, inode, path, offset,
+				io_ctl->entries, io_ctl->bitmaps);
+out:
+	io_ctl_free(io_ctl);
+	if (ret) {
+		invalidate_inode_pages2(inode->i_mapping);
+		BTRFS_I(inode)->generation = 0;
+		if (block_group) {
+#ifdef DEBUG
+			btrfs_err(root->fs_info,
+				"failed to write free space cache for block group %llu",
+				block_group->key.objectid);
+#endif
+		}
+	}
+	btrfs_update_inode(trans, root, inode);
+
+	if (block_group) {
+		spin_lock(&block_group->lock);
+
+		/*
+		 * only mark this as written if we didn't get put back on
+		 * the dirty list while waiting for IO.
+		 */
+		if (!ret && list_empty(&block_group->dirty_list))
+			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+		else if (ret)
+			block_group->disk_cache_state = BTRFS_DC_ERROR;
+
+		spin_unlock(&block_group->lock);
+		io_ctl->inode = NULL;
+		iput(inode);
+	}
+
+	return ret;
+
+}
+
 /**
  * __btrfs_write_out_cache - write out cached info to an inode
  * @root - the root the inode belongs to
@@ -1108,20 +1165,22 @@ cleanup_write_cache_enospc(struct inode *inode,
 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 				   struct btrfs_free_space_ctl *ctl,
 				   struct btrfs_block_group_cache *block_group,
+				   struct btrfs_io_ctl *io_ctl,
 				   struct btrfs_trans_handle *trans,
 				   struct btrfs_path *path, u64 offset)
 {
 	struct extent_state *cached_state = NULL;
-	struct btrfs_io_ctl io_ctl;
 	LIST_HEAD(bitmap_list);
 	int entries = 0;
 	int bitmaps = 0;
 	int ret;
+	int must_iput = 0;
 
 	if (!i_size_read(inode))
 		return -1;
 
-	ret = io_ctl_init(&io_ctl, inode, root, 1);
+	WARN_ON(io_ctl->pages);
+	ret = io_ctl_init(io_ctl, inode, root, 1);
 	if (ret)
 		return -1;
 
@@ -1134,22 +1193,23 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 			up_write(&block_group->data_rwsem);
 			BTRFS_I(inode)->generation = 0;
 			ret = 0;
+			must_iput = 1;
 			goto out;
 		}
 		spin_unlock(&block_group->lock);
 	}
 
 	/* Lock all pages first so we can lock the extent safely. */
-	io_ctl_prepare_pages(&io_ctl, inode, 0);
+	io_ctl_prepare_pages(io_ctl, inode, 0);
 
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
 			 0, &cached_state);
 
-	io_ctl_set_generation(&io_ctl, trans->transid);
+	io_ctl_set_generation(io_ctl, trans->transid);
 
 	mutex_lock(&ctl->cache_writeout_mutex);
 	/* Write out the extent entries in the free space cache */
-	ret = write_cache_extent_entries(&io_ctl, ctl,
+	ret = write_cache_extent_entries(io_ctl, ctl,
 					 block_group, &entries, &bitmaps,
 					 &bitmap_list);
 	if (ret) {
@@ -1162,7 +1222,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * they will be added into free space cache after the transaction is
 	 * committed, we shouldn't lose them.
 	 */
-	ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
+	ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
 	if (ret) {
 		mutex_unlock(&ctl->cache_writeout_mutex);
 		goto out_nospc;
@@ -1173,16 +1233,16 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * locked while doing it because a concurrent trim can be manipulating
 	 * or freeing the bitmap.
 	 */
-	ret = write_bitmap_entries(&io_ctl, &bitmap_list);
+	ret = write_bitmap_entries(io_ctl, &bitmap_list);
 	mutex_unlock(&ctl->cache_writeout_mutex);
 	if (ret)
 		goto out_nospc;
 
 	/* Zero out the rest of the pages just to make sure */
-	io_ctl_zero_remaining_pages(&io_ctl);
+	io_ctl_zero_remaining_pages(io_ctl);
 
 	/* Everything is written out, now we dirty the pages in the file. */
-	ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
+	ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
 				0, i_size_read(inode), &cached_state);
 	if (ret)
 		goto out_nospc;
@@ -1193,30 +1253,39 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * Release the pages and unlock the extent, we will flush
 	 * them out later
 	 */
-	io_ctl_drop_pages(&io_ctl);
+	io_ctl_drop_pages(io_ctl);
 
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
 			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);
 
-	/* Flush the dirty pages in the cache file. */
-	ret = flush_dirty_cache(inode);
+	/*
+	 * at this point the pages are under IO and we're happy,
+	 * The caller is responsible for waiting on them and updating the
+	 * the cache and the inode
+	 */
+	io_ctl->entries = entries;
+	io_ctl->bitmaps = bitmaps;
+
+	ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
 	if (ret)
 		goto out;
 
-	/* Update the cache item to tell everyone this cache file is valid. */
-	ret = update_cache_item(trans, root, inode, path, offset,
-				entries, bitmaps);
+	return 0;
+
 out:
-	io_ctl_free(&io_ctl);
+	io_ctl->inode = NULL;
+	io_ctl_free(io_ctl);
 	if (ret) {
 		invalidate_inode_pages2(inode->i_mapping);
 		BTRFS_I(inode)->generation = 0;
 	}
 	btrfs_update_inode(trans, root, inode);
+	if (must_iput)
+		iput(inode);
 	return ret;
 
 out_nospc:
-	cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
+	cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
 
 	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
 		up_write(&block_group->data_rwsem);
@@ -1232,7 +1301,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct inode *inode;
 	int ret = 0;
-	enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
 
 	root = root->fs_info->tree_root;
 
@@ -1253,22 +1321,28 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 	if (IS_ERR(inode))
 		return 0;
 
-	ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
+	ret = __btrfs_write_out_cache(root, inode, ctl, block_group,
+				      &block_group->io_ctl, trans,
 				      path, block_group->key.objectid);
 	if (ret) {
-		dcs = BTRFS_DC_ERROR;
-		ret = 0;
 #ifdef DEBUG
 		btrfs_err(root->fs_info,
 			"failed to write free space cache for block group %llu",
 			block_group->key.objectid);
 #endif
+		spin_lock(&block_group->lock);
+		block_group->disk_cache_state = BTRFS_DC_ERROR;
+		spin_unlock(&block_group->lock);
+
+		block_group->io_ctl.inode = NULL;
+		iput(inode);
 	}
 
-	spin_lock(&block_group->lock);
-	block_group->disk_cache_state = dcs;
-	spin_unlock(&block_group->lock);
-	iput(inode);
+	/*
+	 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
+	 * to wait for IO and put the inode
+	 */
+
 	return ret;
 }
 
@@ -3331,11 +3405,14 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
 {
 	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
 	int ret;
+	struct btrfs_io_ctl io_ctl;
 
 	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
 		return 0;
 
-	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
+	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
+				      trans, path, 0) ||
+		btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
 	if (ret) {
 		btrfs_delalloc_release_metadata(inode, inode->i_size);
 #ifdef DEBUG
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 88b2238a0aed..c4339863af05 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -48,6 +48,8 @@ struct btrfs_free_space_op {
 			   struct btrfs_free_space *info);
 };
 
+struct btrfs_io_ctl;
+
 struct inode *lookup_free_space_inode(struct btrfs_root *root,
 				      struct btrfs_block_group_cache
 				      *block_group, struct btrfs_path *path);
@@ -63,11 +65,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct inode *inode);
 int load_free_space_cache(struct btrfs_fs_info *fs_info,
 			  struct btrfs_block_group_cache *block_group);
+int btrfs_wait_cache_io(struct btrfs_root *root,
+			struct btrfs_trans_handle *trans,
+			struct btrfs_block_group_cache *block_group,
+			struct btrfs_io_ctl *io_ctl,
+			struct btrfs_path *path, u64 offset);
 int btrfs_write_out_cache(struct btrfs_root *root,
 			  struct btrfs_trans_handle *trans,
 			  struct btrfs_block_group_cache *block_group,
 			  struct btrfs_path *path);
-
 struct inode *lookup_free_ino_inode(struct btrfs_root *root,
 				    struct btrfs_path *path);
 int create_free_ino_inode(struct btrfs_root *root,

From 2b108268006e06d57ec9810f4ccf5d99d7e5b598 Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Mon, 6 Apr 2015 07:48:20 -0700
Subject: [PATCH 61/93] Btrfs: don't use highmem for free space cache pages

In order to create the free space cache concurrently with FS modifications,
we need to take a few block group locks.

The cache code also does kmap, which would schedule with the locks held.
Instead of going through kmap_atomic, lets just use lowmem for the cache
pages.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/free-space-cache.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6886ae063483..83532a245947 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -85,7 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
 	}
 
 	mapping_set_gfp_mask(inode->i_mapping,
-			mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+			mapping_gfp_mask(inode->i_mapping) &
+			~(GFP_NOFS & ~__GFP_HIGHMEM));
 
 	return inode;
 }
@@ -310,7 +311,6 @@ static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
 {
 	if (io_ctl->cur) {
-		kunmap(io_ctl->page);
 		io_ctl->cur = NULL;
 		io_ctl->orig = NULL;
 	}
@@ -320,7 +320,7 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
 {
 	ASSERT(io_ctl->index < io_ctl->num_pages);
 	io_ctl->page = io_ctl->pages[io_ctl->index++];
-	io_ctl->cur = kmap(io_ctl->page);
+	io_ctl->cur = page_address(io_ctl->page);
 	io_ctl->orig = io_ctl->cur;
 	io_ctl->size = PAGE_CACHE_SIZE;
 	if (clear)
@@ -446,10 +446,9 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
 			      PAGE_CACHE_SIZE - offset);
 	btrfs_csum_final(crc, (char *)&crc);
 	io_ctl_unmap_page(io_ctl);
-	tmp = kmap(io_ctl->pages[0]);
+	tmp = page_address(io_ctl->pages[0]);
 	tmp += index;
 	*tmp = crc;
-	kunmap(io_ctl->pages[0]);
 }
 
 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
@@ -466,10 +465,9 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
 	if (index == 0)
 		offset = sizeof(u32) * io_ctl->num_pages;
 
-	tmp = kmap(io_ctl->pages[0]);
+	tmp = page_address(io_ctl->pages[0]);
 	tmp += index;
 	val = *tmp;
-	kunmap(io_ctl->pages[0]);
 
 	io_ctl_map_page(io_ctl, 0);
 	crc = btrfs_csum_data(io_ctl->orig + offset, crc,

From 1bbc621ef28462456131c035eaeb5567a1a2a2fe Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Mon, 6 Apr 2015 12:46:08 -0700
Subject: [PATCH 62/93] Btrfs: allow block group cache writeout outside
 critical section in commit

We loop through all of the dirty block groups during commit and write
the free space cache.  In order to make sure the cache is currect, we do
this while no other writers are allowed in the commit.

If a large number of block groups are dirty, this can introduce long
stalls during the final stages of the commit, which can block new procs
trying to change the filesystem.

This commit changes the block group cache writeout to take appropriate
locks and allow it to run earlier in the commit.  We'll still have to
redo some of the block groups, but it means we can get most of the work
out of the way without blocking the entire FS.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h            |   8 ++
 fs/btrfs/disk-io.c          |   1 +
 fs/btrfs/extent-tree.c      | 241 ++++++++++++++++++++++++++++++++----
 fs/btrfs/free-space-cache.c |  69 +++++++++--
 fs/btrfs/free-space-cache.h |   1 +
 fs/btrfs/inode-map.c        |   2 +-
 fs/btrfs/relocation.c       |   9 +-
 fs/btrfs/transaction.c      |  38 +++++-
 fs/btrfs/transaction.h      |   9 ++
 9 files changed, 341 insertions(+), 37 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1df0d9db5332..83051fae9467 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1491,6 +1491,12 @@ struct btrfs_fs_info {
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
 
+	/*
+	 * this is taken to make sure we don't set block groups ro after
+	 * the free space cache has been allocated on them
+	 */
+	struct mutex ro_block_group_mutex;
+
 	/* this is used during read/modify/write to make sure
 	 * no two ios are trying to mod the same stripe at the same
 	 * time
@@ -3407,6 +3413,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 root_objectid, u64 owner, u64 offset, int no_quota);
 
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 568cc4e3d80e..b5e3d5f6400a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2572,6 +2572,7 @@ int open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
+	mutex_init(&fs_info->ro_block_group_mutex);
 	init_rwsem(&fs_info->commit_root_sem);
 	init_rwsem(&fs_info->cleanup_work_sem);
 	init_rwsem(&fs_info->subvol_sem);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 40c95135d037..02c2b29a0840 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3298,7 +3298,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
 		if (ret)
 			goto out_put;
 
-		ret = btrfs_truncate_free_space_cache(root, trans, inode);
+		ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
 		if (ret)
 			goto out_put;
 	}
@@ -3382,6 +3382,146 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/*
+ * transaction commit does final block group cache writeback during a
+ * critical section where nothing is allowed to change the FS.  This is
+ * required in order for the cache to actually match the block group,
+ * but can introduce a lot of latency into the commit.
+ *
+ * So, btrfs_start_dirty_block_groups is here to kick off block group
+ * cache IO.  There's a chance we'll have to redo some of it if the
+ * block group changes again during the commit, but it greatly reduces
+ * the commit latency by getting rid of the easy block groups while
+ * we're still allowing others to join the commit.
+ */
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root)
+{
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	int ret = 0;
+	int should_put;
+	struct btrfs_path *path = NULL;
+	LIST_HEAD(dirty);
+	struct list_head *io = &cur_trans->io_bgs;
+	int num_started = 0;
+	int loops = 0;
+
+	spin_lock(&cur_trans->dirty_bgs_lock);
+	if (!list_empty(&cur_trans->dirty_bgs)) {
+		list_splice_init(&cur_trans->dirty_bgs, &dirty);
+	}
+	spin_unlock(&cur_trans->dirty_bgs_lock);
+
+again:
+	if (list_empty(&dirty)) {
+		btrfs_free_path(path);
+		return 0;
+	}
+
+	/*
+	 * make sure all the block groups on our dirty list actually
+	 * exist
+	 */
+	btrfs_create_pending_block_groups(trans, root);
+
+	if (!path) {
+		path = btrfs_alloc_path();
+		if (!path)
+			return -ENOMEM;
+	}
+
+	while (!list_empty(&dirty)) {
+		cache = list_first_entry(&dirty,
+					 struct btrfs_block_group_cache,
+					 dirty_list);
+
+		/*
+		 * cache_write_mutex is here only to save us from balance
+		 * deleting this block group while we are writing out the
+		 * cache
+		 */
+		mutex_lock(&trans->transaction->cache_write_mutex);
+
+		/*
+		 * this can happen if something re-dirties a block
+		 * group that is already under IO.  Just wait for it to
+		 * finish and then do it all again
+		 */
+		if (!list_empty(&cache->io_list)) {
+			list_del_init(&cache->io_list);
+			btrfs_wait_cache_io(root, trans, cache,
+					    &cache->io_ctl, path,
+					    cache->key.objectid);
+			btrfs_put_block_group(cache);
+		}
+
+
+		/*
+		 * btrfs_wait_cache_io uses the cache->dirty_list to decide
+		 * if it should update the cache_state.  Don't delete
+		 * until after we wait.
+		 *
+		 * Since we're not running in the commit critical section
+		 * we need the dirty_bgs_lock to protect from update_block_group
+		 */
+		spin_lock(&cur_trans->dirty_bgs_lock);
+		list_del_init(&cache->dirty_list);
+		spin_unlock(&cur_trans->dirty_bgs_lock);
+
+		should_put = 1;
+
+		cache_save_setup(cache, trans, path);
+
+		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
+			cache->io_ctl.inode = NULL;
+			ret = btrfs_write_out_cache(root, trans, cache, path);
+			if (ret == 0 && cache->io_ctl.inode) {
+				num_started++;
+				should_put = 0;
+
+				/*
+				 * the cache_write_mutex is protecting
+				 * the io_list
+				 */
+				list_add_tail(&cache->io_list, io);
+			} else {
+				/*
+				 * if we failed to write the cache, the
+				 * generation will be bad and life goes on
+				 */
+				ret = 0;
+			}
+		}
+		if (!ret)
+			ret = write_one_cache_group(trans, root, path, cache);
+		mutex_unlock(&trans->transaction->cache_write_mutex);
+
+		/* if its not on the io list, we need to put the block group */
+		if (should_put)
+			btrfs_put_block_group(cache);
+
+		if (ret)
+			break;
+	}
+
+	/*
+	 * go through delayed refs for all the stuff we've just kicked off
+	 * and then loop back (just once)
+	 */
+	ret = btrfs_run_delayed_refs(trans, root, 0);
+	if (!ret && loops == 0) {
+		loops++;
+		spin_lock(&cur_trans->dirty_bgs_lock);
+		list_splice_init(&cur_trans->dirty_bgs, &dirty);
+		spin_unlock(&cur_trans->dirty_bgs_lock);
+		goto again;
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
@@ -3390,12 +3530,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	int should_put;
 	struct btrfs_path *path;
-	LIST_HEAD(io);
+	struct list_head *io = &cur_trans->io_bgs;
 	int num_started = 0;
-	int num_waited = 0;
-
-	if (list_empty(&cur_trans->dirty_bgs))
-		return 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -3423,14 +3559,16 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 					    &cache->io_ctl, path,
 					    cache->key.objectid);
 			btrfs_put_block_group(cache);
-			num_waited++;
 		}
 
+		/*
+		 * don't remove from the dirty list until after we've waited
+		 * on any pending IO
+		 */
 		list_del_init(&cache->dirty_list);
 		should_put = 1;
 
-		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
-			cache_save_setup(cache, trans, path);
+		cache_save_setup(cache, trans, path);
 
 		if (!ret)
 			ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
@@ -3441,7 +3579,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 			if (ret == 0 && cache->io_ctl.inode) {
 				num_started++;
 				should_put = 0;
-				list_add_tail(&cache->io_list, &io);
+				list_add_tail(&cache->io_list, io);
 			} else {
 				/*
 				 * if we failed to write the cache, the
@@ -3458,11 +3596,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 			btrfs_put_block_group(cache);
 	}
 
-	while (!list_empty(&io)) {
-		cache = list_first_entry(&io, struct btrfs_block_group_cache,
+	while (!list_empty(io)) {
+		cache = list_first_entry(io, struct btrfs_block_group_cache,
 					 io_list);
 		list_del_init(&cache->io_list);
-		num_waited++;
 		btrfs_wait_cache_io(root, trans, cache,
 				    &cache->io_ctl, path, cache->key.objectid);
 		btrfs_put_block_group(cache);
@@ -5459,15 +5596,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
 			cache_block_group(cache, 1);
 
-		spin_lock(&trans->transaction->dirty_bgs_lock);
-		if (list_empty(&cache->dirty_list)) {
-			list_add_tail(&cache->dirty_list,
-				      &trans->transaction->dirty_bgs);
-				trans->transaction->num_dirty_bgs++;
-			btrfs_get_block_group(cache);
-		}
-		spin_unlock(&trans->transaction->dirty_bgs_lock);
-
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
 
@@ -5516,6 +5644,16 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 				spin_unlock(&info->unused_bgs_lock);
 			}
 		}
+
+		spin_lock(&trans->transaction->dirty_bgs_lock);
+		if (list_empty(&cache->dirty_list)) {
+			list_add_tail(&cache->dirty_list,
+				      &trans->transaction->dirty_bgs);
+				trans->transaction->num_dirty_bgs++;
+			btrfs_get_block_group(cache);
+		}
+		spin_unlock(&trans->transaction->dirty_bgs_lock);
+
 		btrfs_put_block_group(cache);
 		total -= num_bytes;
 		bytenr += num_bytes;
@@ -8602,10 +8740,30 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 
 	BUG_ON(cache->ro);
 
+again:
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
+	/*
+	 * we're not allowed to set block groups readonly after the dirty
+	 * block groups cache has started writing.  If it already started,
+	 * back off and let this transaction commit
+	 */
+	mutex_lock(&root->fs_info->ro_block_group_mutex);
+	if (trans->transaction->dirty_bg_run) {
+		u64 transid = trans->transid;
+
+		mutex_unlock(&root->fs_info->ro_block_group_mutex);
+		btrfs_end_transaction(trans, root);
+
+		ret = btrfs_wait_for_commit(root, transid);
+		if (ret)
+			return ret;
+		goto again;
+	}
+
+
 	ret = set_block_group_ro(cache, 0);
 	if (!ret)
 		goto out;
@@ -8620,6 +8778,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 		alloc_flags = update_block_group_flags(root, cache->flags);
 		check_system_chunk(trans, root, alloc_flags);
 	}
+	mutex_unlock(&root->fs_info->ro_block_group_mutex);
 
 	btrfs_end_transaction(trans, root);
 	return ret;
@@ -9425,7 +9584,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
+	/*
+	 * get the inode first so any iput calls done for the io_list
+	 * aren't the final iput (no unlinks allowed now)
+	 */
 	inode = lookup_free_space_inode(tree_root, block_group, path);
+
+	mutex_lock(&trans->transaction->cache_write_mutex);
+	/*
+	 * make sure our free spache cache IO is done before remove the
+	 * free space inode
+	 */
+	spin_lock(&trans->transaction->dirty_bgs_lock);
+	if (!list_empty(&block_group->io_list)) {
+		list_del_init(&block_group->io_list);
+
+		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
+
+		spin_unlock(&trans->transaction->dirty_bgs_lock);
+		btrfs_wait_cache_io(root, trans, block_group,
+				    &block_group->io_ctl, path,
+				    block_group->key.objectid);
+		btrfs_put_block_group(block_group);
+		spin_lock(&trans->transaction->dirty_bgs_lock);
+	}
+
+	if (!list_empty(&block_group->dirty_list)) {
+		list_del_init(&block_group->dirty_list);
+		btrfs_put_block_group(block_group);
+	}
+	spin_unlock(&trans->transaction->dirty_bgs_lock);
+	mutex_unlock(&trans->transaction->cache_write_mutex);
+
 	if (!IS_ERR(inode)) {
 		ret = btrfs_orphan_add(trans, inode);
 		if (ret) {
@@ -9518,11 +9708,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	spin_lock(&trans->transaction->dirty_bgs_lock);
 	if (!list_empty(&block_group->dirty_list)) {
-		list_del_init(&block_group->dirty_list);
-		btrfs_put_block_group(block_group);
+		WARN_ON(1);
+	}
+	if (!list_empty(&block_group->io_list)) {
+		WARN_ON(1);
 	}
 	spin_unlock(&trans->transaction->dirty_bgs_lock);
-
 	btrfs_remove_free_space_cache(block_group);
 
 	spin_lock(&block_group->space_info->lock);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 83532a245947..253cb74b0e27 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -226,9 +226,37 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
 
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_trans_handle *trans,
+				    struct btrfs_block_group_cache *block_group,
 				    struct inode *inode)
 {
 	int ret = 0;
+	struct btrfs_path *path = btrfs_alloc_path();
+
+	if (!path) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	if (block_group) {
+		mutex_lock(&trans->transaction->cache_write_mutex);
+		if (!list_empty(&block_group->io_list)) {
+			list_del_init(&block_group->io_list);
+
+			btrfs_wait_cache_io(root, trans, block_group,
+					    &block_group->io_ctl, path,
+					    block_group->key.objectid);
+			btrfs_put_block_group(block_group);
+		}
+
+		/*
+		 * now that we've truncated the cache away, its no longer
+		 * setup or written
+		 */
+		spin_lock(&block_group->lock);
+		block_group->disk_cache_state = BTRFS_DC_CLEAR;
+		spin_unlock(&block_group->lock);
+	}
+	btrfs_free_path(path);
 
 	btrfs_i_size_write(inode, 0);
 	truncate_pagecache(inode, 0);
@@ -242,11 +270,17 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	ret = btrfs_truncate_inode_items(trans, root, inode,
 					 0, BTRFS_EXTENT_DATA_KEY);
 	if (ret) {
+		mutex_unlock(&trans->transaction->cache_write_mutex);
 		btrfs_abort_transaction(trans, root, ret);
 		return ret;
 	}
 
 	ret = btrfs_update_inode(trans, root, inode);
+
+	if (block_group)
+		mutex_unlock(&trans->transaction->cache_write_mutex);
+
+fail:
 	if (ret)
 		btrfs_abort_transaction(trans, root, ret);
 
@@ -876,6 +910,7 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
 {
 	int ret;
 	struct btrfs_free_cluster *cluster = NULL;
+	struct btrfs_free_cluster *cluster_locked = NULL;
 	struct rb_node *node = rb_first(&ctl->free_space_offset);
 	struct btrfs_trim_range *trim_entry;
 
@@ -887,6 +922,8 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
 	}
 
 	if (!node && cluster) {
+		cluster_locked = cluster;
+		spin_lock(&cluster_locked->lock);
 		node = rb_first(&cluster->root);
 		cluster = NULL;
 	}
@@ -910,9 +947,15 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
 		node = rb_next(node);
 		if (!node && cluster) {
 			node = rb_first(&cluster->root);
+			cluster_locked = cluster;
+			spin_lock(&cluster_locked->lock);
 			cluster = NULL;
 		}
 	}
+	if (cluster_locked) {
+		spin_unlock(&cluster_locked->lock);
+		cluster_locked = NULL;
+	}
 
 	/*
 	 * Make sure we don't miss any range that was removed from our rbtree
@@ -930,6 +973,8 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
 
 	return 0;
 fail:
+	if (cluster_locked)
+		spin_unlock(&cluster_locked->lock);
 	return -ENOSPC;
 }
 
@@ -1101,6 +1146,9 @@ int btrfs_wait_cache_io(struct btrfs_root *root,
 	int ret;
 	struct inode *inode = io_ctl->inode;
 
+	if (!inode)
+		return 0;
+
 	root = root->fs_info->tree_root;
 
 	/* Flush the dirty pages in the cache file. */
@@ -1127,11 +1175,16 @@ int btrfs_wait_cache_io(struct btrfs_root *root,
 	btrfs_update_inode(trans, root, inode);
 
 	if (block_group) {
+		/* the dirty list is protected by the dirty_bgs_lock */
+		spin_lock(&trans->transaction->dirty_bgs_lock);
+
+		/* the disk_cache_state is protected by the block group lock */
 		spin_lock(&block_group->lock);
 
 		/*
 		 * only mark this as written if we didn't get put back on
-		 * the dirty list while waiting for IO.
+		 * the dirty list while waiting for IO.   Otherwise our
+		 * cache state won't be right, and we won't get written again
 		 */
 		if (!ret && list_empty(&block_group->dirty_list))
 			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
@@ -1139,6 +1192,7 @@ int btrfs_wait_cache_io(struct btrfs_root *root,
 			block_group->disk_cache_state = BTRFS_DC_ERROR;
 
 		spin_unlock(&block_group->lock);
+		spin_unlock(&trans->transaction->dirty_bgs_lock);
 		io_ctl->inode = NULL;
 		iput(inode);
 	}
@@ -1207,9 +1261,11 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
 	mutex_lock(&ctl->cache_writeout_mutex);
 	/* Write out the extent entries in the free space cache */
+	spin_lock(&ctl->tree_lock);
 	ret = write_cache_extent_entries(io_ctl, ctl,
 					 block_group, &entries, &bitmaps,
 					 &bitmap_list);
+	spin_unlock(&ctl->tree_lock);
 	if (ret) {
 		mutex_unlock(&ctl->cache_writeout_mutex);
 		goto out_nospc;
@@ -1219,6 +1275,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * Some spaces that are freed in the current transaction are pinned,
 	 * they will be added into free space cache after the transaction is
 	 * committed, we shouldn't lose them.
+	 *
+	 * If this changes while we are working we'll get added back to
+	 * the dirty list and redo it.  No locking needed
 	 */
 	ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
 	if (ret) {
@@ -1231,7 +1290,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * locked while doing it because a concurrent trim can be manipulating
 	 * or freeing the bitmap.
 	 */
+	spin_lock(&ctl->tree_lock);
 	ret = write_bitmap_entries(io_ctl, &bitmap_list);
+	spin_unlock(&ctl->tree_lock);
 	mutex_unlock(&ctl->cache_writeout_mutex);
 	if (ret)
 		goto out_nospc;
@@ -1307,12 +1368,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 		spin_unlock(&block_group->lock);
 		return 0;
 	}
-
-	if (block_group->delalloc_bytes) {
-		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
-		spin_unlock(&block_group->lock);
-		return 0;
-	}
 	spin_unlock(&block_group->lock);
 
 	inode = lookup_free_space_inode(root, block_group, path);
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index c4339863af05..a16a029ad3b1 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -62,6 +62,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
 				       struct btrfs_block_rsv *rsv);
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_trans_handle *trans,
+				    struct btrfs_block_group_cache *block_group,
 				    struct inode *inode);
 int load_free_space_cache(struct btrfs_fs_info *fs_info,
 			  struct btrfs_block_group_cache *block_group);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 74faea3a516e..f6a596d5a637 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -456,7 +456,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 	}
 
 	if (i_size_read(inode) > 0) {
-		ret = btrfs_truncate_free_space_cache(root, trans, inode);
+		ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
 		if (ret) {
 			if (ret != -ENOSPC)
 				btrfs_abort_transaction(trans, root, ret);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d83085381bcc..840a4eb0f396 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3430,7 +3430,9 @@ static int block_use_full_backref(struct reloc_control *rc,
 }
 
 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
-				    struct inode *inode, u64 ino)
+				    struct btrfs_block_group_cache *block_group,
+				    struct inode *inode,
+				    u64 ino)
 {
 	struct btrfs_key key;
 	struct btrfs_root *root = fs_info->tree_root;
@@ -3463,7 +3465,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
 		goto out;
 	}
 
-	ret = btrfs_truncate_free_space_cache(root, trans, inode);
+	ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode);
 
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);
@@ -3509,6 +3511,7 @@ static int find_data_references(struct reloc_control *rc,
 	 */
 	if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
 		ret = delete_block_group_cache(rc->extent_root->fs_info,
+					       rc->block_group,
 					       NULL, ref_objectid);
 		if (ret != -ENOENT)
 			return ret;
@@ -4223,7 +4226,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	btrfs_free_path(path);
 
 	if (!IS_ERR(inode))
-		ret = delete_block_group_cache(fs_info, inode, 0);
+		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
 	else
 		ret = PTR_ERR(inode);
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 234d6063bbf3..5628e25250c0 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -222,6 +222,7 @@ static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
 	atomic_set(&cur_trans->use_count, 2);
 	cur_trans->have_free_bgs = 0;
 	cur_trans->start_time = get_seconds();
+	cur_trans->dirty_bg_run = 0;
 
 	cur_trans->delayed_refs.href_root = RB_ROOT;
 	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
@@ -251,6 +252,8 @@ static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
 	INIT_LIST_HEAD(&cur_trans->switch_commits);
 	INIT_LIST_HEAD(&cur_trans->pending_ordered);
 	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+	INIT_LIST_HEAD(&cur_trans->io_bgs);
+	mutex_init(&cur_trans->cache_write_mutex);
 	cur_trans->num_dirty_bgs = 0;
 	spin_lock_init(&cur_trans->dirty_bgs_lock);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -1059,6 +1062,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
+	struct list_head *io_bgs = &trans->transaction->io_bgs;
 	struct list_head *next;
 	struct extent_buffer *eb;
 	int ret;
@@ -1112,7 +1116,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 			return ret;
 	}
 
-	while (!list_empty(dirty_bgs)) {
+	while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
 		ret = btrfs_write_dirty_block_groups(trans, root);
 		if (ret)
 			return ret;
@@ -1812,6 +1816,37 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		return ret;
 	}
 
+	if (!cur_trans->dirty_bg_run) {
+		int run_it = 0;
+
+		/* this mutex is also taken before trying to set
+		 * block groups readonly.  We need to make sure
+		 * that nobody has set a block group readonly
+		 * after a extents from that block group have been
+		 * allocated for cache files.  btrfs_set_block_group_ro
+		 * will wait for the transaction to commit if it
+		 * finds dirty_bg_run = 1
+		 *
+		 * The dirty_bg_run flag is also used to make sure only
+		 * one process starts all the block group IO.  It wouldn't
+		 * hurt to have more than one go through, but there's no
+		 * real advantage to it either.
+		 */
+		mutex_lock(&root->fs_info->ro_block_group_mutex);
+		if (!cur_trans->dirty_bg_run) {
+			run_it = 1;
+			cur_trans->dirty_bg_run = 1;
+		}
+		mutex_unlock(&root->fs_info->ro_block_group_mutex);
+
+		if (run_it)
+			ret = btrfs_start_dirty_block_groups(trans, root);
+	}
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+
 	spin_lock(&root->fs_info->trans_lock);
 	list_splice(&trans->ordered, &cur_trans->pending_ordered);
 	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
@@ -2005,6 +2040,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	assert_qgroups_uptodate(trans);
 	ASSERT(list_empty(&cur_trans->dirty_bgs));
+	ASSERT(list_empty(&cur_trans->io_bgs));
 	update_super_roots(root);
 
 	btrfs_set_super_log_root(root->fs_info->super_copy, 0);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 4cb0ae264534..0b24755596ba 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -64,10 +64,19 @@ struct btrfs_transaction {
 	struct list_head pending_ordered;
 	struct list_head switch_commits;
 	struct list_head dirty_bgs;
+	struct list_head io_bgs;
 	u64 num_dirty_bgs;
+
+	/*
+	 * we need to make sure block group deletion doesn't race with
+	 * free space cache writeout.  This mutex keeps them from stomping
+	 * on each other
+	 */
+	struct mutex cache_write_mutex;
 	spinlock_t dirty_bgs_lock;
 	struct btrfs_delayed_ref_root delayed_refs;
 	int aborted;
+	int dirty_bg_run;
 };
 
 #define __TRANS_FREEZABLE	(1U << 0)

From cdfb080e1853660952db5e5332727e59427856df Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Mon, 6 Apr 2015 18:17:00 -0700
Subject: [PATCH 63/93] Btrfs: fix use after free when close_ctree frees the
 orphan_rsv

Near the end of close_ctree, we're calling btrfs_free_block_rsv
to free up the orphan rsv.  The problem is this call updates the
space_info, which has already been freed.

This adds a new __ function that directly calls kfree instead of trying
to update the space infos.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       | 1 +
 fs/btrfs/disk-io.c     | 2 +-
 fs/btrfs/extent-tree.c | 5 +++++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 83051fae9467..10b6a75ab7e6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3470,6 +3470,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
 					      unsigned short type);
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv);
+void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
 int btrfs_block_rsv_add(struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
 			enum btrfs_reserve_flush_enum flush);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b5e3d5f6400a..62cd3b63190d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3766,7 +3766,7 @@ void close_ctree(struct btrfs_root *root)
 
 	btrfs_free_stripe_hash_table(fs_info);
 
-	btrfs_free_block_rsv(root, root->orphan_block_rsv);
+	__btrfs_free_block_rsv(root->orphan_block_rsv);
 	root->orphan_block_rsv = NULL;
 
 	lock_chunks(root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 02c2b29a0840..875ba519a131 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4918,6 +4918,11 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
 	kfree(rsv);
 }
 
+void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
+{
+	kfree(rsv);
+}
+
 int btrfs_block_rsv_add(struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
 			enum btrfs_reserve_flush_enum flush)

From df858e76723ace61342b118aa4302bd09de4e386 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 31 Mar 2015 14:56:46 +0100
Subject: [PATCH 64/93] Btrfs: fix range cloning when same inode used as source
 and destination

While searching for extents to clone we might find one where we only use
a part of it coming from its tail. If our destination inode is the same
the source inode, we end up removing the tail part of the extent item and
insert after a new one that point to the same extent with an adjusted
key file offset and data offset. After this we search for the next extent
item in the fs/subvol tree with a key that has an offset incremented by
one. But this second search leaves us at the new extent item we inserted
previously, and since that extent item has a non-zero data offset, it
it can make us call btrfs_drop_extents with an empty range (start == end)
which causes the following warning:

[23978.537119] WARNING: CPU: 6 PID: 16251 at fs/btrfs/file.c:550 btrfs_drop_extent_cache+0x43/0x385 [btrfs]()
(...)
[23978.557266] Call Trace:
[23978.557978]  [<ffffffff81425fd9>] dump_stack+0x4c/0x65
[23978.559191]  [<ffffffff81045390>] warn_slowpath_common+0xa1/0xbb
[23978.560699]  [<ffffffffa047f0ea>] ? btrfs_drop_extent_cache+0x43/0x385 [btrfs]
[23978.562389]  [<ffffffff8104544d>] warn_slowpath_null+0x1a/0x1c
[23978.563613]  [<ffffffffa047f0ea>] btrfs_drop_extent_cache+0x43/0x385 [btrfs]
[23978.565103]  [<ffffffff810e3a18>] ? time_hardirqs_off+0x15/0x28
[23978.566294]  [<ffffffff81079ff8>] ? trace_hardirqs_off+0xd/0xf
[23978.567438]  [<ffffffffa047f73d>] __btrfs_drop_extents+0x6b/0x9e1 [btrfs]
[23978.568702]  [<ffffffff8107c03f>] ? trace_hardirqs_on+0xd/0xf
[23978.569763]  [<ffffffff811441c0>] ? ____cache_alloc+0x69/0x2eb
[23978.570817]  [<ffffffff81142269>] ? virt_to_head_page+0x9/0x36
[23978.571872]  [<ffffffff81143c15>] ? cache_alloc_debugcheck_after.isra.42+0x16c/0x1cb
[23978.573466]  [<ffffffff811420d5>] ? kmemleak_alloc_recursive.constprop.52+0x16/0x18
[23978.574962]  [<ffffffffa0480d07>] btrfs_drop_extents+0x66/0x7f [btrfs]
[23978.576179]  [<ffffffffa049aa35>] btrfs_clone+0x516/0xaf5 [btrfs]
[23978.577311]  [<ffffffffa04983dc>] ? lock_extent_range+0x7b/0xcd [btrfs]
[23978.578520]  [<ffffffffa049b2a2>] btrfs_ioctl_clone+0x28e/0x39f [btrfs]
[23978.580282]  [<ffffffffa049d9ae>] btrfs_ioctl+0xb51/0x219a [btrfs]
(...)
[23978.591887] ---[ end trace 988ec2a653d03ed3 ]---

Then we attempt to insert a new extent item with a key that already
exists, which makes btrfs_insert_empty_item return -EEXIST resulting in
abortion of the current transaction:

[23978.594355] WARNING: CPU: 6 PID: 16251 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]()
(...)
[23978.622589] Call Trace:
[23978.623181]  [<ffffffff81425fd9>] dump_stack+0x4c/0x65
[23978.624359]  [<ffffffff81045390>] warn_slowpath_common+0xa1/0xbb
[23978.625573]  [<ffffffffa044ab6c>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs]
[23978.626971]  [<ffffffff810453f0>] warn_slowpath_fmt+0x46/0x48
[23978.628003]  [<ffffffff8108a6c8>] ? vprintk_default+0x1d/0x1f
[23978.629138]  [<ffffffffa044ab6c>] __btrfs_abort_transaction+0x52/0x114 [btrfs]
[23978.630528]  [<ffffffffa049ad1b>] btrfs_clone+0x7fc/0xaf5 [btrfs]
[23978.631635]  [<ffffffffa04983dc>] ? lock_extent_range+0x7b/0xcd [btrfs]
[23978.632886]  [<ffffffffa049b2a2>] btrfs_ioctl_clone+0x28e/0x39f [btrfs]
[23978.634119]  [<ffffffffa049d9ae>] btrfs_ioctl+0xb51/0x219a [btrfs]
(...)
[23978.647714] ---[ end trace 988ec2a653d03ed4 ]---

This is wrong because we should not process the extent item that we just
inserted previously, and instead process the extent item that follows it
in the tree

For example for the test case I wrote for fstests:

   bs=$((64 * 1024))
   mkfs.btrfs -f -l $bs -O ^no-holes /dev/sdc
   mount /dev/sdc /mnt

   xfs_io -f -c "pwrite -S 0xaa $(($bs * 2)) $(($bs * 2))" /mnt/foo

   $CLONER_PROG -s $((3 * $bs)) -d $((267 * $bs)) -l 0 /mnt/foo /mnt/foo
   $CLONER_PROG -s $((217 * $bs)) -d $((95 * $bs)) -l 0 /mnt/foo /mnt/foo

The second clone call fails with -EEXIST, because when we process the
first extent item (offset 262144), we drop part of it (counting from the
end) and then insert a new extent item with a key greater then the key we
found. The next time we search the tree we search for a key with offset
262144 + 1, which leaves us at the new extent item we have just inserted
but we think it refers to an extent that we need to clone.

Fix this by ensuring the next search key uses an offset corresponding to
the offset of the key we found previously plus the data length of the
corresponding extent item. This ensures we skip new extent items that we
inserted and works for the case of implicit holes too (NO_HOLES feature).

A test case for fstests follows soon.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 94cde83eb3da..43defea9ab20 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3202,6 +3202,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 	key.offset = off;
 
 	while (1) {
+		u64 next_key_min_offset;
+
 		/*
 		 * note the key will change type as we walk through the
 		 * tree.
@@ -3282,7 +3284,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 			} else if (key.offset >= off + len) {
 				break;
 			}
-
+			next_key_min_offset = key.offset + datal;
 			size = btrfs_item_size_nr(leaf, slot);
 			read_extent_buffer(leaf, buf,
 					   btrfs_item_ptr_offset(leaf, slot),
@@ -3497,7 +3499,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 				break;
 		}
 		btrfs_release_path(path);
-		key.offset++;
+		key.offset = next_key_min_offset;
 	}
 	ret = 0;
 

From 113e8283869b9855c8b999796aadd506bbac155f Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 30 Mar 2015 18:26:47 +0100
Subject: [PATCH 65/93] Btrfs: fix inode eviction infinite loop after
 extent_same ioctl

If we pass a length of 0 to the extent_same ioctl, we end up locking an
extent range with a start offset greater then its end offset (if the
destination file's offset is greater than zero). This results in a warning
from extent_io.c:insert_state through the following call chain:

  btrfs_extent_same()
    btrfs_double_lock()
      lock_extent_range()
        lock_extent(inode->io_tree, offset, offset + len - 1)
          lock_extent_bits()
            __set_extent_bit()
              insert_state()
                --> WARN_ON(end < start)

This leads to an infinite loop when evicting the inode. This is the same
problem that my previous patch titled
"Btrfs: fix inode eviction infinite loop after cloning into it" addressed
but for the extent_same ioctl instead of the clone ioctl.

CC: <stable@vger.kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Omar Sandoval <osandov@osandov.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 43defea9ab20..3419f5e72bf9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2897,6 +2897,9 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
 	if (src == dst)
 		return -EINVAL;
 
+	if (len == 0)
+		return 0;
+
 	btrfs_double_lock(src, loff, dst, dst_loff, len);
 
 	ret = extent_same_check_offsets(src, loff, len);

From ccccf3d67294714af2d72a6fd6fd7d73b01c9329 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 30 Mar 2015 18:23:59 +0100
Subject: [PATCH 66/93] Btrfs: fix inode eviction infinite loop after cloning
 into it

If we attempt to clone a 0 length region into a file we can end up
inserting a range in the inode's extent_io tree with a start offset
that is greater then the end offset, which triggers immediately the
following warning:

[ 3914.619057] WARNING: CPU: 17 PID: 4199 at fs/btrfs/extent_io.c:435 insert_state+0x4b/0x10b [btrfs]()
[ 3914.620886] BTRFS: end < start 4095 4096
(...)
[ 3914.638093] Call Trace:
[ 3914.638636]  [<ffffffff81425fd9>] dump_stack+0x4c/0x65
[ 3914.639620]  [<ffffffff81045390>] warn_slowpath_common+0xa1/0xbb
[ 3914.640789]  [<ffffffffa03ca44f>] ? insert_state+0x4b/0x10b [btrfs]
[ 3914.642041]  [<ffffffff810453f0>] warn_slowpath_fmt+0x46/0x48
[ 3914.643236]  [<ffffffffa03ca44f>] insert_state+0x4b/0x10b [btrfs]
[ 3914.644441]  [<ffffffffa03ca729>] __set_extent_bit+0x107/0x3f4 [btrfs]
[ 3914.645711]  [<ffffffffa03cb256>] lock_extent_bits+0x65/0x1bf [btrfs]
[ 3914.646914]  [<ffffffff8142b2fb>] ? _raw_spin_unlock+0x28/0x33
[ 3914.648058]  [<ffffffffa03cbac4>] ? test_range_bit+0xcc/0xde [btrfs]
[ 3914.650105]  [<ffffffffa03cb3c3>] lock_extent+0x13/0x15 [btrfs]
[ 3914.651361]  [<ffffffffa03db39e>] lock_extent_range+0x3d/0xcd [btrfs]
[ 3914.652761]  [<ffffffffa03de1fe>] btrfs_ioctl_clone+0x278/0x388 [btrfs]
[ 3914.654128]  [<ffffffff811226dd>] ? might_fault+0x58/0xb5
[ 3914.655320]  [<ffffffffa03e0909>] btrfs_ioctl+0xb51/0x2195 [btrfs]
(...)
[ 3914.669271] ---[ end trace 14843d3e2e622fc1 ]---

This later makes the inode eviction handler enter an infinite loop that
keeps dumping the following warning over and over:

[ 3915.117629] WARNING: CPU: 22 PID: 4228 at fs/btrfs/extent_io.c:435 insert_state+0x4b/0x10b [btrfs]()
[ 3915.119913] BTRFS: end < start 4095 4096
(...)
[ 3915.137394] Call Trace:
[ 3915.137913]  [<ffffffff81425fd9>] dump_stack+0x4c/0x65
[ 3915.139154]  [<ffffffff81045390>] warn_slowpath_common+0xa1/0xbb
[ 3915.140316]  [<ffffffffa03ca44f>] ? insert_state+0x4b/0x10b [btrfs]
[ 3915.141505]  [<ffffffff810453f0>] warn_slowpath_fmt+0x46/0x48
[ 3915.142709]  [<ffffffffa03ca44f>] insert_state+0x4b/0x10b [btrfs]
[ 3915.143849]  [<ffffffffa03ca729>] __set_extent_bit+0x107/0x3f4 [btrfs]
[ 3915.145120]  [<ffffffffa038c1e3>] ? btrfs_kill_super+0x17/0x23 [btrfs]
[ 3915.146352]  [<ffffffff811548f6>] ? deactivate_locked_super+0x3b/0x50
[ 3915.147565]  [<ffffffffa03cb256>] lock_extent_bits+0x65/0x1bf [btrfs]
[ 3915.148785]  [<ffffffff8142b7e2>] ? _raw_write_unlock+0x28/0x33
[ 3915.149931]  [<ffffffffa03bc325>] btrfs_evict_inode+0x196/0x482 [btrfs]
[ 3915.151154]  [<ffffffff81168904>] evict+0xa0/0x148
[ 3915.152094]  [<ffffffff811689e5>] dispose_list+0x39/0x43
[ 3915.153081]  [<ffffffff81169564>] evict_inodes+0xdc/0xeb
[ 3915.154062]  [<ffffffff81154418>] generic_shutdown_super+0x49/0xef
[ 3915.155193]  [<ffffffff811546d1>] kill_anon_super+0x13/0x1e
[ 3915.156274]  [<ffffffffa038c1e3>] btrfs_kill_super+0x17/0x23 [btrfs]
(...)
[ 3915.167404] ---[ end trace 14843d3e2e622fc2 ]---

So just bail out of the clone ioctl if the length of the region to clone
is zero, without locking any extent range, in order to prevent this issue
(same behaviour as a pwrite with a 0 length for example).

This is trivial to reproduce. For example, the steps for the test I just
made for fstests:

  mkfs.btrfs -f SCRATCH_DEV
  mount SCRATCH_DEV $SCRATCH_MNT

  touch $SCRATCH_MNT/foo
  touch $SCRATCH_MNT/bar

  $CLONER_PROG -s 0 -d 4096 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar
  umount $SCRATCH_MNT

A test case for fstests follows soon.

CC: <stable@vger.kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Omar Sandoval <osandov@osandov.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3419f5e72bf9..97b782ff92bb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3631,6 +3631,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	if (off + len == src->i_size)
 		len = ALIGN(src->i_size, bs) - off;
 
+	if (len == 0) {
+		ret = 0;
+		goto out_unlock;
+	}
+
 	/* verify the end result is block aligned */
 	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
 	    !IS_ALIGNED(destoff, bs))

From de249e66a73d696666281cd812087979c6fae552 Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Sat, 11 Apr 2015 05:09:06 -0700
Subject: [PATCH 67/93] Btrfs: fix uninit variable in clone ioctl

Commit 0d97a64e0 creates a new variable but doesn't always set it up.
This puts it back to the original method (key.offset + 1) for the cases
not covered by Filipe's new logic.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 97b782ff92bb..ca0736ce25a0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3205,7 +3205,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 	key.offset = off;
 
 	while (1) {
-		u64 next_key_min_offset;
+		u64 next_key_min_offset = key.offset + 1;
 
 		/*
 		 * note the key will change type as we walk through the

From 94b947b2f3f84f3bba25d34c4e2a229fc2276830 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Sat, 14 Feb 2015 13:23:45 +0800
Subject: [PATCH 68/93] btrfs: fix condition of commit transaction

Old code bypass commit transaction when we don't have enough
pinned space, but another case is there exist freed bgs in current
transction, it have possibility to make alloc_chunk success.

This patch modify the condition to:
if (have_free_bg || have_pinned_space) commit_transaction()

Confirmed above action by printk before and after patch.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 875ba519a131..7b227a65078c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3857,7 +3857,9 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 used;
-	int ret = 0, committed = 0;
+	int ret = 0;
+	int committed = 0;
+	int have_pinned_space = 1;
 
 	/* make sure bytes are sectorsize aligned */
 	bytes = ALIGN(bytes, root->sectorsize);
@@ -3925,11 +3927,12 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 
 		/*
 		 * If we don't have enough pinned space to deal with this
-		 * allocation don't bother committing the transaction.
+		 * allocation, and no removed chunk in current transaction,
+		 * don't bother committing the transaction.
 		 */
 		if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
 					   bytes) < 0)
-			committed = 1;
+			have_pinned_space = 0;
 		spin_unlock(&data_sinfo->lock);
 
 		/* commit the current transaction and try again */
@@ -3941,10 +3944,15 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 			trans = btrfs_join_transaction(root);
 			if (IS_ERR(trans))
 				return PTR_ERR(trans);
-			ret = btrfs_commit_transaction(trans, root);
-			if (ret)
-				return ret;
-			goto again;
+			if (have_pinned_space ||
+			    trans->transaction->have_free_bgs) {
+				ret = btrfs_commit_transaction(trans, root);
+				if (ret)
+					return ret;
+				goto again;
+			} else {
+				btrfs_end_transaction(trans, root);
+			}
 		}
 
 		trace_btrfs_space_reservation(root->fs_info,

From f2ab76188ec185dde84e7fe7c533ef2f5d668a32 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Mon, 16 Feb 2015 18:52:17 +0800
Subject: [PATCH 69/93] btrfs: Fix tail space processing in
 find_free_dev_extent()

It is another reason for NO_SPACE case.

When we found enough free space in loop and saved them to
max_hole_start/size before, and tail space contains pending extent,
origional innocent max_hole_start/size are reset in retry.

As a result, find_free_dev_extent() returns less space than it can,
and cause NO_SPACE in user program.

Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/volumes.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a73acf496e10..8bcd2a007517 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1134,11 +1134,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-again:
+
 	max_hole_start = search_start;
 	max_hole_size = 0;
-	hole_size = 0;
 
+again:
 	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
 		ret = -ENOSPC;
 		goto out;
@@ -1231,21 +1231,23 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
 	 * allocated dev extents, and when shrinking the device,
 	 * search_end may be smaller than search_start.
 	 */
-	if (search_end > search_start)
+	if (search_end > search_start) {
 		hole_size = search_end - search_start;
 
-	if (hole_size > max_hole_size) {
-		max_hole_start = search_start;
-		max_hole_size = hole_size;
-	}
+		if (contains_pending_extent(trans, device, &search_start,
+					    hole_size)) {
+			btrfs_release_path(path);
+			goto again;
+		}
 
-	if (contains_pending_extent(trans, device, &search_start, hole_size)) {
-		btrfs_release_path(path);
-		goto again;
+		if (hole_size > max_hole_size) {
+			max_hole_start = search_start;
+			max_hole_size = hole_size;
+		}
 	}
 
 	/* See above. */
-	if (hole_size < num_bytes)
+	if (max_hole_size < num_bytes)
 		ret = -ENOSPC;
 	else
 		ret = 0;

From 264ca0f60becac34395f3b42171ad0cc2e53ea6e Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Tue, 17 Feb 2015 17:25:51 +0800
Subject: [PATCH 70/93] btrfs: Adjust commit-transaction condition to avoid
 NO_SPACE more

If we have any chance to make a successful write, we should not give up.

This patch adjust commit-transaction condition from:
  pinned >= wanted
to
  left + pinned >= wanted

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7b227a65078c..01f4db6554a1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3931,7 +3931,8 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 		 * don't bother committing the transaction.
 		 */
 		if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
-					   bytes) < 0)
+					   used + bytes -
+					   data_sinfo->total_bytes) < 0)
 			have_pinned_space = 0;
 		spin_unlock(&data_sinfo->lock);
 

From c30666d466c70a30491e45dd8d068360f9dd7693 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Wed, 25 Feb 2015 14:17:20 +0800
Subject: [PATCH 71/93] btrfs: Set relative data on clear
 btrfs_block_group_cache->pinned

Bug1:
  space_info->bytes_readonly was set to very large(negative) value in
  btrfs_remove_block_group().

Reason:
  Current code set block_group_cache->pinned = 0 in btrfs_delete_unused_bgs(),
  but above space was not counted to space_info->bytes_readonly.

  Then in btrfs_remove_block_group():
    block_group->space_info->bytes_readonly -= block_group->key.offset;
  We can see following value in trace:
    btrfs_remove_block_group: pid=2677 comm=btrfs-cleaner WARNING: bytes_readonly=12582912, key.offset=134217728

Bug2:
  space_info->total_bytes_pinned grow to value larger than fs size.
  In a 1.2G fs, we can get following trace log:
  at first:
    ZL_DEBUG: add_pinned_bytes: pid=2710 comm=sync change total_bytes_pinned flags=1 869793792 + 95944704 = 965738496
  after some op:
    ZL_DEBUG: add_pinned_bytes: pid=2770 comm=sync change total_bytes_pinned flags=1 1780178944 + 95944704 = 1876123648
  after some op:
    ZL_DEBUG: add_pinned_bytes: pid=3193 comm=sync change total_bytes_pinned flags=1 2924568576 + 95551488 = 3020120064
  ...

Reason:
  Similar to bug1, we also need to adjust space_info->total_bytes_pinned
  in above code block.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 01f4db6554a1..7a14a4aa50a7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9922,8 +9922,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 
 		/* Reset pinned so btrfs_put_block_group doesn't complain */
+		spin_lock(&space_info->lock);
+		spin_lock(&block_group->lock);
+
+		space_info->bytes_pinned -= block_group->pinned;
+		space_info->bytes_readonly += block_group->pinned;
+		percpu_counter_add(&space_info->total_bytes_pinned,
+				   -block_group->pinned);
 		block_group->pinned = 0;
 
+		spin_unlock(&block_group->lock);
+		spin_unlock(&space_info->lock);
+
 		/*
 		 * Btrfs_remove_chunk will abort the transaction if things go
 		 * horribly wrong.

From 18d018ad2c899d3d9c503c25125d56046ed7d3ca Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Tue, 24 Feb 2015 20:07:44 +0800
Subject: [PATCH 72/93] btrfs: add WARN_ON() to check is space_info op current

space_info's value calculation is some complex and easy to cause
bug, add WARN_ON() to help debug.

Changelog v1->v2:
 Put WARN_ON()s under the ENOSPC_DEBUG mount option.
 Suggested by: David Sterba <dsterba@suse.cz>

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7a14a4aa50a7..be4a79a69ed1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9732,9 +9732,19 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	spin_lock(&block_group->space_info->lock);
 	list_del_init(&block_group->ro_list);
+
+	if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+		WARN_ON(block_group->space_info->total_bytes
+			< block_group->key.offset);
+		WARN_ON(block_group->space_info->bytes_readonly
+			< block_group->key.offset);
+		WARN_ON(block_group->space_info->disk_total
+			< block_group->key.offset * factor);
+	}
 	block_group->space_info->total_bytes -= block_group->key.offset;
 	block_group->space_info->bytes_readonly -= block_group->key.offset;
 	block_group->space_info->disk_total -= block_group->key.offset * factor;
+
 	spin_unlock(&block_group->space_info->lock);
 
 	memcpy(&key, &block_group->key, sizeof(key));

From d7c151717a1efe289aec29fb9f94485f64262c0b Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Thu, 26 Feb 2015 10:49:20 +0800
Subject: [PATCH 73/93] btrfs: Fix NO_SPACE bug caused by delayed-iput

Steps to reproduce:
  while true; do
    dd if=/dev/zero of=/btrfs_dir/file count=[fs_size * 75%]
    rm /btrfs_dir/file
    sync
  done

  And we'll see dd failed because btrfs return NO_SPACE.

Reason:
  Normally, btrfs_commit_transaction() call btrfs_run_delayed_iputs()
  in end to free fs space for next write, but sometimes it hadn't
  done work on time, because btrfs-cleaner thread get delayed-iputs
  from list before, but do iput() after next write.

  This is log:
  [ 2569.050776] comm=btrfs-cleaner func=btrfs_evict_inode() begin

  [ 2569.084280] comm=sync func=btrfs_commit_transaction() call btrfs_run_delayed_iputs()
  [ 2569.085418] comm=sync func=btrfs_commit_transaction() done btrfs_run_delayed_iputs()
  [ 2569.087554] comm=sync func=btrfs_commit_transaction() end

  [ 2569.191081] comm=dd begin
  [ 2569.790112] comm=dd func=__btrfs_buffered_write() ret=-28

  [ 2569.847479] comm=btrfs-cleaner func=add_pinned_bytes() 0 + 32677888 = 32677888
  [ 2569.849530] comm=btrfs-cleaner func=add_pinned_bytes() 32677888 + 23834624 = 56512512
  ...
  [ 2569.903893] comm=btrfs-cleaner func=add_pinned_bytes() 943976448 + 21762048 = 965738496
  [ 2569.908270] comm=btrfs-cleaner func=btrfs_evict_inode() end

Fix:
  Make btrfs_commit_transaction() wait current running btrfs-cleaner's
  delayed-iputs() done in end.

Test:
  Use script similar to above(more complex),
  before patch:
    7 failed in 100 * 20 loop.
  after patch:
    0 failed in 100 * 20 loop.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       | 1 +
 fs/btrfs/disk-io.c     | 3 ++-
 fs/btrfs/extent-tree.c | 6 ++++++
 fs/btrfs/inode.c       | 4 ++++
 4 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 10b6a75ab7e6..d48b22f31182 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1538,6 +1538,7 @@ struct btrfs_fs_info {
 
 	spinlock_t delayed_iput_lock;
 	struct list_head delayed_iputs;
+	struct rw_semaphore delayed_iput_sem;
 
 	/* this protects tree_mod_seq_list */
 	spinlock_t tree_mod_seq_lock;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 62cd3b63190d..2ef9a4b72d06 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2487,11 +2487,12 @@ int open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->qgroup_op_lock);
 	spin_lock_init(&fs_info->buffer_lock);
 	spin_lock_init(&fs_info->unused_bgs_lock);
-	mutex_init(&fs_info->unused_bg_unpin_mutex);
 	rwlock_init(&fs_info->tree_mod_log_lock);
+	mutex_init(&fs_info->unused_bg_unpin_mutex);
 	mutex_init(&fs_info->reloc_mutex);
 	mutex_init(&fs_info->delalloc_root_mutex);
 	seqlock_init(&fs_info->profiles_lock);
+	init_rwsem(&fs_info->delayed_iput_sem);
 
 	init_completion(&fs_info->kobj_unregister);
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index be4a79a69ed1..46cb1d414912 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3950,6 +3950,12 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 				ret = btrfs_commit_transaction(trans, root);
 				if (ret)
 					return ret;
+				/*
+				 * make sure that all running delayed iput are
+				 * done
+				 */
+				down_write(&root->fs_info->delayed_iput_sem);
+				up_write(&root->fs_info->delayed_iput_sem);
 				goto again;
 			} else {
 				btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 141df0ce6f93..6ef97c184c7b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3111,6 +3111,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 	if (empty)
 		return;
 
+	down_read(&fs_info->delayed_iput_sem);
+
 	spin_lock(&fs_info->delayed_iput_lock);
 	list_splice_init(&fs_info->delayed_iputs, &list);
 	spin_unlock(&fs_info->delayed_iput_lock);
@@ -3121,6 +3123,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 		iput(delayed->inode);
 		kfree(delayed);
 	}
+
+	up_read(&root->fs_info->delayed_iput_sem);
 }
 
 /*

From c99f1b0c6c45d1621f08afb1352689e24a627844 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Mon, 2 Mar 2015 19:32:20 +0800
Subject: [PATCH 74/93] btrfs: Support busy loop of write and delete

Reproduce:
 while true; do
   dd if=/dev/zero of=/mnt/btrfs/file count=[75% fs_size]
   rm /mnt/btrfs/file
 done
 Then we can see above loop failed on NO_SPACE.

It it long-term problem since very beginning, because delayed-iput
after rm are not run.

We already have commit_transaction() in alloc_space code, but it is
not triggered in above case.
This patch trigger commit_transaction() to run delayed-iput and
reflash pinned-space to to make write success.

It is based on previous fix of delayed-iput in commit_transaction(),
need to be applied on top of:
btrfs: Fix NO_SPACE bug caused by delayed-iput

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 46cb1d414912..2713dcbc70f7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3858,14 +3858,14 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 used;
 	int ret = 0;
-	int committed = 0;
-	int have_pinned_space = 1;
+	int need_commit = 2;
+	int have_pinned_space;
 
 	/* make sure bytes are sectorsize aligned */
 	bytes = ALIGN(bytes, root->sectorsize);
 
 	if (btrfs_is_free_space_inode(inode)) {
-		committed = 1;
+		need_commit = 0;
 		ASSERT(current->journal_info);
 	}
 
@@ -3915,8 +3915,10 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 			if (ret < 0) {
 				if (ret != -ENOSPC)
 					return ret;
-				else
+				else {
+					have_pinned_space = 1;
 					goto commit_trans;
+				}
 			}
 
 			if (!data_sinfo)
@@ -3930,23 +3932,23 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 		 * allocation, and no removed chunk in current transaction,
 		 * don't bother committing the transaction.
 		 */
-		if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
-					   used + bytes -
-					   data_sinfo->total_bytes) < 0)
-			have_pinned_space = 0;
+		have_pinned_space = percpu_counter_compare(
+			&data_sinfo->total_bytes_pinned,
+			used + bytes - data_sinfo->total_bytes);
 		spin_unlock(&data_sinfo->lock);
 
 		/* commit the current transaction and try again */
 commit_trans:
-		if (!committed &&
+		if (need_commit &&
 		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
-			committed = 1;
+			need_commit--;
 
 			trans = btrfs_join_transaction(root);
 			if (IS_ERR(trans))
 				return PTR_ERR(trans);
-			if (have_pinned_space ||
-			    trans->transaction->have_free_bgs) {
+			if (have_pinned_space >= 0 ||
+			    trans->transaction->have_free_bgs ||
+			    need_commit > 0) {
 				ret = btrfs_commit_transaction(trans, root);
 				if (ret)
 					return ret;

From 3eeb4d597efc9e068902057f1bd040cffc69e9e6 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Thu, 20 Nov 2014 20:14:38 -0500
Subject: [PATCH 75/93] btrfs: qgroup: inherit limit info from srcgroup in
 creating snapshot.

Currently, when we snapshot a subvol, snapshot will not copy the limits
from srcqgroup.

This patch make the qgroup in snapshot inherit the limit info when create
a snapshot.

Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 37188c6b408d..34c2753fa4e1 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2302,6 +2302,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 		dstgroup->excl_cmpr = level_size;
 		srcgroup->excl = level_size;
 		srcgroup->excl_cmpr = level_size;
+
+		/* inherit the limit info */
+		dstgroup->lim_flags = srcgroup->lim_flags;
+		dstgroup->max_rfer = srcgroup->max_rfer;
+		dstgroup->max_excl = srcgroup->max_excl;
+		dstgroup->rsv_rfer = srcgroup->rsv_rfer;
+		dstgroup->rsv_excl = srcgroup->rsv_excl;
+
 		qgroup_dirty(fs_info, dstgroup);
 		qgroup_dirty(fs_info, srcgroup);
 	}

From e8c8541ac379709db8d2339e1cb720469fc2cd8f Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Thu, 20 Nov 2014 20:58:34 -0500
Subject: [PATCH 76/93] btrfs: qgroup: update qgroup in memory at the same time
 when we update it in btree.

When we call btrfs_qgroup_inherit() with BTRFS_QGROUP_INHERIT_SET_LIMITS,
btrfs will update the limit info of qgroup in btree but forget to update
the qgroup in rbtree at the same time. It obviousely will cause an inconsistency.

This patch fix it by updating the rbtree at the same time.

Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 34c2753fa4e1..bf926f766273 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2230,17 +2230,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto out;
 
-	if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
-		ret = update_qgroup_limit_item(trans, quota_root, objectid,
-					       inherit->lim.flags,
-					       inherit->lim.max_rfer,
-					       inherit->lim.max_excl,
-					       inherit->lim.rsv_rfer,
-					       inherit->lim.rsv_excl);
-		if (ret)
-			goto out;
-	}
-
 	if (srcid) {
 		struct btrfs_root *srcroot;
 		struct btrfs_key srckey;
@@ -2286,6 +2275,23 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 		goto unlock;
 	}
 
+	if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
+		ret = update_qgroup_limit_item(trans, quota_root, objectid,
+					       inherit->lim.flags,
+					       inherit->lim.max_rfer,
+					       inherit->lim.max_excl,
+					       inherit->lim.rsv_rfer,
+					       inherit->lim.rsv_excl);
+		if (ret)
+			goto unlock;
+
+		dstgroup->lim_flags = inherit->lim.flags;
+		dstgroup->max_rfer = inherit->lim.max_rfer;
+		dstgroup->max_excl = inherit->lim.max_excl;
+		dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
+		dstgroup->rsv_excl = inherit->lim.rsv_excl;
+	}
+
 	if (srcid) {
 		srcgroup = find_qgroup_rb(fs_info, srcid);
 		if (!srcgroup)

From 1510e71c620c27ffc7540176a0689f70d6915e28 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Thu, 20 Nov 2014 21:01:41 -0500
Subject: [PATCH 77/93] btrfs: qgroup: consolidate the parameter of fucntion
 update_qgroup_limit_item().

Cleanup: Change the parameter of update_qgroup_limit_item() to the family of
update_qgroup_xxx_item().

Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 51 ++++++++++++++++++++++-------------------------
 1 file changed, 24 insertions(+), 27 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index bf926f766273..bf0702a865f8 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -644,9 +644,8 @@ static int del_qgroup_item(struct btrfs_trans_handle *trans,
 }
 
 static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root, u64 qgroupid,
-				    u64 flags, u64 max_rfer, u64 max_excl,
-				    u64 rsv_rfer, u64 rsv_excl)
+				    struct btrfs_root *root,
+				    struct btrfs_qgroup *qgroup)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -657,7 +656,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
 
 	key.objectid = 0;
 	key.type = BTRFS_QGROUP_LIMIT_KEY;
-	key.offset = qgroupid;
+	key.offset = qgroup->qgroupid;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -673,11 +672,11 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
 	l = path->nodes[0];
 	slot = path->slots[0];
 	qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
-	btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags);
-	btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer);
-	btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl);
-	btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer);
-	btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl);
+	btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
+	btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
+	btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
+	btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
+	btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
 
 	btrfs_mark_buffer_dirty(l);
 
@@ -1184,15 +1183,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
 		ret = -ENOENT;
 		goto out;
 	}
-	ret = update_qgroup_limit_item(trans, quota_root, qgroupid,
-				       limit->flags, limit->max_rfer,
-				       limit->max_excl, limit->rsv_rfer,
-				       limit->rsv_excl);
-	if (ret) {
-		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
-		btrfs_info(fs_info, "unable to update quota limit for %llu",
-		       qgroupid);
-	}
 
 	spin_lock(&fs_info->qgroup_lock);
 	qgroup->lim_flags = limit->flags;
@@ -1201,6 +1191,14 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
 	qgroup->rsv_rfer = limit->rsv_rfer;
 	qgroup->rsv_excl = limit->rsv_excl;
 	spin_unlock(&fs_info->qgroup_lock);
+
+	ret = update_qgroup_limit_item(trans, quota_root, qgroup);
+	if (ret) {
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+		btrfs_info(fs_info, "unable to update quota limit for %llu",
+		       qgroupid);
+	}
+
 out:
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	return ret;
@@ -2276,20 +2274,19 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 	}
 
 	if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
-		ret = update_qgroup_limit_item(trans, quota_root, objectid,
-					       inherit->lim.flags,
-					       inherit->lim.max_rfer,
-					       inherit->lim.max_excl,
-					       inherit->lim.rsv_rfer,
-					       inherit->lim.rsv_excl);
-		if (ret)
-			goto unlock;
-
 		dstgroup->lim_flags = inherit->lim.flags;
 		dstgroup->max_rfer = inherit->lim.max_rfer;
 		dstgroup->max_excl = inherit->lim.max_excl;
 		dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
 		dstgroup->rsv_excl = inherit->lim.rsv_excl;
+
+		ret = update_qgroup_limit_item(trans, quota_root, dstgroup);
+		if (ret) {
+			fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+			btrfs_info(fs_info, "unable to update quota limit for %llu",
+			       dstgroup->qgroupid);
+			goto unlock;
+		}
 	}
 
 	if (srcid) {

From d3001ed3a82ec2696bb13c78092d0a3460003fd7 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Thu, 20 Nov 2014 21:04:56 -0500
Subject: [PATCH 78/93] btrfs: qgroup: update limit info in function
 btrfs_run_qgroups().

When we commit_transaction(), qgroups in btree should be updated.
But, limit info is not considered currently. It will cause a problem
when a qgroup of a snapshot inherit the limit info from srcqgroup,
then there is an inconsistency.

Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index bf0702a865f8..5d1d75611745 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2151,6 +2151,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
 		list_del_init(&qgroup->dirty);
 		spin_unlock(&fs_info->qgroup_lock);
 		ret = update_qgroup_info_item(trans, quota_root, qgroup);
+		if (ret)
+			fs_info->qgroup_flags |=
+					BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+		ret = update_qgroup_limit_item(trans, quota_root, qgroup);
 		if (ret)
 			fs_info->qgroup_flags |=
 					BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;

From 03477d945f13a284d35a757b2c2323d165d5cd81 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Fri, 6 Feb 2015 11:06:25 -0500
Subject: [PATCH 79/93] btrfs: qgroup: fix limit args override whole limit
 struct

btrfs_limit_group use arg limit to override the old qgroup_limit of
corresponding qgroup. However, we should override part of old qgroup_limit
according to the bit which has been set in arg limit.

Signed-off-by: Fan Chengniang <fancn.fnst@cn.fujitsu.com>
Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5d1d75611745..5c2aaccbe3da 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1185,11 +1185,16 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
 	}
 
 	spin_lock(&fs_info->qgroup_lock);
-	qgroup->lim_flags = limit->flags;
-	qgroup->max_rfer = limit->max_rfer;
-	qgroup->max_excl = limit->max_excl;
-	qgroup->rsv_rfer = limit->rsv_rfer;
-	qgroup->rsv_excl = limit->rsv_excl;
+	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER)
+		qgroup->max_rfer = limit->max_rfer;
+	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
+		qgroup->max_excl = limit->max_excl;
+	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER)
+		qgroup->rsv_rfer = limit->rsv_rfer;
+	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL)
+		qgroup->rsv_excl = limit->rsv_excl;
+	qgroup->lim_flags |= limit->flags;
+
 	spin_unlock(&fs_info->qgroup_lock);
 
 	ret = update_qgroup_limit_item(trans, quota_root, qgroup);

From 4087cf24ae2af17f7dd9fd34e22fde816952d421 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Sun, 18 Jan 2015 10:59:23 -0500
Subject: [PATCH 80/93] Btrfs: qgroup: cleanup, remove an unsued parameter in
 btrfs_create_qgroup().

Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c              | 3 +--
 fs/btrfs/qgroup.c             | 2 +-
 fs/btrfs/qgroup.h             | 3 +--
 fs/btrfs/tests/qgroup-tests.c | 4 ++--
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ca0736ce25a0..2e66d8e76f32 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4679,8 +4679,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
 
 	/* FIXME: check if the IDs really exist */
 	if (sa->create) {
-		ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid,
-					  NULL);
+		ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid);
 	} else {
 		ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
 	}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5c2aaccbe3da..de321c90130c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1094,7 +1094,7 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info, u64 qgroupid, char *name)
+			struct btrfs_fs_info *fs_info, u64 qgroupid)
 {
 	struct btrfs_root *quota_root;
 	struct btrfs_qgroup *qgroup;
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 18cc68ca3090..c5242aa9a4b2 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -70,8 +70,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
 			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
 int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
-			struct btrfs_fs_info *fs_info, u64 qgroupid,
-			char *name);
+			struct btrfs_fs_info *fs_info, u64 qgroupid);
 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
 			      struct btrfs_fs_info *fs_info, u64 qgroupid);
 int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 73f299ebdabb..c32a7ba76bca 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -232,7 +232,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
 	init_dummy_trans(&trans);
 
 	test_msg("Qgroup basic add\n");
-	ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL);
+	ret = btrfs_create_qgroup(NULL, fs_info, 5);
 	if (ret) {
 		test_msg("Couldn't create a qgroup %d\n", ret);
 		return ret;
@@ -301,7 +301,7 @@ static int test_multiple_refs(struct btrfs_root *root)
 	test_msg("Qgroup multiple refs test\n");
 
 	/* We have 5 created already from the previous test */
-	ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL);
+	ret = btrfs_create_qgroup(NULL, fs_info, 256);
 	if (ret) {
 		test_msg("Couldn't create a qgroup %d\n", ret);
 		return ret;

From 804ca127fb93988c6a9d5f2bf4a8f1a780c9a2d0 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Fri, 12 Dec 2014 16:44:34 +0800
Subject: [PATCH 81/93] Btrfs: qgroup: free reserved in exceeding quota.

When we exceed quota limit in writing, we will free
some reserved extent when we need to drop but not free
account in qgroup. It means, each time we exceed quota
in writing, there will be some remain space in qg->reserved
we can not use any more. If things go on like this, the
all space will be ate up.

Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2713dcbc70f7..695d5110e020 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5475,8 +5475,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 			to_free = 0;
 	}
 	spin_unlock(&BTRFS_I(inode)->lock);
-	if (dropped)
+	if (dropped) {
+		if (root->fs_info->quota_enabled)
+			btrfs_qgroup_free(root, dropped * root->nodesize);
 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
+	}
 
 	if (to_free) {
 		btrfs_block_rsv_release(root, block_rsv, to_free);

From 31193213f1f9c13f6485007ef1e233b119e46910 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Fri, 12 Dec 2014 16:44:35 +0800
Subject: [PATCH 82/93] Btrfs: qgroup: Introduce a may_use to account
 space_info->bytes_may_use.

Currently, for pre_alloc or delay_alloc, the bytes will be accounted
in space_info by the three guys.
space_info->bytes_may_use --- space_info->reserved --- space_info->used.
But on the other hand, in qgroup, there are only two counters to account the
bytes, qgroup->reserved and qgroup->excl. And qg->reserved accounts
bytes in space_info->bytes_may_use and qg->excl accounts bytes in
space_info->used. So the bytes in space_info->reserved is not accounted
in qgroup. If so, there is a window we can exceed the quota limit when
bytes is in space_info->reserved.

Example:
	# btrfs quota enable /mnt
	# btrfs qgroup limit -e 10M /mnt
	# for((i=0;i<20;i++));do fallocate -l 1M /mnt/data$i; done
	# sync
	# btrfs qgroup show -pcre /mnt
qgroupid rfer     excl     max_rfer max_excl parent  child
-------- ----     ----     -------- -------- ------  -----
0/5      20987904 20987904 0        10485760 ---     ---

qg->excl is 20987904 larger than max_excl 10485760.

This patch introduce a new counter named may_use to qgroup, then
there are three counters in qgroup to account bytes in space_info
as below.
space_info->bytes_may_use --- space_info->reserved --- space_info->used.
qgroup->may_use           --- qgroup->reserved     --- qgroup->excl

With this patch applied:
	# btrfs quota enable /mnt
	# btrfs qgroup limit -e 10M /mnt
	# for((i=0;i<20;i++));do fallocate -l 1M /mnt/data$i; done
fallocate: /mnt/data9: fallocate failed: Disk quota exceeded
fallocate: /mnt/data10: fallocate failed: Disk quota exceeded
fallocate: /mnt/data11: fallocate failed: Disk quota exceeded
fallocate: /mnt/data12: fallocate failed: Disk quota exceeded
fallocate: /mnt/data13: fallocate failed: Disk quota exceeded
fallocate: /mnt/data14: fallocate failed: Disk quota exceeded
fallocate: /mnt/data15: fallocate failed: Disk quota exceeded
fallocate: /mnt/data16: fallocate failed: Disk quota exceeded
fallocate: /mnt/data17: fallocate failed: Disk quota exceeded
fallocate: /mnt/data18: fallocate failed: Disk quota exceeded
fallocate: /mnt/data19: fallocate failed: Disk quota exceeded
	# sync
	# btrfs qgroup show -pcre /mnt
qgroupid rfer    excl    max_rfer max_excl parent  child
-------- ----    ----    -------- -------- ------  -----
0/5      9453568 9453568 0        10485760 ---     ---

Reported-by: Cyril SCETBON <cyril.scetbon@free.fr>
Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 20 ++++++++++++-
 fs/btrfs/inode.c       | 18 ++++++++++-
 fs/btrfs/qgroup.c      | 68 +++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/qgroup.h      |  4 +++
 4 files changed, 104 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 695d5110e020..3113e0b79b99 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5725,8 +5725,12 @@ static int pin_down_extent(struct btrfs_root *root,
 
 	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
-	if (reserved)
+	if (reserved) {
+		btrfs_qgroup_update_reserved_bytes(root->fs_info,
+						   root->root_key.objectid,
+						   num_bytes, -1);
 		trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
+	}
 	return 0;
 }
 
@@ -6464,6 +6468,9 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 		btrfs_put_block_group(cache);
 		trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
 		pin = 0;
+		btrfs_qgroup_update_reserved_bytes(root->fs_info,
+						   root->root_key.objectid,
+						   buf->len, -1);
 	}
 out:
 	if (pin)
@@ -7196,7 +7203,11 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
 			ret = btrfs_discard_extent(root, start, len, NULL);
 		btrfs_add_free_space(cache, start, len);
 		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
+		btrfs_qgroup_update_reserved_bytes(root->fs_info,
+						   root->root_key.objectid,
+						   len, -1);
 	}
+
 	btrfs_put_block_group(cache);
 
 	trace_btrfs_reserved_extent_free(root, start, len);
@@ -7433,6 +7444,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 	BUG_ON(ret); /* logic error */
 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
 					 0, owner, offset, ins, 1);
+	btrfs_qgroup_update_reserved_bytes(root->fs_info,
+					   root->root_key.objectid,
+					   ins->offset, 1);
 	btrfs_put_block_group(block_group);
 	return ret;
 }
@@ -7579,6 +7593,10 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 		return ERR_PTR(ret);
 	}
 
+	btrfs_qgroup_update_reserved_bytes(root->fs_info,
+					   root_objectid,
+					   ins.offset, 1);
+
 	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
 	BUG_ON(IS_ERR(buf)); /* -ENOMEM */
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6ef97c184c7b..a9f69a0d4b08 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -60,6 +60,7 @@
 #include "backref.h"
 #include "hash.h"
 #include "props.h"
+#include "qgroup.h"
 
 struct btrfs_iget_args {
 	struct btrfs_key *location;
@@ -753,7 +754,9 @@ static noinline void submit_compressed_extents(struct inode *inode,
 			}
 			goto out_free;
 		}
-
+		btrfs_qgroup_update_reserved_bytes(root->fs_info,
+						   root->root_key.objectid,
+						   ins.offset, 1);
 		/*
 		 * here we're doing allocation and writeback of the
 		 * compressed pages
@@ -978,6 +981,10 @@ static noinline int cow_file_range(struct inode *inode,
 		if (ret < 0)
 			goto out_unlock;
 
+		btrfs_qgroup_update_reserved_bytes(root->fs_info,
+						   root->root_key.objectid,
+						   ins.offset, 1);
+
 		em = alloc_extent_map();
 		if (!em) {
 			ret = -ENOMEM;
@@ -7030,6 +7037,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 		return ERR_PTR(ret);
 	}
 
+	btrfs_qgroup_update_reserved_bytes(root->fs_info,
+					   root->root_key.objectid,
+					   ins.offset, 1);
+
 	return em;
 }
 
@@ -9583,6 +9594,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 				btrfs_end_transaction(trans, root);
 			break;
 		}
+
+		btrfs_qgroup_update_reserved_bytes(root->fs_info,
+						   root->root_key.objectid,
+						   ins.offset, 1);
+
 		btrfs_drop_extent_cache(inode, cur_offset,
 					cur_offset + ins.offset -1, 0);
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index de321c90130c..cd291733dc3e 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -72,6 +72,7 @@ struct btrfs_qgroup {
 	/*
 	 * reservation tracking
 	 */
+	u64 may_use;
 	u64 reserved;
 
 	/*
@@ -1417,6 +1418,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
 	WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
 	qgroup->excl += sign * oper->num_bytes;
 	qgroup->excl_cmpr += sign * oper->num_bytes;
+	if (sign > 0)
+		qgroup->reserved -= oper->num_bytes;
 
 	qgroup_dirty(fs_info, qgroup);
 
@@ -1436,6 +1439,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
 		qgroup->rfer_cmpr += sign * oper->num_bytes;
 		WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
 		qgroup->excl += sign * oper->num_bytes;
+		if (sign > 0)
+			qgroup->reserved -= oper->num_bytes;
 		qgroup->excl_cmpr += sign * oper->num_bytes;
 		qgroup_dirty(fs_info, qgroup);
 
@@ -2378,6 +2383,61 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+int btrfs_qgroup_update_reserved_bytes(struct btrfs_fs_info *fs_info,
+					    u64 ref_root,
+					    u64 num_bytes,
+					    int sign)
+{
+	struct btrfs_root *quota_root;
+	struct btrfs_qgroup *qgroup;
+	int ret = 0;
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+
+	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+		return 0;
+
+	if (num_bytes == 0)
+		return 0;
+
+	spin_lock(&fs_info->qgroup_lock);
+	quota_root = fs_info->quota_root;
+	if (!quota_root)
+		goto out;
+
+	qgroup = find_qgroup_rb(fs_info, ref_root);
+	if (!qgroup)
+		goto out;
+
+	ulist_reinit(fs_info->qgroup_ulist);
+	ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+			(uintptr_t)qgroup, GFP_ATOMIC);
+	if (ret < 0)
+		goto out;
+
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+		struct btrfs_qgroup *qg;
+		struct btrfs_qgroup_list *glist;
+
+		qg = u64_to_ptr(unode->aux);
+
+		qg->reserved += sign * num_bytes;
+
+		list_for_each_entry(glist, &qg->groups, next_group) {
+			ret = ulist_add(fs_info->qgroup_ulist,
+					glist->group->qgroupid,
+					(uintptr_t)glist->group, GFP_ATOMIC);
+			if (ret < 0)
+				goto out;
+		}
+	}
+
+out:
+	spin_unlock(&fs_info->qgroup_lock);
+	return ret;
+}
+
 /*
  * reserve some space for a qgroup and all its parents. The reservation takes
  * place with start_transaction or dealloc_reserve, similar to ENOSPC
@@ -2426,14 +2486,14 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 		qg = u64_to_ptr(unode->aux);
 
 		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
-		    qg->reserved + (s64)qg->rfer + num_bytes >
+		    qg->reserved + qg->may_use + (s64)qg->rfer + num_bytes >
 		    qg->max_rfer) {
 			ret = -EDQUOT;
 			goto out;
 		}
 
 		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
-		    qg->reserved + (s64)qg->excl + num_bytes >
+		    qg->reserved + qg->may_use + (s64)qg->excl + num_bytes >
 		    qg->max_excl) {
 			ret = -EDQUOT;
 			goto out;
@@ -2457,7 +2517,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 
 		qg = u64_to_ptr(unode->aux);
 
-		qg->reserved += num_bytes;
+		qg->may_use += num_bytes;
 	}
 
 out:
@@ -2503,7 +2563,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
 
 		qg = u64_to_ptr(unode->aux);
 
-		qg->reserved -= num_bytes;
+		qg->may_use -= num_bytes;
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ret = ulist_add(fs_info->qgroup_ulist,
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index c5242aa9a4b2..64d49b8482b3 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -94,6 +94,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
 			 struct btrfs_qgroup_inherit *inherit);
+int btrfs_qgroup_update_reserved_bytes(struct btrfs_fs_info *fs_info,
+				       u64 ref_root,
+				       u64 num_bytes,
+				       int sign);
 int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
 void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
 

From 237c0e9f1fbfdca7287f3539f1fa73e5063156b5 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Mon, 29 Dec 2014 06:23:05 -0500
Subject: [PATCH 83/93] Btrfs: qgroup, Account data space in more proper
 timings.

Currenly, in data writing, ->reserved is accounted in
fill_delalloc(), but ->may_use is released in clear_bit_hook()
which is called by btrfs_finish_ordered_io(). That's too late,
that said, between fill_delalloc() and btrfs_finish_ordered_io(),
the data is doublely accounted by qgroup. It will cause some
unexpected -EDQUOT.

Example:
	# btrfs quota enable /root/btrfs-auto-test/
	# btrfs subvolume create /root/btrfs-auto-test//sub
	Create subvolume '/root/btrfs-auto-test/sub'
	# btrfs qgroup limit 1G /root/btrfs-auto-test//sub
	dd if=/dev/zero of=/root/btrfs-auto-test//sub/file bs=1024 count=1500000
	dd: error writing '/root/btrfs-auto-test//sub/file': Disk quota exceeded
	681353+0 records in
	681352+0 records out
	697704448 bytes (698 MB) copied, 8.15563 s, 85.5 MB/s
It's (698 MB) when we got an -EDQUOT, but we limit it by 1G.

This patch move the btrfs_qgroup_reserve/free() for data from
btrfs_delalloc_reserve/release_metadata() to btrfs_check_data_free_space()
and btrfs_free_reserved_data_space(). Then the accounter in qgroup
will be updated at the same time with the accounter in space_info updated.
In this way, the unexpected -EDQUOT will be killed.

Reported-by: Satoru Takeuchi <takeuchi_satoru@jp.fujitsu.com>
Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 16 +++++++++-------
 fs/btrfs/file.c        | 10 +---------
 2 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3113e0b79b99..4d3774605a85 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3969,12 +3969,16 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 					      data_sinfo->flags, bytes, 1);
 		return -ENOSPC;
 	}
+	ret = btrfs_qgroup_reserve(root, bytes);
+	if (ret)
+		goto out;
 	data_sinfo->bytes_may_use += bytes;
 	trace_btrfs_space_reservation(root->fs_info, "space_info",
 				      data_sinfo->flags, bytes, 1);
+out:
 	spin_unlock(&data_sinfo->lock);
 
-	return 0;
+	return ret;
 }
 
 /*
@@ -3991,6 +3995,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 	data_sinfo = root->fs_info->data_sinfo;
 	spin_lock(&data_sinfo->lock);
 	WARN_ON(data_sinfo->bytes_may_use < bytes);
+	btrfs_qgroup_free(root, bytes);
 	data_sinfo->bytes_may_use -= bytes;
 	trace_btrfs_space_reservation(root->fs_info, "space_info",
 				      data_sinfo->flags, bytes, 0);
@@ -5391,8 +5396,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	spin_unlock(&BTRFS_I(inode)->lock);
 
 	if (root->fs_info->quota_enabled) {
-		ret = btrfs_qgroup_reserve(root, num_bytes +
-					   nr_extents * root->nodesize);
+		ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
 		if (ret)
 			goto out_fail;
 	}
@@ -5400,8 +5404,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
 	if (unlikely(ret)) {
 		if (root->fs_info->quota_enabled)
-			btrfs_qgroup_free(root, num_bytes +
-						nr_extents * root->nodesize);
+			btrfs_qgroup_free(root, nr_extents * root->nodesize);
 		goto out_fail;
 	}
 
@@ -5522,8 +5525,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 	trace_btrfs_space_reservation(root->fs_info, "delalloc",
 				      btrfs_ino(inode), to_free, 0);
 	if (root->fs_info->quota_enabled) {
-		btrfs_qgroup_free(root, num_bytes +
-					dropped * root->nodesize);
+		btrfs_qgroup_free(root, dropped * root->nodesize);
 	}
 
 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fd105c172c8b..faef1d64394d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2549,7 +2549,6 @@ static long btrfs_fallocate(struct file *file, int mode,
 {
 	struct inode *inode = file_inode(file);
 	struct extent_state *cached_state = NULL;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 cur_offset;
 	u64 last_byte;
 	u64 alloc_start;
@@ -2577,11 +2576,6 @@ static long btrfs_fallocate(struct file *file, int mode,
 	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
 	if (ret)
 		return ret;
-	if (root->fs_info->quota_enabled) {
-		ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start);
-		if (ret)
-			goto out_reserve_fail;
-	}
 
 	mutex_lock(&inode->i_mutex);
 	ret = inode_newsize_ok(inode, alloc_end);
@@ -2674,6 +2668,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 		} else if (actual_end > inode->i_size &&
 			   !(mode & FALLOC_FL_KEEP_SIZE)) {
 			struct btrfs_trans_handle *trans;
+			struct btrfs_root *root = BTRFS_I(inode)->root;
 
 			/*
 			 * We didn't need to allocate any more space, but we
@@ -2710,9 +2705,6 @@ static long btrfs_fallocate(struct file *file, int mode,
 			     &cached_state, GFP_NOFS);
 out:
 	mutex_unlock(&inode->i_mutex);
-	if (root->fs_info->quota_enabled)
-		btrfs_qgroup_free(root, alloc_end - alloc_start);
-out_reserve_fail:
 	/* Let go of our reservation. */
 	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
 	return ret;

From e2d1f92399afb6ec518b68867ed10db2585b283a Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Fri, 6 Feb 2015 10:26:52 -0500
Subject: [PATCH 84/93] btrfs: qgroup: do a reservation in a higher level.

There are two problems in qgroup:

a). The PAGE_CACHE is 4K, even when we are writing a data of 1K,
qgroup will reserve a 4K size. It will cause the last 3K in a qgroup
is not available to user.

b). When user is writing a inline data, qgroup will not reserve it,
it means this is a window we can exceed the limit of a qgroup.

The main idea of this patch is reserving the data size of write_bytes
rather than the reserve_bytes. It means qgroup will not care about
the data size btrfs will reserve for user, but only care about the
data size user is going to write. Then reserve it when user want to
write and release it in transaction committed.

In this way, qgroup can be released from the complex procedure in
btrfs and only do the reserve when user want to write and account
when the data is written in commit_transaction().

Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  2 +-
 fs/btrfs/extent-tree.c | 38 ++++-------------------
 fs/btrfs/file.c        |  4 +--
 fs/btrfs/inode.c       | 15 ---------
 fs/btrfs/qgroup.c      | 70 +++---------------------------------------
 fs/btrfs/qgroup.h      |  4 ---
 fs/btrfs/relocation.c  |  2 +-
 7 files changed, 14 insertions(+), 121 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d48b22f31182..851f2355f3c8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3448,7 +3448,7 @@ enum btrfs_reserve_flush_enum {
 	BTRFS_RESERVE_FLUSH_ALL,
 };
 
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4d3774605a85..1eef4ee01d1a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3331,7 +3331,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
 	num_pages *= 16;
 	num_pages *= PAGE_CACHE_SIZE;
 
-	ret = btrfs_check_data_free_space(inode, num_pages);
+	ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
 	if (ret)
 		goto out_put;
 
@@ -3851,7 +3851,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
  * This will check the space that the inode allocates from to make sure we have
  * enough space for bytes.
  */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
 {
 	struct btrfs_space_info *data_sinfo;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3969,7 +3969,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 					      data_sinfo->flags, bytes, 1);
 		return -ENOSPC;
 	}
-	ret = btrfs_qgroup_reserve(root, bytes);
+	ret = btrfs_qgroup_reserve(root, write_bytes);
 	if (ret)
 		goto out;
 	data_sinfo->bytes_may_use += bytes;
@@ -3995,7 +3995,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 	data_sinfo = root->fs_info->data_sinfo;
 	spin_lock(&data_sinfo->lock);
 	WARN_ON(data_sinfo->bytes_may_use < bytes);
-	btrfs_qgroup_free(root, bytes);
 	data_sinfo->bytes_may_use -= bytes;
 	trace_btrfs_space_reservation(root->fs_info, "space_info",
 				      data_sinfo->flags, bytes, 0);
@@ -5243,8 +5242,6 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
 				      u64 qgroup_reserved)
 {
 	btrfs_block_rsv_release(root, rsv, (u64)-1);
-	if (qgroup_reserved)
-		btrfs_qgroup_free(root, qgroup_reserved);
 }
 
 /**
@@ -5478,11 +5475,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 			to_free = 0;
 	}
 	spin_unlock(&BTRFS_I(inode)->lock);
-	if (dropped) {
-		if (root->fs_info->quota_enabled)
-			btrfs_qgroup_free(root, dropped * root->nodesize);
+	if (dropped)
 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
-	}
 
 	if (to_free) {
 		btrfs_block_rsv_release(root, block_rsv, to_free);
@@ -5524,9 +5518,6 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 
 	trace_btrfs_space_reservation(root->fs_info, "delalloc",
 				      btrfs_ino(inode), to_free, 0);
-	if (root->fs_info->quota_enabled) {
-		btrfs_qgroup_free(root, dropped * root->nodesize);
-	}
 
 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
 				to_free);
@@ -5551,7 +5542,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
 {
 	int ret;
 
-	ret = btrfs_check_data_free_space(inode, num_bytes);
+	ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
 	if (ret)
 		return ret;
 
@@ -5727,12 +5718,8 @@ static int pin_down_extent(struct btrfs_root *root,
 
 	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
-	if (reserved) {
-		btrfs_qgroup_update_reserved_bytes(root->fs_info,
-						   root->root_key.objectid,
-						   num_bytes, -1);
+	if (reserved)
 		trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
-	}
 	return 0;
 }
 
@@ -6470,9 +6457,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 		btrfs_put_block_group(cache);
 		trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
 		pin = 0;
-		btrfs_qgroup_update_reserved_bytes(root->fs_info,
-						   root->root_key.objectid,
-						   buf->len, -1);
 	}
 out:
 	if (pin)
@@ -7205,9 +7189,6 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
 			ret = btrfs_discard_extent(root, start, len, NULL);
 		btrfs_add_free_space(cache, start, len);
 		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
-		btrfs_qgroup_update_reserved_bytes(root->fs_info,
-						   root->root_key.objectid,
-						   len, -1);
 	}
 
 	btrfs_put_block_group(cache);
@@ -7446,9 +7427,6 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 	BUG_ON(ret); /* logic error */
 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
 					 0, owner, offset, ins, 1);
-	btrfs_qgroup_update_reserved_bytes(root->fs_info,
-					   root->root_key.objectid,
-					   ins->offset, 1);
 	btrfs_put_block_group(block_group);
 	return ret;
 }
@@ -7595,10 +7573,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 		return ERR_PTR(ret);
 	}
 
-	btrfs_qgroup_update_reserved_bytes(root->fs_info,
-					   root_objectid,
-					   ins.offset, 1);
-
 	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
 	BUG_ON(IS_ERR(buf)); /* -ENOMEM */
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index faef1d64394d..23b6e03f8465 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1510,7 +1510,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		}
 
 		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-		ret = btrfs_check_data_free_space(inode, reserve_bytes);
+		ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
 		if (ret == -ENOSPC &&
 		    (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
 					      BTRFS_INODE_PREALLOC))) {
@@ -2573,7 +2573,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 	 * Make sure we have enough space before we do the
 	 * allocation.
 	 */
-	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
 	if (ret)
 		return ret;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a9f69a0d4b08..27b59b8362f9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -754,9 +754,6 @@ static noinline void submit_compressed_extents(struct inode *inode,
 			}
 			goto out_free;
 		}
-		btrfs_qgroup_update_reserved_bytes(root->fs_info,
-						   root->root_key.objectid,
-						   ins.offset, 1);
 		/*
 		 * here we're doing allocation and writeback of the
 		 * compressed pages
@@ -981,10 +978,6 @@ static noinline int cow_file_range(struct inode *inode,
 		if (ret < 0)
 			goto out_unlock;
 
-		btrfs_qgroup_update_reserved_bytes(root->fs_info,
-						   root->root_key.objectid,
-						   ins.offset, 1);
-
 		em = alloc_extent_map();
 		if (!em) {
 			ret = -ENOMEM;
@@ -7037,10 +7030,6 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 		return ERR_PTR(ret);
 	}
 
-	btrfs_qgroup_update_reserved_bytes(root->fs_info,
-					   root->root_key.objectid,
-					   ins.offset, 1);
-
 	return em;
 }
 
@@ -9595,10 +9584,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 			break;
 		}
 
-		btrfs_qgroup_update_reserved_bytes(root->fs_info,
-						   root->root_key.objectid,
-						   ins.offset, 1);
-
 		btrfs_drop_extent_cache(inode, cur_offset,
 					cur_offset + ins.offset -1, 0);
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index cd291733dc3e..17881ad8ed96 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -72,7 +72,6 @@ struct btrfs_qgroup {
 	/*
 	 * reservation tracking
 	 */
-	u64 may_use;
 	u64 reserved;
 
 	/*
@@ -2383,67 +2382,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int btrfs_qgroup_update_reserved_bytes(struct btrfs_fs_info *fs_info,
-					    u64 ref_root,
-					    u64 num_bytes,
-					    int sign)
-{
-	struct btrfs_root *quota_root;
-	struct btrfs_qgroup *qgroup;
-	int ret = 0;
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
-
-	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
-		return 0;
-
-	if (num_bytes == 0)
-		return 0;
-
-	spin_lock(&fs_info->qgroup_lock);
-	quota_root = fs_info->quota_root;
-	if (!quota_root)
-		goto out;
-
-	qgroup = find_qgroup_rb(fs_info, ref_root);
-	if (!qgroup)
-		goto out;
-
-	ulist_reinit(fs_info->qgroup_ulist);
-	ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
-			(uintptr_t)qgroup, GFP_ATOMIC);
-	if (ret < 0)
-		goto out;
-
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
-		struct btrfs_qgroup *qg;
-		struct btrfs_qgroup_list *glist;
-
-		qg = u64_to_ptr(unode->aux);
-
-		qg->reserved += sign * num_bytes;
-
-		list_for_each_entry(glist, &qg->groups, next_group) {
-			ret = ulist_add(fs_info->qgroup_ulist,
-					glist->group->qgroupid,
-					(uintptr_t)glist->group, GFP_ATOMIC);
-			if (ret < 0)
-				goto out;
-		}
-	}
-
-out:
-	spin_unlock(&fs_info->qgroup_lock);
-	return ret;
-}
-
-/*
- * reserve some space for a qgroup and all its parents. The reservation takes
- * place with start_transaction or dealloc_reserve, similar to ENOSPC
- * accounting. If not enough space is available, EDQUOT is returned.
- * We assume that the requested space is new for all qgroups.
- */
 int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 {
 	struct btrfs_root *quota_root;
@@ -2486,14 +2424,14 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 		qg = u64_to_ptr(unode->aux);
 
 		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
-		    qg->reserved + qg->may_use + (s64)qg->rfer + num_bytes >
+		    qg->reserved + (s64)qg->rfer + num_bytes >
 		    qg->max_rfer) {
 			ret = -EDQUOT;
 			goto out;
 		}
 
 		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
-		    qg->reserved + qg->may_use + (s64)qg->excl + num_bytes >
+		    qg->reserved + (s64)qg->excl + num_bytes >
 		    qg->max_excl) {
 			ret = -EDQUOT;
 			goto out;
@@ -2517,7 +2455,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 
 		qg = u64_to_ptr(unode->aux);
 
-		qg->may_use += num_bytes;
+		qg->reserved += num_bytes;
 	}
 
 out:
@@ -2563,7 +2501,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
 
 		qg = u64_to_ptr(unode->aux);
 
-		qg->may_use -= num_bytes;
+		qg->reserved -= num_bytes;
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ret = ulist_add(fs_info->qgroup_ulist,
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 64d49b8482b3..c5242aa9a4b2 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -94,10 +94,6 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
 			 struct btrfs_qgroup_inherit *inherit);
-int btrfs_qgroup_update_reserved_bytes(struct btrfs_fs_info *fs_info,
-				       u64 ref_root,
-				       u64 num_bytes,
-				       int sign);
 int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
 void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
 
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 840a4eb0f396..74b24b01d574 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3027,7 +3027,7 @@ int prealloc_file_extent_cluster(struct inode *inode,
 	mutex_lock(&inode->i_mutex);
 
 	ret = btrfs_check_data_free_space(inode, cluster->end +
-					  1 - cluster->start);
+					  1 - cluster->start, 0);
 	if (ret)
 		goto out;
 

From 09870d2772b284d0061a5e4d1e1cdf6fb6764344 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Tue, 11 Nov 2014 07:18:22 -0500
Subject: [PATCH 85/93] btrfs: qgroup: return EINVAL if level of parent is not
 higher than child's.

When we create a subvol inheriting a qgroup, we need to check the level
of them. Otherwise, there is a chance a qgroup can inherit another qgroup
at the same level.

Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 17881ad8ed96..4ec86b3eee21 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2230,6 +2230,11 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 				ret = -EINVAL;
 				goto out;
 			}
+
+			if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) {
+				ret = -EINVAL;
+				goto out;
+			}
 			++i_qgroups;
 		}
 	}

From f5a6b1c53bdd44f79e3904c0f5e59f956b49b2c8 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Mon, 24 Nov 2014 10:27:09 -0500
Subject: [PATCH 86/93] btrfs: qgroup: allow to remove qgroup which has parent
 but no child.

When a qgroup has parents but no child, it should be removable in
Theory I think. But currently, we can not remove it when it has
either parent or child.

Example:
	# btrfs quota enable /mnt
	# btrfs qgroup create 1/0 /mnt
	# btrfs qgroup create 2/0 /mnt
	# btrfs qgroup assign 1/0 2/0 /mnt
	# btrfs qgroup show -pcre /mnt
qgroupid rfer  excl  max_rfer max_excl parent  child
-------- ----  ----  -------- -------- ------  -----
0/5      16384 16384 0        0        ---     ---
1/0      0     0     0        0        2/0     ---
2/0      0     0     0        0        ---     1/0

At this time, there is no subvol or qgroup depending on it.
Just a qgroup 2/0 is its parent, but 2/0 can work well without
1/0. So I think 1/0 should be removalbe. But:
	# btrfs qgroup destroy 1/0 /mnt
ERROR: unable to destroy quota group: Device or resource busy

This patch remove the check of qgroup->parent in removing it,
then we can remove a qgroup when it has a parent.

Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4ec86b3eee21..464d1e237512 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1048,7 +1048,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+int __del_qgroup_relation(struct btrfs_trans_handle *trans,
 			      struct btrfs_fs_info *fs_info, u64 src, u64 dst)
 {
 	struct btrfs_root *quota_root;
@@ -1058,7 +1058,6 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	int err;
 
-	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	quota_root = fs_info->quota_root;
 	if (!quota_root) {
 		ret = -EINVAL;
@@ -1089,7 +1088,18 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
 	del_relation_rb(fs_info, src, dst);
 	spin_unlock(&fs_info->qgroup_lock);
 out:
+	return ret;
+}
+
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+	int ret = 0;
+
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
+	ret = __del_qgroup_relation(trans, fs_info, src, dst);
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
 	return ret;
 }
 
@@ -1132,6 +1142,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_root *quota_root;
 	struct btrfs_qgroup *qgroup;
+	struct btrfs_qgroup_list *list;
 	int ret = 0;
 
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
@@ -1146,15 +1157,24 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
 		ret = -ENOENT;
 		goto out;
 	} else {
-		/* check if there are no relations to this qgroup */
-		if (!list_empty(&qgroup->groups) ||
-		    !list_empty(&qgroup->members)) {
+		/* check if there are no children of this qgroup */
+		if (!list_empty(&qgroup->members)) {
 			ret = -EBUSY;
 			goto out;
 		}
 	}
 	ret = del_qgroup_item(trans, quota_root, qgroupid);
 
+	while (!list_empty(&qgroup->groups)) {
+		list = list_first_entry(&qgroup->groups,
+					struct btrfs_qgroup_list, next_group);
+		ret = __del_qgroup_relation(trans, fs_info,
+					   qgroupid,
+					   list->group->qgroupid);
+		if (ret)
+			goto out;
+	}
+
 	spin_lock(&fs_info->qgroup_lock);
 	del_qgroup_rb(quota_root->fs_info, qgroupid);
 	spin_unlock(&fs_info->qgroup_lock);

From 8465ecec9611d60cbbc8e374ecf68453e0dd5b50 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Fri, 27 Feb 2015 16:24:22 +0800
Subject: [PATCH 87/93] btrfs: Check qgroup level in kernel qgroup assign.

Although we have qgroup level check in btrfs-progs, it's not enough
since other programe may still call ioctl directly not using
btrfs-progs. For example, systemd.

But it's btrfs-progs to be blame since we don't provide a
full-function(like subvolume create things) btrfs library with enough
check, and only rely on kernel ioctl.

So Add level checks in kernel too.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h  | 6 ++++++
 fs/btrfs/qgroup.c | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 851f2355f3c8..8b851ac7c3fa 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1061,6 +1061,12 @@ struct btrfs_block_group_item {
 	__le64 flags;
 } __attribute__ ((__packed__));
 
+#define BTRFS_QGROUP_LEVEL_SHIFT		48
+static inline u64 btrfs_qgroup_level(u64 qgroupid)
+{
+	return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
+}
+
 /*
  * is subvolume quota turned on?
  */
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 464d1e237512..4fb9610beec7 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1009,6 +1009,10 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
 	struct btrfs_qgroup_list *list;
 	int ret = 0;
 
+	/* Check the level of src and dst first */
+	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
+		return -EINVAL;
+
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	quota_root = fs_info->quota_root;
 	if (!quota_root) {

From e09fe2d2119800e6060f9b8ba71e072a0eb0fa4d Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Fri, 27 Feb 2015 16:24:23 +0800
Subject: [PATCH 88/93] btrfs: Don't allow subvolid >= (1 <<
 BTRFS_QGROUP_LEVEL_SHIFT) to be created

Btrfs will create qgroup on subvolume creation if quota is enabled, but
qgroup uses the high bits(currently 16 bits) as level, to build the
inheritance.

However it is fully possible a subvolume can be created with a
subvolumeid larger than 1 << BTRFS_QGROUP_LEVEL_SHIFT, so it will be
considered as level 1 and can't be assigned to other qgroup in level 1.

This patch will prevent such things so qgroup inheritance will not be
screwed up.
The downside is very clear, btrfs subvolume number limit will decrease
from (u64 max - 256(fisrt free objectid) - 256(last free objectid)) to
(u48 max -256(first free objectid)).
But we still have near u48(that's 15 digits in dec), so that should not
be a huge problem.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h | 3 ++-
 fs/btrfs/ioctl.c | 7 +++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8b851ac7c3fa..6f364e1d8d3d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -4219,7 +4219,8 @@ int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
 static inline int is_fstree(u64 rootid)
 {
 	if (rootid == BTRFS_FS_TREE_OBJECTID ||
-	    (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+	    ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID &&
+	      !btrfs_qgroup_level(rootid)))
 		return 1;
 	return 0;
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2e66d8e76f32..e38b645c5015 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -456,6 +456,13 @@ static noinline int create_subvol(struct inode *dir,
 	if (ret)
 		return ret;
 
+	/*
+	 * Don't create subvolume whose level is not zero. Or qgroup will be
+	 * screwed up since it assume subvolme qgroup's level to be 0.
+	 */
+	if (btrfs_qgroup_level(objectid))
+		return -ENOSPC;
+
 	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
 	/*
 	 * The same as the snapshot creation, please see the comment

From 3393168d22fd5f1be5b5429a818c10f642e88ae3 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Fri, 27 Feb 2015 16:24:24 +0800
Subject: [PATCH 89/93] btrfs: qgroup: Fix dead judgement on
 qgroup_rescan_leaf() return value.

Old qgroup_rescan_leaf() comment indicates ret == 2 as complete and
cleared INCONSISTENT flag.

This is not true since it will never return 2, and inside it no codes
will clear INCONSISTENT flag.
The flag clearance is done in btrfs_qgroup_rescan_work().
This caused the bug that INCONSISTENT flag is never cleared.

So change the comment and fix the dead judgment.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4fb9610beec7..44192370310b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2560,7 +2560,7 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
 
 /*
  * returns < 0 on error, 0 when more leafs are to be scanned.
- * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared.
+ * returns 1 when done.
  */
 static int
 qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
@@ -2707,7 +2707,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 	mutex_lock(&fs_info->qgroup_rescan_lock);
 	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
 
-	if (err == 2 &&
+	if (err > 0 &&
 	    fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
 		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
 	} else if (err < 0) {
@@ -2717,7 +2717,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 
 	if (err >= 0) {
 		btrfs_info(fs_info, "qgroup scan completed%s",
-			err == 2 ? " (inconsistency flag cleared)" : "");
+			err > 0 ? " (inconsistency flag cleared)" : "");
 	} else {
 		btrfs_err(fs_info, "qgroup scan failed with %d", err);
 	}

From 53b7cde9d5aa58cf7605664f0e34419156b02698 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Fri, 27 Feb 2015 16:24:25 +0800
Subject: [PATCH 90/93] btrfs: Update btrfs qgroup status item when rescan is
 done.

Update qgroup status when rescan is done.

Before this patch, status item is not updated on rescan finish, which
causing the RESCAN and INCONSISTENT flags never cleared.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 44192370310b..2e04e99f1e5b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2665,6 +2665,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 	struct ulist *tmp = NULL, *qgroups = NULL;
 	struct extent_buffer *scratch_leaf = NULL;
 	int err = -ENOMEM;
+	int ret = 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -2715,6 +2716,28 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 	}
 	mutex_unlock(&fs_info->qgroup_rescan_lock);
 
+	/*
+	 * only update status, since the previous part has alreay updated the
+	 * qgroup info.
+	 */
+	trans = btrfs_start_transaction(fs_info->quota_root, 1);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		btrfs_err(fs_info,
+			  "fail to start transaction for status update: %d\n",
+			  err);
+		goto done;
+	}
+	ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root);
+	if (ret < 0) {
+		err = ret;
+		btrfs_err(fs_info, "fail to update qgroup status: %d\n",
+			  err);
+		btrfs_abort_transaction(trans, fs_info->quota_root, err);
+		goto done;
+	}
+	btrfs_end_transaction(trans, fs_info->quota_root);
+
 	if (err >= 0) {
 		btrfs_info(fs_info, "qgroup scan completed%s",
 			err > 0 ? " (inconsistency flag cleared)" : "");
@@ -2722,6 +2745,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 		btrfs_err(fs_info, "qgroup scan failed with %d", err);
 	}
 
+done:
 	complete_all(&fs_info->qgroup_rescan_completion);
 }
 
@@ -2756,7 +2780,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 			mutex_unlock(&fs_info->qgroup_rescan_lock);
 			goto err;
 		}
-
 		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
 	}
 

From 8ea0ec9e011eb542a3e7b1171776aa4877cf8a90 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Fri, 27 Feb 2015 16:24:26 +0800
Subject: [PATCH 91/93] btrfs: qgroup: clear STATUS_FLAG_ON in disabling quota.

we forgot to clear STATUS_FLAG_ON in quota_disable(), it
will cause a problem shown as below:

	# mount /dev/sdc /mnt
	# btrfs quota enable /mnt
	# btrfs quota disable /mnt
	# btrfs quota rescan /mnt
	quota rescan started <--- expecting it fail here.
	# echo $?
	0

Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 2e04e99f1e5b..f5f2baae7aa3 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -966,6 +966,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 	fs_info->pending_quota_state = 0;
 	quota_root = fs_info->quota_root;
 	fs_info->quota_root = NULL;
+	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
 	spin_unlock(&fs_info->qgroup_lock);
 
 	btrfs_free_qgroup_config(fs_info);

From 9c8b35b1ba21bbf7527056f394aa6109424c55ef Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Fri, 27 Feb 2015 16:24:27 +0800
Subject: [PATCH 92/93] btrfs: quota: Automatically update related qgroups or
 mark INCONSISTENT flags when assigning/deleting a qgroup relations.

Operation like qgroups assigning/deleting qgroup relations will mostly
cause qgroup data inconsistent, since it needs to do the full rescan to
determine whether shared extents are exclusive or still shared in
parent qgroups.

But there are some exceptions, like qgroup with only exclusive extents
(qgroup->excl == qgroup->rfer), in that case, we only needs to
modify all its parents' excl and rfer.

So this patch adds a quick path for such qgroup in qgroup
assign/remove routine, and if quick path failed, the qgroup status will
be marked INCONSISTENT, and return 1 to info user-land.

BTW since the quick path is much the same of qgroup_excl_accounting(),
so move the core of it to __qgroup_excl_accounting() and reuse it.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 183 +++++++++++++++++++++++++++++++---------------
 1 file changed, 126 insertions(+), 57 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index f5f2baae7aa3..3d6546581bb9 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1001,6 +1001,110 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
 		list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
 }
 
+/*
+ * The easy accounting, if we are adding/removing the only ref for an extent
+ * then this qgroup and all of the parent qgroups get their refrence and
+ * exclusive counts adjusted.
+ *
+ * Caller should hold fs_info->qgroup_lock.
+ */
+static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
+				    struct ulist *tmp, u64 ref_root,
+				    u64 num_bytes, int sign)
+{
+	struct btrfs_qgroup *qgroup;
+	struct btrfs_qgroup_list *glist;
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	int ret = 0;
+
+	qgroup = find_qgroup_rb(fs_info, ref_root);
+	if (!qgroup)
+		goto out;
+
+	qgroup->rfer += sign * num_bytes;
+	qgroup->rfer_cmpr += sign * num_bytes;
+
+	WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+	qgroup->excl += sign * num_bytes;
+	qgroup->excl_cmpr += sign * num_bytes;
+	if (sign > 0)
+		qgroup->reserved -= num_bytes;
+
+	qgroup_dirty(fs_info, qgroup);
+
+	/* Get all of the parent groups that contain this qgroup */
+	list_for_each_entry(glist, &qgroup->groups, next_group) {
+		ret = ulist_add(tmp, glist->group->qgroupid,
+				ptr_to_u64(glist->group), GFP_ATOMIC);
+		if (ret < 0)
+			goto out;
+	}
+
+	/* Iterate all of the parents and adjust their reference counts */
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(tmp, &uiter))) {
+		qgroup = u64_to_ptr(unode->aux);
+		qgroup->rfer += sign * num_bytes;
+		qgroup->rfer_cmpr += sign * num_bytes;
+		WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+		qgroup->excl += sign * num_bytes;
+		if (sign > 0)
+			qgroup->reserved -= num_bytes;
+		qgroup->excl_cmpr += sign * num_bytes;
+		qgroup_dirty(fs_info, qgroup);
+
+		/* Add any parents of the parents */
+		list_for_each_entry(glist, &qgroup->groups, next_group) {
+			ret = ulist_add(tmp, glist->group->qgroupid,
+					ptr_to_u64(glist->group), GFP_ATOMIC);
+			if (ret < 0)
+				goto out;
+		}
+	}
+	ret = 0;
+out:
+	return ret;
+}
+
+
+/*
+ * Quick path for updating qgroup with only excl refs.
+ *
+ * In that case, just update all parent will be enough.
+ * Or we needs to do a full rescan.
+ * Caller should also hold fs_info->qgroup_lock.
+ *
+ * Return 0 for quick update, return >0 for need to full rescan
+ * and mark INCONSISTENT flag.
+ * Return < 0 for other error.
+ */
+static int quick_update_accounting(struct btrfs_fs_info *fs_info,
+				   struct ulist *tmp, u64 src, u64 dst,
+				   int sign)
+{
+	struct btrfs_qgroup *qgroup;
+	int ret = 1;
+	int err = 0;
+
+	qgroup = find_qgroup_rb(fs_info, src);
+	if (!qgroup)
+		goto out;
+	if (qgroup->excl == qgroup->rfer) {
+		ret = 0;
+		err = __qgroup_excl_accounting(fs_info, tmp, dst,
+					       qgroup->excl, sign);
+		if (err < 0) {
+			ret = err;
+			goto out;
+		}
+	}
+out:
+	if (ret)
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+	return ret;
+}
+
 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
 			      struct btrfs_fs_info *fs_info, u64 src, u64 dst)
 {
@@ -1008,8 +1112,13 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
 	struct btrfs_qgroup *parent;
 	struct btrfs_qgroup *member;
 	struct btrfs_qgroup_list *list;
+	struct ulist *tmp;
 	int ret = 0;
 
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+
 	/* Check the level of src and dst first */
 	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
 		return -EINVAL;
@@ -1047,9 +1156,15 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
 
 	spin_lock(&fs_info->qgroup_lock);
 	ret = add_relation_rb(quota_root->fs_info, src, dst);
+	if (ret < 0) {
+		spin_unlock(&fs_info->qgroup_lock);
+		goto out;
+	}
+	ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
 	spin_unlock(&fs_info->qgroup_lock);
 out:
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+	ulist_free(tmp);
 	return ret;
 }
 
@@ -1060,9 +1175,14 @@ int __del_qgroup_relation(struct btrfs_trans_handle *trans,
 	struct btrfs_qgroup *parent;
 	struct btrfs_qgroup *member;
 	struct btrfs_qgroup_list *list;
+	struct ulist *tmp;
 	int ret = 0;
 	int err;
 
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+
 	quota_root = fs_info->quota_root;
 	if (!quota_root) {
 		ret = -EINVAL;
@@ -1091,8 +1211,10 @@ int __del_qgroup_relation(struct btrfs_trans_handle *trans,
 
 	spin_lock(&fs_info->qgroup_lock);
 	del_relation_rb(fs_info, src, dst);
+	ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
 	spin_unlock(&fs_info->qgroup_lock);
 out:
+	ulist_free(tmp);
 	return ret;
 }
 
@@ -1400,19 +1522,10 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-/*
- * The easy accounting, if we are adding/removing the only ref for an extent
- * then this qgroup and all of the parent qgroups get their refrence and
- * exclusive counts adjusted.
- */
 static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
 				  struct btrfs_qgroup_operation *oper)
 {
-	struct btrfs_qgroup *qgroup;
 	struct ulist *tmp;
-	struct btrfs_qgroup_list *glist;
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
 	int sign = 0;
 	int ret = 0;
 
@@ -1423,9 +1536,7 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
 	spin_lock(&fs_info->qgroup_lock);
 	if (!fs_info->quota_root)
 		goto out;
-	qgroup = find_qgroup_rb(fs_info, oper->ref_root);
-	if (!qgroup)
-		goto out;
+
 	switch (oper->type) {
 	case BTRFS_QGROUP_OPER_ADD_EXCL:
 		sign = 1;
@@ -1436,47 +1547,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
 	default:
 		ASSERT(0);
 	}
-	qgroup->rfer += sign * oper->num_bytes;
-	qgroup->rfer_cmpr += sign * oper->num_bytes;
-
-	WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
-	qgroup->excl += sign * oper->num_bytes;
-	qgroup->excl_cmpr += sign * oper->num_bytes;
-	if (sign > 0)
-		qgroup->reserved -= oper->num_bytes;
-
-	qgroup_dirty(fs_info, qgroup);
-
-	/* Get all of the parent groups that contain this qgroup */
-	list_for_each_entry(glist, &qgroup->groups, next_group) {
-		ret = ulist_add(tmp, glist->group->qgroupid,
-				ptr_to_u64(glist->group), GFP_ATOMIC);
-		if (ret < 0)
-			goto out;
-	}
-
-	/* Iterate all of the parents and adjust their reference counts */
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(tmp, &uiter))) {
-		qgroup = u64_to_ptr(unode->aux);
-		qgroup->rfer += sign * oper->num_bytes;
-		qgroup->rfer_cmpr += sign * oper->num_bytes;
-		WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
-		qgroup->excl += sign * oper->num_bytes;
-		if (sign > 0)
-			qgroup->reserved -= oper->num_bytes;
-		qgroup->excl_cmpr += sign * oper->num_bytes;
-		qgroup_dirty(fs_info, qgroup);
-
-		/* Add any parents of the parents */
-		list_for_each_entry(glist, &qgroup->groups, next_group) {
-			ret = ulist_add(tmp, glist->group->qgroupid,
-					ptr_to_u64(glist->group), GFP_ATOMIC);
-			if (ret < 0)
-				goto out;
-		}
-	}
-	ret = 0;
+	ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root,
+				       oper->num_bytes, sign);
 out:
 	spin_unlock(&fs_info->qgroup_lock);
 	ulist_free(tmp);
@@ -2732,10 +2804,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 	ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root);
 	if (ret < 0) {
 		err = ret;
-		btrfs_err(fs_info, "fail to update qgroup status: %d\n",
-			  err);
-		btrfs_abort_transaction(trans, fs_info->quota_root, err);
-		goto done;
+		btrfs_err(fs_info, "fail to update qgroup status: %d\n", err);
 	}
 	btrfs_end_transaction(trans, fs_info->quota_root);
 

From e082f56313f374d723b0366978ddb062c8fe79ea Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Fri, 27 Feb 2015 16:24:28 +0800
Subject: [PATCH 93/93] btrfs: quota: Update quota tree after qgroup
 relationship change.

Previous patch modified the in memory struct but it's not written in
quota tree until next commit.
So user will still get old data using "btrfs qgroup show" after
assign/remove.

This patch will call btrfs_run_qgroups in assign ioctl so it will be
updated to in memory quota trees and user will get up-to-date results.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e38b645c5015..ca5d968f4c37 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4641,6 +4641,11 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
 						sa->src, sa->dst);
 	}
 
+	/* update qgroup status and info */
+	err = btrfs_run_qgroups(trans, root->fs_info);
+	if (err < 0)
+		btrfs_error(root->fs_info, ret,
+			    "failed to update qgroup status and info\n");
 	err = btrfs_end_transaction(trans, root);
 	if (err && !ret)
 		ret = err;