From cf5388307a2b4faab4b11d732b61c85741be6169 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Wed, 4 Jul 2012 15:42:48 +0200
Subject: [PATCH 01/15] Btrfs: fix buffer leak in btrfs_next_old_leaf

When calling btrfs_next_old_leaf, we were leaking an extent buffer in the
rare case of using the deadlock avoidance code needed for the tree mod log.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/ctree.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8206b3900587..67fe46fdee6f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5127,6 +5127,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
 				 * locked. To solve this situation, we give up
 				 * on our lock and cycle.
 				 */
+				free_extent_buffer(next);
 				btrfs_release_path(path);
 				cond_resched();
 				goto again;

From 097b8a7c9e48e2cb50fd0eb9315791921beaf484 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Thu, 21 Jun 2012 11:08:04 +0200
Subject: [PATCH 02/15] Btrfs: join tree mod log code with the code holding
 back delayed refs

We've got two mechanisms both required for reliable backref resolving (tree
mod log and holding back delayed refs). You cannot make use of one without
the other. So instead of requiring the user of this mechanism to setup both
correctly, we join them into a single interface.

Additionally, we stop inserting non-blockers into fs_info->tree_mod_seq_list
as we did before, which was of no value.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/backref.c     |  30 ++---
 fs/btrfs/backref.h     |   3 +-
 fs/btrfs/ctree.c       | 275 +++++++++++++++++++++++++----------------
 fs/btrfs/ctree.h       |  31 +++--
 fs/btrfs/delayed-ref.c |  42 ++++---
 fs/btrfs/delayed-ref.h |  49 +-------
 fs/btrfs/disk-io.c     |   2 +
 fs/btrfs/extent-tree.c |  21 ++--
 fs/btrfs/transaction.c |   4 -
 9 files changed, 239 insertions(+), 218 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a383c18e74e8..7d80ddd8f544 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -773,9 +773,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
  */
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info, u64 bytenr,
-			     u64 delayed_ref_seq, u64 time_seq,
-			     struct ulist *refs, struct ulist *roots,
-			     const u64 *extent_item_pos)
+			     u64 time_seq, struct ulist *refs,
+			     struct ulist *roots, const u64 *extent_item_pos)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -837,7 +836,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 				btrfs_put_delayed_ref(&head->node);
 				goto again;
 			}
-			ret = __add_delayed_refs(head, delayed_ref_seq,
+			ret = __add_delayed_refs(head, time_seq,
 						 &prefs_delayed);
 			mutex_unlock(&head->mutex);
 			if (ret) {
@@ -981,8 +980,7 @@ static void free_leaf_list(struct ulist *blocks)
  */
 static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 				struct btrfs_fs_info *fs_info, u64 bytenr,
-				u64 delayed_ref_seq, u64 time_seq,
-				struct ulist **leafs,
+				u64 time_seq, struct ulist **leafs,
 				const u64 *extent_item_pos)
 {
 	struct ulist *tmp;
@@ -997,7 +995,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	}
 
-	ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+	ret = find_parent_nodes(trans, fs_info, bytenr,
 				time_seq, *leafs, tmp, extent_item_pos);
 	ulist_free(tmp);
 
@@ -1024,8 +1022,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
  */
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 				struct btrfs_fs_info *fs_info, u64 bytenr,
-				u64 delayed_ref_seq, u64 time_seq,
-				struct ulist **roots)
+				u64 time_seq, struct ulist **roots)
 {
 	struct ulist *tmp;
 	struct ulist_node *node = NULL;
@@ -1043,7 +1040,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
-		ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+		ret = find_parent_nodes(trans, fs_info, bytenr,
 					time_seq, tmp, *roots, NULL);
 		if (ret < 0 && ret != -ENOENT) {
 			ulist_free(tmp);
@@ -1376,11 +1373,9 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	struct ulist *roots = NULL;
 	struct ulist_node *ref_node = NULL;
 	struct ulist_node *root_node = NULL;
-	struct seq_list seq_elem = {};
 	struct seq_list tree_mod_seq_elem = {};
 	struct ulist_iterator ref_uiter;
 	struct ulist_iterator root_uiter;
-	struct btrfs_delayed_ref_root *delayed_refs = NULL;
 
 	pr_debug("resolving all inodes for extent %llu\n",
 			extent_item_objectid);
@@ -1391,16 +1386,11 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 		trans = btrfs_join_transaction(fs_info->extent_root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
-
-		delayed_refs = &trans->transaction->delayed_refs;
-		spin_lock(&delayed_refs->lock);
-		btrfs_get_delayed_seq(delayed_refs, &seq_elem);
-		spin_unlock(&delayed_refs->lock);
 		btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 	}
 
 	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
-				   seq_elem.seq, tree_mod_seq_elem.seq, &refs,
+				   tree_mod_seq_elem.seq, &refs,
 				   &extent_item_pos);
 	if (ret)
 		goto out;
@@ -1408,8 +1398,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	ULIST_ITER_INIT(&ref_uiter);
 	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
 		ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
-						seq_elem.seq,
-						tree_mod_seq_elem.seq, &roots);
+					   tree_mod_seq_elem.seq, &roots);
 		if (ret)
 			break;
 		ULIST_ITER_INIT(&root_uiter);
@@ -1431,7 +1420,6 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 out:
 	if (!search_commit_root) {
 		btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
-		btrfs_put_delayed_seq(delayed_refs, &seq_elem);
 		btrfs_end_transaction(trans, fs_info->extent_root);
 	}
 
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index c18d8ac7b795..3a1ad3e2dcb0 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -58,8 +58,7 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 				struct btrfs_fs_info *fs_info, u64 bytenr,
-				u64 delayed_ref_seq, u64 time_seq,
-				struct ulist **roots);
+				u64 time_seq, struct ulist **roots);
 
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 67fe46fdee6f..bef68ab32204 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -321,7 +321,7 @@ struct tree_mod_root {
 struct tree_mod_elem {
 	struct rb_node node;
 	u64 index;		/* shifted logical */
-	struct seq_list elem;
+	u64 seq;
 	enum mod_log_op op;
 
 	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
@@ -341,20 +341,50 @@ struct tree_mod_elem {
 	struct tree_mod_root old_root;
 };
 
-static inline void
-__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
+static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
 {
-	elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
-	list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+	read_lock(&fs_info->tree_mod_log_lock);
 }
 
-void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
-			    struct seq_list *elem)
+static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
 {
-	elem->flags = 1;
+	read_unlock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
+{
+	write_lock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
+{
+	write_unlock(&fs_info->tree_mod_log_lock);
+}
+
+/*
+ * This adds a new blocker to the tree mod log's blocker list if the @elem
+ * passed does not already have a sequence number set. So when a caller expects
+ * to record tree modifications, it should ensure to set elem->seq to zero
+ * before calling btrfs_get_tree_mod_seq.
+ * Returns a fresh, unused tree log modification sequence number, even if no new
+ * blocker was added.
+ */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			   struct seq_list *elem)
+{
+	u64 seq;
+
+	tree_mod_log_write_lock(fs_info);
 	spin_lock(&fs_info->tree_mod_seq_lock);
-	__get_tree_mod_seq(fs_info, elem);
+	if (!elem->seq) {
+		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+	}
+	seq = btrfs_inc_tree_mod_seq(fs_info);
 	spin_unlock(&fs_info->tree_mod_seq_lock);
+	tree_mod_log_write_unlock(fs_info);
+
+	return seq;
 }
 
 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
@@ -371,41 +401,46 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 	if (!seq_putting)
 		return;
 
-	BUG_ON(!(elem->flags & 1));
 	spin_lock(&fs_info->tree_mod_seq_lock);
 	list_del(&elem->list);
+	elem->seq = 0;
 
 	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
-		if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
+		if (cur_elem->seq < min_seq) {
 			if (seq_putting > cur_elem->seq) {
 				/*
 				 * blocker with lower sequence number exists, we
 				 * cannot remove anything from the log
 				 */
-				goto out;
+				spin_unlock(&fs_info->tree_mod_seq_lock);
+				return;
 			}
 			min_seq = cur_elem->seq;
 		}
 	}
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+
+	/*
+	 * we removed the lowest blocker from the blocker list, so there may be
+	 * more processible delayed refs.
+	 */
+	wake_up(&fs_info->tree_mod_seq_wait);
 
 	/*
 	 * anything that's lower than the lowest existing (read: blocked)
 	 * sequence number can be removed from the tree.
 	 */
-	write_lock(&fs_info->tree_mod_log_lock);
+	tree_mod_log_write_lock(fs_info);
 	tm_root = &fs_info->tree_mod_log;
 	for (node = rb_first(tm_root); node; node = next) {
 		next = rb_next(node);
 		tm = container_of(node, struct tree_mod_elem, node);
-		if (tm->elem.seq > min_seq)
+		if (tm->seq > min_seq)
 			continue;
 		rb_erase(node, tm_root);
-		list_del(&tm->elem.list);
 		kfree(tm);
 	}
-	write_unlock(&fs_info->tree_mod_log_lock);
-out:
-	spin_unlock(&fs_info->tree_mod_seq_lock);
+	tree_mod_log_write_unlock(fs_info);
 }
 
 /*
@@ -423,11 +458,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
 	struct rb_node **new;
 	struct rb_node *parent = NULL;
 	struct tree_mod_elem *cur;
-	int ret = 0;
 
-	BUG_ON(!tm || !tm->elem.seq);
+	BUG_ON(!tm || !tm->seq);
 
-	write_lock(&fs_info->tree_mod_log_lock);
 	tm_root = &fs_info->tree_mod_log;
 	new = &tm_root->rb_node;
 	while (*new) {
@@ -437,88 +470,81 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
 			new = &((*new)->rb_left);
 		else if (cur->index > tm->index)
 			new = &((*new)->rb_right);
-		else if (cur->elem.seq < tm->elem.seq)
+		else if (cur->seq < tm->seq)
 			new = &((*new)->rb_left);
-		else if (cur->elem.seq > tm->elem.seq)
+		else if (cur->seq > tm->seq)
 			new = &((*new)->rb_right);
 		else {
 			kfree(tm);
-			ret = -EEXIST;
-			goto unlock;
+			return -EEXIST;
 		}
 	}
 
 	rb_link_node(&tm->node, parent, new);
 	rb_insert_color(&tm->node, tm_root);
-unlock:
-	write_unlock(&fs_info->tree_mod_log_lock);
-	return ret;
+	return 0;
 }
 
+/*
+ * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
+ * returns zero with the tree_mod_log_lock acquired. The caller must hold
+ * this until all tree mod log insertions are recorded in the rb tree and then
+ * call tree_mod_log_write_unlock() to release.
+ */
 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
 				    struct extent_buffer *eb) {
 	smp_mb();
 	if (list_empty(&(fs_info)->tree_mod_seq_list))
 		return 1;
-	if (!eb)
-		return 0;
-	if (btrfs_header_level(eb) == 0)
+	if (eb && btrfs_header_level(eb) == 0)
 		return 1;
+
+	tree_mod_log_write_lock(fs_info);
+	if (list_empty(&fs_info->tree_mod_seq_list)) {
+		/*
+		 * someone emptied the list while we were waiting for the lock.
+		 * we must not add to the list when no blocker exists.
+		 */
+		tree_mod_log_write_unlock(fs_info);
+		return 1;
+	}
+
 	return 0;
 }
 
 /*
- * This allocates memory and gets a tree modification sequence number when
- * needed.
+ * This allocates memory and gets a tree modification sequence number.
  *
- * Returns 0 when no sequence number is needed, < 0 on error.
- * Returns 1 when a sequence number was added. In this case,
- * fs_info->tree_mod_seq_lock was acquired and must be released by the caller
- * after inserting into the rb tree.
+ * Returns <0 on error.
+ * Returns >0 (the added sequence number) on success.
  */
 static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
 				 struct tree_mod_elem **tm_ret)
 {
 	struct tree_mod_elem *tm;
-	int seq;
 
-	if (tree_mod_dont_log(fs_info, NULL))
-		return 0;
-
-	tm = *tm_ret = kzalloc(sizeof(*tm), flags);
+	/*
+	 * once we switch from spin locks to something different, we should
+	 * honor the flags parameter here.
+	 */
+	tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
 	if (!tm)
 		return -ENOMEM;
 
-	tm->elem.flags = 0;
-	spin_lock(&fs_info->tree_mod_seq_lock);
-	if (list_empty(&fs_info->tree_mod_seq_list)) {
-		/*
-		 * someone emptied the list while we were waiting for the lock.
-		 * we must not add to the list, because no blocker exists. items
-		 * are removed from the list only when the existing blocker is
-		 * removed from the list.
-		 */
-		kfree(tm);
-		seq = 0;
-		spin_unlock(&fs_info->tree_mod_seq_lock);
-	} else {
-		__get_tree_mod_seq(fs_info, &tm->elem);
-		seq = tm->elem.seq;
-	}
-
-	return seq;
+	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
+	return tm->seq;
 }
 
-static noinline int
-tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
-			     struct extent_buffer *eb, int slot,
-			     enum mod_log_op op, gfp_t flags)
+static inline int
+__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
+			  struct extent_buffer *eb, int slot,
+			  enum mod_log_op op, gfp_t flags)
 {
-	struct tree_mod_elem *tm;
 	int ret;
+	struct tree_mod_elem *tm;
 
 	ret = tree_mod_alloc(fs_info, flags, &tm);
-	if (ret <= 0)
+	if (ret < 0)
 		return ret;
 
 	tm->index = eb->start >> PAGE_CACHE_SHIFT;
@@ -530,8 +556,22 @@ tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
 	tm->slot = slot;
 	tm->generation = btrfs_node_ptr_generation(eb, slot);
 
-	ret = __tree_mod_log_insert(fs_info, tm);
-	spin_unlock(&fs_info->tree_mod_seq_lock);
+	return __tree_mod_log_insert(fs_info, tm);
+}
+
+static noinline int
+tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
+			     struct extent_buffer *eb, int slot,
+			     enum mod_log_op op, gfp_t flags)
+{
+	int ret;
+
+	if (tree_mod_dont_log(fs_info, eb))
+		return 0;
+
+	ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
+
+	tree_mod_log_write_unlock(fs_info);
 	return ret;
 }
 
@@ -542,6 +582,14 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 	return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
 }
 
+static noinline int
+tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
+			     struct extent_buffer *eb, int slot,
+			     enum mod_log_op op)
+{
+	return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
+}
+
 static noinline int
 tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 			 struct extent_buffer *eb, int dst_slot, int src_slot,
@@ -555,14 +603,14 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 		return 0;
 
 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
-		ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
+		ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
 					      MOD_LOG_KEY_REMOVE_WHILE_MOVING);
 		BUG_ON(ret < 0);
 	}
 
 	ret = tree_mod_alloc(fs_info, flags, &tm);
-	if (ret <= 0)
-		return ret;
+	if (ret < 0)
+		goto out;
 
 	tm->index = eb->start >> PAGE_CACHE_SHIFT;
 	tm->slot = src_slot;
@@ -571,10 +619,26 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 	tm->op = MOD_LOG_MOVE_KEYS;
 
 	ret = __tree_mod_log_insert(fs_info, tm);
-	spin_unlock(&fs_info->tree_mod_seq_lock);
+out:
+	tree_mod_log_write_unlock(fs_info);
 	return ret;
 }
 
+static inline void
+__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+{
+	int i;
+	u32 nritems;
+	int ret;
+
+	nritems = btrfs_header_nritems(eb);
+	for (i = nritems - 1; i >= 0; i--) {
+		ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
+					      MOD_LOG_KEY_REMOVE_WHILE_FREEING);
+		BUG_ON(ret < 0);
+	}
+}
+
 static noinline int
 tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 			 struct extent_buffer *old_root,
@@ -583,9 +647,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 	struct tree_mod_elem *tm;
 	int ret;
 
+	if (tree_mod_dont_log(fs_info, NULL))
+		return 0;
+
+	__tree_mod_log_free_eb(fs_info, old_root);
+
 	ret = tree_mod_alloc(fs_info, flags, &tm);
-	if (ret <= 0)
-		return ret;
+	if (ret < 0)
+		goto out;
 
 	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
 	tm->old_root.logical = old_root->start;
@@ -594,7 +663,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 	tm->op = MOD_LOG_ROOT_REPLACE;
 
 	ret = __tree_mod_log_insert(fs_info, tm);
-	spin_unlock(&fs_info->tree_mod_seq_lock);
+out:
+	tree_mod_log_write_unlock(fs_info);
 	return ret;
 }
 
@@ -608,7 +678,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
 	struct tree_mod_elem *found = NULL;
 	u64 index = start >> PAGE_CACHE_SHIFT;
 
-	read_lock(&fs_info->tree_mod_log_lock);
+	tree_mod_log_read_lock(fs_info);
 	tm_root = &fs_info->tree_mod_log;
 	node = tm_root->rb_node;
 	while (node) {
@@ -617,18 +687,18 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
 			node = node->rb_left;
 		} else if (cur->index > index) {
 			node = node->rb_right;
-		} else if (cur->elem.seq < min_seq) {
+		} else if (cur->seq < min_seq) {
 			node = node->rb_left;
 		} else if (!smallest) {
 			/* we want the node with the highest seq */
 			if (found)
-				BUG_ON(found->elem.seq > cur->elem.seq);
+				BUG_ON(found->seq > cur->seq);
 			found = cur;
 			node = node->rb_left;
-		} else if (cur->elem.seq > min_seq) {
+		} else if (cur->seq > min_seq) {
 			/* we want the node with the smallest seq */
 			if (found)
-				BUG_ON(found->elem.seq < cur->elem.seq);
+				BUG_ON(found->seq < cur->seq);
 			found = cur;
 			node = node->rb_right;
 		} else {
@@ -636,7 +706,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
 			break;
 		}
 	}
-	read_unlock(&fs_info->tree_mod_log_lock);
+	tree_mod_log_read_unlock(fs_info);
 
 	return found;
 }
@@ -664,7 +734,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
 	return __tree_mod_log_search(fs_info, start, min_seq, 0);
 }
 
-static inline void
+static noinline void
 tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 		     struct extent_buffer *src, unsigned long dst_offset,
 		     unsigned long src_offset, int nr_items)
@@ -675,18 +745,23 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 	if (tree_mod_dont_log(fs_info, NULL))
 		return;
 
-	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
+		tree_mod_log_write_unlock(fs_info);
 		return;
+	}
 
-	/* speed this up by single seq for all operations? */
 	for (i = 0; i < nr_items; i++) {
-		ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
-					      MOD_LOG_KEY_REMOVE);
+		ret = tree_mod_log_insert_key_locked(fs_info, src,
+						     i + src_offset,
+						     MOD_LOG_KEY_REMOVE);
 		BUG_ON(ret < 0);
-		ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
-					      MOD_LOG_KEY_ADD);
+		ret = tree_mod_log_insert_key_locked(fs_info, dst,
+						     i + dst_offset,
+						     MOD_LOG_KEY_ADD);
 		BUG_ON(ret < 0);
 	}
+
+	tree_mod_log_write_unlock(fs_info);
 }
 
 static inline void
@@ -699,7 +774,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 	BUG_ON(ret < 0);
 }
 
-static inline void
+static noinline void
 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
 			  struct extent_buffer *eb,
 			  struct btrfs_disk_key *disk_key, int slot, int atomic)
@@ -712,30 +787,22 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
 	BUG_ON(ret < 0);
 }
 
-static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
-				 struct extent_buffer *eb)
+static noinline void
+tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 {
-	int i;
-	int ret;
-	u32 nritems;
-
 	if (tree_mod_dont_log(fs_info, eb))
 		return;
 
-	nritems = btrfs_header_nritems(eb);
-	for (i = nritems - 1; i >= 0; i--) {
-		ret = tree_mod_log_insert_key(fs_info, eb, i,
-					      MOD_LOG_KEY_REMOVE_WHILE_FREEING);
-		BUG_ON(ret < 0);
-	}
+	__tree_mod_log_free_eb(fs_info, eb);
+
+	tree_mod_log_write_unlock(fs_info);
 }
 
-static inline void
+static noinline void
 tree_mod_log_set_root_pointer(struct btrfs_root *root,
 			      struct extent_buffer *new_root_node)
 {
 	int ret;
-	tree_mod_log_free_eb(root->fs_info, root->node);
 	ret = tree_mod_log_insert_root(root->fs_info, root->node,
 				       new_root_node, GFP_NOFS);
 	BUG_ON(ret < 0);
@@ -1069,7 +1136,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 	unsigned long p_size = sizeof(struct btrfs_key_ptr);
 
 	n = btrfs_header_nritems(eb);
-	while (tm && tm->elem.seq >= time_seq) {
+	while (tm && tm->seq >= time_seq) {
 		/*
 		 * all the operations are recorded with the operator used for
 		 * the modification. as we're going backwards, we do the
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 84ac723f58f8..8f8dc46f44e7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1030,6 +1030,13 @@ struct btrfs_block_group_cache {
 	struct list_head cluster_list;
 };
 
+/* delayed seq elem */
+struct seq_list {
+	struct list_head list;
+	u64 seq;
+};
+
+/* fs_info */
 struct reloc_control;
 struct btrfs_device;
 struct btrfs_fs_devices;
@@ -1144,6 +1151,8 @@ struct btrfs_fs_info {
 	spinlock_t tree_mod_seq_lock;
 	atomic_t tree_mod_seq;
 	struct list_head tree_mod_seq_list;
+	struct seq_list tree_mod_seq_elem;
+	wait_queue_head_t tree_mod_seq_wait;
 
 	/* this protects tree_mod_log */
 	rwlock_t tree_mod_log_lock;
@@ -2798,6 +2807,16 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 	kfree(fs_info);
 }
 
+/* tree mod log functions from ctree.c */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			   struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			    struct seq_list *elem);
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
+{
+	return atomic_inc_return(&fs_info->tree_mod_seq);
+}
+
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
 			struct btrfs_path *path,
@@ -3157,18 +3176,6 @@ void btrfs_reada_detach(void *handle);
 int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
 			 u64 start, int err);
 
-/* delayed seq elem */
-struct seq_list {
-	struct list_head list;
-	u64 seq;
-	u32 flags;
-};
-
-void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
-			    struct seq_list *elem);
-void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
-			    struct seq_list *elem);
-
 static inline int is_fstree(u64 rootid)
 {
 	if (rootid == BTRFS_FS_TREE_OBJECTID ||
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 13ae7b04790e..21a757717637 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -233,22 +233,26 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+			    struct btrfs_delayed_ref_root *delayed_refs,
 			    u64 seq)
 {
 	struct seq_list *elem;
+	int ret = 0;
 
-	assert_spin_locked(&delayed_refs->lock);
-	if (list_empty(&delayed_refs->seq_head))
-		return 0;
-
-	elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
-	if (seq >= elem->seq) {
-		pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
-			 seq, elem->seq, delayed_refs);
-		return 1;
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	if (!list_empty(&fs_info->tree_mod_seq_list)) {
+		elem = list_first_entry(&fs_info->tree_mod_seq_list,
+					struct seq_list, list);
+		if (seq >= elem->seq) {
+			pr_debug("holding back delayed_ref %llu, lowest is "
+				 "%llu (%p)\n", seq, elem->seq, delayed_refs);
+			ret = 1;
+		}
 	}
-	return 0;
+
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+	return ret;
 }
 
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
@@ -526,7 +530,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	ref->in_tree = 1;
 
 	if (is_fstree(ref_root))
-		seq = inc_delayed_seq(delayed_refs);
+		seq = btrfs_inc_tree_mod_seq(fs_info);
 	ref->seq = seq;
 
 	full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -585,7 +589,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	ref->in_tree = 1;
 
 	if (is_fstree(ref_root))
-		seq = inc_delayed_seq(delayed_refs);
+		seq = btrfs_inc_tree_mod_seq(fs_info);
 	ref->seq = seq;
 
 	full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -659,8 +663,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 				   num_bytes, parent, ref_root, level, action,
 				   for_cow);
 	if (!is_fstree(ref_root) &&
-	    waitqueue_active(&delayed_refs->seq_wait))
-		wake_up(&delayed_refs->seq_wait);
+	    waitqueue_active(&fs_info->tree_mod_seq_wait))
+		wake_up(&fs_info->tree_mod_seq_wait);
 	spin_unlock(&delayed_refs->lock);
 
 	return 0;
@@ -708,8 +712,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 				   num_bytes, parent, ref_root, owner, offset,
 				   action, for_cow);
 	if (!is_fstree(ref_root) &&
-	    waitqueue_active(&delayed_refs->seq_wait))
-		wake_up(&delayed_refs->seq_wait);
+	    waitqueue_active(&fs_info->tree_mod_seq_wait))
+		wake_up(&fs_info->tree_mod_seq_wait);
 	spin_unlock(&delayed_refs->lock);
 
 	return 0;
@@ -736,8 +740,8 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
 				   extent_op->is_data);
 
-	if (waitqueue_active(&delayed_refs->seq_wait))
-		wake_up(&delayed_refs->seq_wait);
+	if (waitqueue_active(&fs_info->tree_mod_seq_wait))
+		wake_up(&fs_info->tree_mod_seq_wait);
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 413927fb9957..2b5cb27f9861 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -139,26 +139,6 @@ struct btrfs_delayed_ref_root {
 	int flushing;
 
 	u64 run_delayed_start;
-
-	/*
-	 * seq number of delayed refs. We need to know if a backref was being
-	 * added before the currently processed ref or afterwards.
-	 */
-	u64 seq;
-
-	/*
-	 * seq_list holds a list of all seq numbers that are currently being
-	 * added to the list. While walking backrefs (btrfs_find_all_roots,
-	 * qgroups), which might take some time, no newer ref must be processed,
-	 * as it might influence the outcome of the walk.
-	 */
-	struct list_head seq_head;
-
-	/*
-	 * when the only refs we have in the list must not be processed, we want
-	 * to wait for more refs to show up or for the end of backref walking.
-	 */
-	wait_queue_head_t seq_wait;
 };
 
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -195,33 +175,8 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
 			   struct list_head *cluster, u64 search_start);
 
-static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
-{
-	assert_spin_locked(&delayed_refs->lock);
-	++delayed_refs->seq;
-	return delayed_refs->seq;
-}
-
-static inline void
-btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
-		      struct seq_list *elem)
-{
-	assert_spin_locked(&delayed_refs->lock);
-	elem->seq = delayed_refs->seq;
-	list_add_tail(&elem->list, &delayed_refs->seq_head);
-}
-
-static inline void
-btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
-		      struct seq_list *elem)
-{
-	spin_lock(&delayed_refs->lock);
-	list_del(&elem->list);
-	wake_up(&delayed_refs->seq_wait);
-	spin_unlock(&delayed_refs->lock);
-}
-
-int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+			    struct btrfs_delayed_ref_root *delayed_refs,
 			    u64 seq);
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8cc47103a32e..19a39e10d6f5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1944,6 +1944,8 @@ int open_ctree(struct super_block *sb,
 	fs_info->free_chunk_space = 0;
 	fs_info->tree_mod_log = RB_ROOT;
 
+	init_waitqueue_head(&fs_info->tree_mod_seq_wait);
+
 	/* readahead state */
 	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
 	spin_lock_init(&fs_info->reada_lock);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6e1d36702ff7..94ce79f76e5f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2217,6 +2217,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_ref_head *locked_ref = NULL;
 	struct btrfs_delayed_extent_op *extent_op;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 	int count = 0;
 	int must_insert_reserved = 0;
@@ -2255,7 +2256,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		ref = select_delayed_ref(locked_ref);
 
 		if (ref && ref->seq &&
-		    btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
+		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
 			/*
 			 * there are still refs with lower seq numbers in the
 			 * process of being added. Don't run this ref yet.
@@ -2337,7 +2338,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		}
 
 next:
-		do_chunk_alloc(trans, root->fs_info->extent_root,
+		do_chunk_alloc(trans, fs_info->extent_root,
 			       2 * 1024 * 1024,
 			       btrfs_get_alloc_profile(root, 0),
 			       CHUNK_ALLOC_NO_FORCE);
@@ -2347,18 +2348,19 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 	return count;
 }
 
-static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
+static void wait_for_more_refs(struct btrfs_fs_info *fs_info,
+			       struct btrfs_delayed_ref_root *delayed_refs,
 			       unsigned long num_refs,
 			       struct list_head *first_seq)
 {
 	spin_unlock(&delayed_refs->lock);
 	pr_debug("waiting for more refs (num %ld, first %p)\n",
 		 num_refs, first_seq);
-	wait_event(delayed_refs->seq_wait,
+	wait_event(fs_info->tree_mod_seq_wait,
 		   num_refs != delayed_refs->num_entries ||
-		   delayed_refs->seq_head.next != first_seq);
+		   fs_info->tree_mod_seq_list.next != first_seq);
 	pr_debug("done waiting for more refs (num %ld, first %p)\n",
-		 delayed_refs->num_entries, delayed_refs->seq_head.next);
+		 delayed_refs->num_entries, fs_info->tree_mod_seq_list.next);
 	spin_lock(&delayed_refs->lock);
 }
 
@@ -2403,6 +2405,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 again:
 	consider_waiting = 0;
 	spin_lock(&delayed_refs->lock);
+
 	if (count == 0) {
 		count = delayed_refs->num_entries * 2;
 		run_most = 1;
@@ -2437,7 +2440,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 				num_refs = delayed_refs->num_entries;
 				first_seq = root->fs_info->tree_mod_seq_list.next;
 			} else {
-				wait_for_more_refs(delayed_refs,
+				wait_for_more_refs(root->fs_info, delayed_refs,
 						   num_refs, first_seq);
 				/*
 				 * after waiting, things have changed. we
@@ -5190,8 +5193,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	rb_erase(&head->node.rb_node, &delayed_refs->root);
 
 	delayed_refs->num_entries--;
-	if (waitqueue_active(&delayed_refs->seq_wait))
-		wake_up(&delayed_refs->seq_wait);
+	if (waitqueue_active(&root->fs_info->tree_mod_seq_wait))
+		wake_up(&root->fs_info->tree_mod_seq_wait);
 
 	/*
 	 * we don't take a ref on the node because we're removing it from the
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b72b068183ec..621c8dc48fb6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -38,7 +38,6 @@ void put_transaction(struct btrfs_transaction *transaction)
 	if (atomic_dec_and_test(&transaction->use_count)) {
 		BUG_ON(!list_empty(&transaction->list));
 		WARN_ON(transaction->delayed_refs.root.rb_node);
-		WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
 		memset(transaction, 0, sizeof(*transaction));
 		kmem_cache_free(btrfs_transaction_cachep, transaction);
 	}
@@ -126,7 +125,6 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
 	cur_trans->delayed_refs.num_heads = 0;
 	cur_trans->delayed_refs.flushing = 0;
 	cur_trans->delayed_refs.run_delayed_start = 0;
-	cur_trans->delayed_refs.seq = 1;
 
 	/*
 	 * although the tree mod log is per file system and not per transaction,
@@ -145,10 +143,8 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
 	}
 	atomic_set(&fs_info->tree_mod_seq, 0);
 
-	init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
 	spin_lock_init(&cur_trans->commit_lock);
 	spin_lock_init(&cur_trans->delayed_refs.lock);
-	INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
 
 	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);

From 630dc772ea51bca3ec6fac609f450cbe0cafd1d6 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Tue, 13 Sep 2011 11:06:07 +0200
Subject: [PATCH 03/15] Btrfs: qgroup on-disk format

Not all features are in use by the current version
and thus may change in the future.

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/ctree.h | 136 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8f8dc46f44e7..33088b0dbf3f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -91,6 +91,9 @@ struct btrfs_ordered_sum;
 /* for storing balance parameters in the root tree */
 #define BTRFS_BALANCE_OBJECTID -4ULL
 
+/* holds quota configuration and tracking */
+#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
+
 /* orhpan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
@@ -883,6 +886,72 @@ struct btrfs_block_group_item {
 	__le64 flags;
 } __attribute__ ((__packed__));
 
+/*
+ * is subvolume quota turned on?
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_ON		(1ULL << 0)
+/*
+ * SCANNING is set during the initialization phase
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_SCANNING	(1ULL << 1)
+/*
+ * Some qgroup entries are known to be out of date,
+ * either because the configuration has changed in a way that
+ * makes a rescan necessary, or because the fs has been mounted
+ * with a non-qgroup-aware version.
+ * Turning qouta off and on again makes it inconsistent, too.
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT	(1ULL << 2)
+
+#define BTRFS_QGROUP_STATUS_VERSION        1
+
+struct btrfs_qgroup_status_item {
+	__le64 version;
+	/*
+	 * the generation is updated during every commit. As older
+	 * versions of btrfs are not aware of qgroups, it will be
+	 * possible to detect inconsistencies by checking the
+	 * generation on mount time
+	 */
+	__le64 generation;
+
+	/* flag definitions see above */
+	__le64 flags;
+
+	/*
+	 * only used during scanning to record the progress
+	 * of the scan. It contains a logical address
+	 */
+	__le64 scan;
+} __attribute__ ((__packed__));
+
+struct btrfs_qgroup_info_item {
+	__le64 generation;
+	__le64 rfer;
+	__le64 rfer_cmpr;
+	__le64 excl;
+	__le64 excl_cmpr;
+} __attribute__ ((__packed__));
+
+/* flags definition for qgroup limits */
+#define BTRFS_QGROUP_LIMIT_MAX_RFER	(1ULL << 0)
+#define BTRFS_QGROUP_LIMIT_MAX_EXCL	(1ULL << 1)
+#define BTRFS_QGROUP_LIMIT_RSV_RFER	(1ULL << 2)
+#define BTRFS_QGROUP_LIMIT_RSV_EXCL	(1ULL << 3)
+#define BTRFS_QGROUP_LIMIT_RFER_CMPR	(1ULL << 4)
+#define BTRFS_QGROUP_LIMIT_EXCL_CMPR	(1ULL << 5)
+
+struct btrfs_qgroup_limit_item {
+	/*
+	 * only updated when any of the other values change
+	 */
+	__le64 flags;
+	__le64 max_rfer;
+	__le64 max_excl;
+	__le64 rsv_rfer;
+	__le64 rsv_excl;
+} __attribute__ ((__packed__));
+
 struct btrfs_space_info {
 	u64 flags;
 
@@ -1534,6 +1603,30 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_DEV_ITEM_KEY	216
 #define BTRFS_CHUNK_ITEM_KEY	228
 
+/*
+ * Records the overall state of the qgroups.
+ * There's only one instance of this key present,
+ * (0, BTRFS_QGROUP_STATUS_KEY, 0)
+ */
+#define BTRFS_QGROUP_STATUS_KEY         240
+/*
+ * Records the currently used space of the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_INFO_KEY           242
+/*
+ * Contains the user configured limits for the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_LIMIT_KEY          244
+/*
+ * Records the child-parent relationship of qgroups. For
+ * each relation, 2 keys are present:
+ * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
+ * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
+ */
+#define BTRFS_QGROUP_RELATION_KEY       246
+
 #define BTRFS_BALANCE_ITEM_KEY	248
 
 /*
@@ -2474,6 +2567,49 @@ static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
 			    sizeof(val));
 }
 
+/* btrfs_qgroup_status_item */
+BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
+		   generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
+		   version, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
+		   flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item,
+		   scan, 64);
+
+/* btrfs_qgroup_info_item */
+BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
+		   generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
+		   rfer_cmpr, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
+		   excl_cmpr, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
+			 struct btrfs_qgroup_info_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
+			 rfer, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
+			 struct btrfs_qgroup_info_item, rfer_cmpr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
+			 excl, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
+			 struct btrfs_qgroup_info_item, excl_cmpr, 64);
+
+/* btrfs_qgroup_limit_item */
+BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item,
+		   flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
+		   max_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
+		   max_excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
+		   rsv_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
+		   rsv_excl, 64);
+
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 {
 	return sb->s_fs_info;

From 2f38b3e1900634e64a186873b3388b1bf85dabc0 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Tue, 13 Sep 2011 11:18:10 +0200
Subject: [PATCH 04/15] Btrfs: add helper for tree enumeration

Often no exact match is wanted but just the next lower or
higher item. There's a lot of duplicated code throughout
btrfs to deal with the corner cases. This patch adds a
helper function that can facilitate searching.

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/ctree.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ctree.h |  3 ++
 2 files changed, 75 insertions(+)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index bef68ab32204..fb21431fe4e0 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2788,6 +2788,78 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
 	return ret;
 }
 
+/*
+ * helper to use instead of search slot if no exact match is needed but
+ * instead the next or previous item should be returned.
+ * When find_higher is true, the next higher item is returned, the next lower
+ * otherwise.
+ * When return_any and find_higher are both true, and no higher item is found,
+ * return the next lower instead.
+ * When return_any is true and find_higher is false, and no lower item is found,
+ * return the next higher instead.
+ * It returns 0 if any item is found, 1 if none is found (tree empty), and
+ * < 0 on error
+ */
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+			       struct btrfs_key *key, struct btrfs_path *p,
+			       int find_higher, int return_any)
+{
+	int ret;
+	struct extent_buffer *leaf;
+
+again:
+	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
+	if (ret <= 0)
+		return ret;
+	/*
+	 * a return value of 1 means the path is at the position where the
+	 * item should be inserted. Normally this is the next bigger item,
+	 * but in case the previous item is the last in a leaf, path points
+	 * to the first free slot in the previous leaf, i.e. at an invalid
+	 * item.
+	 */
+	leaf = p->nodes[0];
+
+	if (find_higher) {
+		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, p);
+			if (ret <= 0)
+				return ret;
+			if (!return_any)
+				return 1;
+			/*
+			 * no higher item found, return the next
+			 * lower instead
+			 */
+			return_any = 0;
+			find_higher = 0;
+			btrfs_release_path(p);
+			goto again;
+		}
+	} else {
+		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
+			/* we're sitting on an invalid slot */
+			if (p->slots[0] == 0) {
+				ret = btrfs_prev_leaf(root, p);
+				if (ret <= 0)
+					return ret;
+				if (!return_any)
+					return 1;
+				/*
+				 * no lower item found, return the next
+				 * higher instead
+				 */
+				return_any = 0;
+				find_higher = 1;
+				btrfs_release_path(p);
+				goto again;
+			}
+			--p->slots[0];
+		}
+	}
+	return 0;
+}
+
 /*
  * adjust the pointers going up the tree, starting at level
  * making sure the right key of each node is points to 'key'.
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 33088b0dbf3f..27cf995564ed 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2856,6 +2856,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      ins_len, int cow);
 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
 			  struct btrfs_path *p, u64 time_seq);
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+			       struct btrfs_key *key, struct btrfs_path *p,
+			       int find_higher, int return_any);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
 		       int start_slot, int cache_only, u64 *last_ret,

From d13603ef6e14a12cd65a6975e8117c0fea7c7ddf Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Tue, 13 Sep 2011 11:40:09 +0200
Subject: [PATCH 05/15] Btrfs: check the root passed to btrfs_end_transaction

This patch only add a consistancy check to validate that the
same root is passed to start_transaction and end_transaction.
Subvolume quota depends on this.

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/transaction.c | 6 ++++++
 fs/btrfs/transaction.h | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 621c8dc48fb6..23cbda0685b8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -345,6 +345,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	h->transaction = cur_trans;
 	h->blocks_used = 0;
 	h->bytes_reserved = 0;
+	h->root = root;
 	h->delayed_ref_updates = 0;
 	h->use_count = 1;
 	h->block_rsv = NULL;
@@ -511,6 +512,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
+	/*
+	 * the same root has to be passed to start_transaction and
+	 * end_transaction. Subvolume quota depends on this.
+	 */
+	WARN_ON(trans->root != root);
 	while (count < 2) {
 		unsigned long cur = trans->delayed_ref_updates;
 		trans->delayed_ref_updates = 0;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index fe27379e368b..010729446e13 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -57,6 +57,12 @@ struct btrfs_trans_handle {
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_block_rsv *orig_rsv;
 	int aborted;
+	/*
+	 * this root is only needed to validate that the root passed to
+	 * start_transaction is the same as the one passed to end_transaction.
+	 * Subvolume quota depends on this
+	 */
+	struct btrfs_root *root;
 };
 
 struct btrfs_pending_snapshot {

From 20897f5c86b9d2b77baea1d48eda7fa4ac217279 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Tue, 13 Sep 2011 12:44:20 +0200
Subject: [PATCH 06/15] Btrfs: added helper to create new trees

This creates a brand new tree. Will be used to create
the quota tree.

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/disk-io.c | 78 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/disk-io.h |  6 ++++
 2 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 19a39e10d6f5..6fc243eccffa 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1225,6 +1225,82 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
 	return root;
 }
 
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+				     struct btrfs_fs_info *fs_info,
+				     u64 objectid)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	int ret = 0;
+	u64 bytenr;
+
+	root = btrfs_alloc_root(fs_info);
+	if (!root)
+		return ERR_PTR(-ENOMEM);
+
+	__setup_root(tree_root->nodesize, tree_root->leafsize,
+		     tree_root->sectorsize, tree_root->stripesize,
+		     root, fs_info, objectid);
+	root->root_key.objectid = objectid;
+	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root->root_key.offset = 0;
+
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+				      0, objectid, NULL, 0, 0, 0);
+	if (IS_ERR(leaf)) {
+		ret = PTR_ERR(leaf);
+		goto fail;
+	}
+
+	bytenr = leaf->start;
+	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+	btrfs_set_header_bytenr(leaf, leaf->start);
+	btrfs_set_header_generation(leaf, trans->transid);
+	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+	btrfs_set_header_owner(leaf, objectid);
+	root->node = leaf;
+
+	write_extent_buffer(leaf, fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(leaf),
+			    BTRFS_FSID_SIZE);
+	write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
+			    (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
+			    BTRFS_UUID_SIZE);
+	btrfs_mark_buffer_dirty(leaf);
+
+	root->commit_root = btrfs_root_node(root);
+	root->track_dirty = 1;
+
+
+	root->root_item.flags = 0;
+	root->root_item.byte_limit = 0;
+	btrfs_set_root_bytenr(&root->root_item, leaf->start);
+	btrfs_set_root_generation(&root->root_item, trans->transid);
+	btrfs_set_root_level(&root->root_item, 0);
+	btrfs_set_root_refs(&root->root_item, 1);
+	btrfs_set_root_used(&root->root_item, leaf->len);
+	btrfs_set_root_last_snapshot(&root->root_item, 0);
+	btrfs_set_root_dirid(&root->root_item, 0);
+	root->root_item.drop_level = 0;
+
+	key.objectid = objectid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = 0;
+	ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
+	if (ret)
+		goto fail;
+
+	btrfs_tree_unlock(leaf);
+
+fail:
+	if (ret)
+		return ERR_PTR(ret);
+
+	return root;
+}
+
 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info)
 {
@@ -3260,7 +3336,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 }
 
-static int btree_lock_page_hook(struct page *page, void *data,
+int btree_lock_page_hook(struct page *page, void *data,
 				void (*flush_fn)(void *))
 {
 	struct inode *inode = page->mapping->host;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 05b3fab39f7e..95e147eea239 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -89,6 +89,12 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 int btrfs_cleanup_transaction(struct btrfs_root *root);
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
 				  struct btrfs_root *root);
+void btrfs_abort_devices(struct btrfs_root *root);
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+				     struct btrfs_fs_info *fs_info,
+				     u64 objectid);
+int btree_lock_page_hook(struct page *page, void *data,
+				void (*flush_fn)(void *));
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);

From 416ac51da90e98daaac17e1f359a6c5591f7f5bd Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Tue, 13 Sep 2011 12:56:09 +0200
Subject: [PATCH 07/15] Btrfs: qgroup state and initialization

Add state to fs_info.

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/ctree.h   | 24 ++++++++++++++++++++++++
 fs/btrfs/disk-io.c |  7 +++++++
 2 files changed, 31 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 27cf995564ed..a5269d4a164f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1120,6 +1120,7 @@ struct btrfs_fs_info {
 	struct btrfs_root *dev_root;
 	struct btrfs_root *fs_root;
 	struct btrfs_root *csum_root;
+	struct btrfs_root *quota_root;
 
 	/* the log root tree is a directory of all the other log roots */
 	struct btrfs_root *log_root_tree;
@@ -1374,6 +1375,29 @@ struct btrfs_fs_info {
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	u32 check_integrity_print_mask;
 #endif
+	/*
+	 * quota information
+	 */
+	unsigned int quota_enabled:1;
+
+	/*
+	 * quota_enabled only changes state after a commit. This holds the
+	 * next state.
+	 */
+	unsigned int pending_quota_state:1;
+
+	/* is qgroup tracking in a consistent state? */
+	u64 qgroup_flags;
+
+	/* holds configuration and tracking. Protected by qgroup_lock */
+	struct rb_root qgroup_tree;
+	spinlock_t qgroup_lock;
+
+	/* list of dirty qgroups to be written at next commit */
+	struct list_head dirty_qgroups;
+
+	/* used by btrfs_qgroup_record_ref for an efficient tree traversal */
+	u64 qgroup_seq;
 
 	/* filesystem state */
 	u64 fs_state;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6fc243eccffa..eca054974425 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2110,6 +2110,13 @@ int open_ctree(struct super_block *sb,
 	init_rwsem(&fs_info->cleanup_work_sem);
 	init_rwsem(&fs_info->subvol_sem);
 
+	spin_lock_init(&fs_info->qgroup_lock);
+	fs_info->qgroup_tree = RB_ROOT;
+	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+	fs_info->qgroup_seq = 1;
+	fs_info->quota_enabled = 0;
+	fs_info->pending_quota_state = 0;
+
 	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
 	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
 

From 709c0486b9fe9586736b108b7233bbce0300cfa5 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Mon, 12 Sep 2011 12:22:57 +0200
Subject: [PATCH 08/15] Btrfs: Test code to change the order of delayed-ref
 processing

Normally delayed refs get processed in ascending bytenr order. This
correlates in most cases to the order added. To expose dependencies
on this order, we start to process the tree in the middle instead of
the beginning.
This code is only effective when SCRAMBLE_DELAYED_REFS is defined.

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/extent-tree.c | 49 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 94ce79f76e5f..b13f1fbc3733 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,6 +34,8 @@
 #include "locking.h"
 #include "free-space-cache.h"
 
+#undef SCRAMBLE_DELAYED_REFS
+
 /*
  * control flags for do_chunk_alloc's force field
  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
@@ -2364,6 +2366,49 @@ static void wait_for_more_refs(struct btrfs_fs_info *fs_info,
 	spin_lock(&delayed_refs->lock);
 }
 
+#ifdef SCRAMBLE_DELAYED_REFS
+/*
+ * Normally delayed refs get processed in ascending bytenr order. This
+ * correlates in most cases to the order added. To expose dependencies on this
+ * order, we start to process the tree in the middle instead of the beginning
+ */
+static u64 find_middle(struct rb_root *root)
+{
+	struct rb_node *n = root->rb_node;
+	struct btrfs_delayed_ref_node *entry;
+	int alt = 1;
+	u64 middle;
+	u64 first = 0, last = 0;
+
+	n = rb_first(root);
+	if (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+		first = entry->bytenr;
+	}
+	n = rb_last(root);
+	if (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+		last = entry->bytenr;
+	}
+	n = root->rb_node;
+
+	while (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+		WARN_ON(!entry->in_tree);
+
+		middle = entry->bytenr;
+
+		if (alt)
+			n = n->rb_left;
+		else
+			n = n->rb_right;
+
+		alt = 1 - alt;
+	}
+	return middle;
+}
+#endif
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
@@ -2406,6 +2451,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	consider_waiting = 0;
 	spin_lock(&delayed_refs->lock);
 
+#ifdef SCRAMBLE_DELAYED_REFS
+	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
+#endif
+
 	if (count == 0) {
 		count = delayed_refs->num_entries * 2;
 		run_most = 1;

From bed92eae26ccf280d1a2168b7509447b56675a27 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Thu, 28 Jun 2012 18:03:02 +0200
Subject: [PATCH 09/15] Btrfs: qgroup implementation and prototypes

Signed-off-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/Makefile      |    2 +-
 fs/btrfs/ctree.h       |   46 ++
 fs/btrfs/extent-tree.c |   34 +
 fs/btrfs/ioctl.h       |   24 +
 fs/btrfs/qgroup.c      | 1571 ++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/transaction.c |    2 +
 fs/btrfs/transaction.h |    3 +
 7 files changed, 1681 insertions(+), 1 deletion(-)
 create mode 100644 fs/btrfs/qgroup.c

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 0c4fa2befae7..0bc4d3a10a5f 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o
+	   reada.o backref.o ulist.o qgroup.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a5269d4a164f..ccba9b684c96 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2830,6 +2830,8 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
 
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+					 struct btrfs_fs_info *fs_info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -3339,6 +3341,50 @@ void btrfs_reada_detach(void *handle);
 int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
 			 u64 start, int err);
 
+/* qgroup.c */
+struct qgroup_update {
+	struct list_head list;
+	struct btrfs_delayed_ref_node *node;
+	struct btrfs_delayed_extent_op *extent_op;
+};
+
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info);
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info);
+int btrfs_quota_rescan(struct btrfs_fs_info *fs_info);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info, u64 qgroupid,
+			char *name);
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info, u64 qgroupid);
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info, u64 qgroupid,
+		       struct btrfs_qgroup_limit *limit);
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
+struct btrfs_delayed_extent_op;
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_delayed_ref_node *node,
+			    struct btrfs_delayed_extent_op *extent_op);
+int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info,
+			     struct btrfs_delayed_ref_node *node,
+			     struct btrfs_delayed_extent_op *extent_op);
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+		      struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+			 struct btrfs_qgroup_inherit *inherit);
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
+
 static inline int is_fstree(u64 rootid)
 {
 	if (rootid == BTRFS_FS_TREE_OBJECTID ||
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b13f1fbc3733..1a63b830846d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2409,6 +2409,40 @@ static u64 find_middle(struct rb_root *root)
 }
 #endif
 
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+					 struct btrfs_fs_info *fs_info)
+{
+	struct qgroup_update *qgroup_update;
+	int ret = 0;
+
+	if (list_empty(&trans->qgroup_ref_list) !=
+	    !trans->delayed_ref_elem.seq) {
+		/* list without seq or seq without list */
+		printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
+			list_empty(&trans->qgroup_ref_list) ? "" : " not",
+			trans->delayed_ref_elem.seq);
+		BUG();
+	}
+
+	if (!trans->delayed_ref_elem.seq)
+		return 0;
+
+	while (!list_empty(&trans->qgroup_ref_list)) {
+		qgroup_update = list_first_entry(&trans->qgroup_ref_list,
+						 struct qgroup_update, list);
+		list_del(&qgroup_update->list);
+		if (!ret)
+			ret = btrfs_qgroup_account_ref(
+					trans, fs_info, qgroup_update->node,
+					qgroup_update->extent_op);
+		kfree(qgroup_update);
+	}
+
+	btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+
+	return ret;
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index e440aa653c30..a8a2230f4c5c 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -35,6 +35,30 @@ struct btrfs_ioctl_vol_args {
 #define BTRFS_FSID_SIZE 16
 #define BTRFS_UUID_SIZE 16
 
+#define BTRFS_QGROUP_INHERIT_SET_LIMITS	(1ULL << 0)
+
+struct btrfs_qgroup_limit {
+	__u64	flags;
+	__u64	max_rfer;
+	__u64	max_excl;
+	__u64	rsv_rfer;
+	__u64	rsv_excl;
+};
+
+struct btrfs_qgroup_inherit {
+	__u64	flags;
+	__u64	num_qgroups;
+	__u64	num_ref_copies;
+	__u64	num_excl_copies;
+	struct btrfs_qgroup_limit lim;
+	__u64	qgroups[0];
+};
+
+struct btrfs_ioctl_qgroup_limit_args {
+	__u64	qgroupid;
+	struct btrfs_qgroup_limit lim;
+};
+
 #define BTRFS_SUBVOL_NAME_MAX 4039
 struct btrfs_ioctl_vol_args_v2 {
 	__s64 fd;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
new file mode 100644
index 000000000000..bc424ae5a81a
--- /dev/null
+++ b/fs/btrfs/qgroup.c
@@ -0,0 +1,1571 @@
+/*
+ * Copyright (C) 2011 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "ulist.h"
+#include "ioctl.h"
+#include "backref.h"
+
+/* TODO XXX FIXME
+ *  - subvol delete -> delete when ref goes to 0? delete limits also?
+ *  - reorganize keys
+ *  - compressed
+ *  - sync
+ *  - rescan
+ *  - copy also limits on subvol creation
+ *  - limit
+ *  - caches fuer ulists
+ *  - performance benchmarks
+ *  - check all ioctl parameters
+ */
+
+/*
+ * one struct for each qgroup, organized in fs_info->qgroup_tree.
+ */
+struct btrfs_qgroup {
+	u64 qgroupid;
+
+	/*
+	 * state
+	 */
+	u64 rfer;	/* referenced */
+	u64 rfer_cmpr;	/* referenced compressed */
+	u64 excl;	/* exclusive */
+	u64 excl_cmpr;	/* exclusive compressed */
+
+	/*
+	 * limits
+	 */
+	u64 lim_flags;	/* which limits are set */
+	u64 max_rfer;
+	u64 max_excl;
+	u64 rsv_rfer;
+	u64 rsv_excl;
+
+	/*
+	 * reservation tracking
+	 */
+	u64 reserved;
+
+	/*
+	 * lists
+	 */
+	struct list_head groups;  /* groups this group is member of */
+	struct list_head members; /* groups that are members of this group */
+	struct list_head dirty;   /* dirty groups */
+	struct rb_node node;	  /* tree of qgroups */
+
+	/*
+	 * temp variables for accounting operations
+	 */
+	u64 tag;
+	u64 refcnt;
+};
+
+/*
+ * glue structure to represent the relations between qgroups.
+ */
+struct btrfs_qgroup_list {
+	struct list_head next_group;
+	struct list_head next_member;
+	struct btrfs_qgroup *group;
+	struct btrfs_qgroup *member;
+};
+
+/* must be called with qgroup_lock held */
+static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
+					   u64 qgroupid)
+{
+	struct rb_node *n = fs_info->qgroup_tree.rb_node;
+	struct btrfs_qgroup *qgroup;
+
+	while (n) {
+		qgroup = rb_entry(n, struct btrfs_qgroup, node);
+		if (qgroup->qgroupid < qgroupid)
+			n = n->rb_left;
+		else if (qgroup->qgroupid > qgroupid)
+			n = n->rb_right;
+		else
+			return qgroup;
+	}
+	return NULL;
+}
+
+/* must be called with qgroup_lock held */
+static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
+					  u64 qgroupid)
+{
+	struct rb_node **p = &fs_info->qgroup_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct btrfs_qgroup *qgroup;
+
+	while (*p) {
+		parent = *p;
+		qgroup = rb_entry(parent, struct btrfs_qgroup, node);
+
+		if (qgroup->qgroupid < qgroupid)
+			p = &(*p)->rb_left;
+		else if (qgroup->qgroupid > qgroupid)
+			p = &(*p)->rb_right;
+		else
+			return qgroup;
+	}
+
+	qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
+	if (!qgroup)
+		return ERR_PTR(-ENOMEM);
+
+	qgroup->qgroupid = qgroupid;
+	INIT_LIST_HEAD(&qgroup->groups);
+	INIT_LIST_HEAD(&qgroup->members);
+	INIT_LIST_HEAD(&qgroup->dirty);
+
+	rb_link_node(&qgroup->node, parent, p);
+	rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
+
+	return qgroup;
+}
+
+/* must be called with qgroup_lock held */
+static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
+	struct btrfs_qgroup_list *list;
+
+	if (!qgroup)
+		return -ENOENT;
+
+	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
+	list_del(&qgroup->dirty);
+
+	while (!list_empty(&qgroup->groups)) {
+		list = list_first_entry(&qgroup->groups,
+					struct btrfs_qgroup_list, next_group);
+		list_del(&list->next_group);
+		list_del(&list->next_member);
+		kfree(list);
+	}
+
+	while (!list_empty(&qgroup->members)) {
+		list = list_first_entry(&qgroup->members,
+					struct btrfs_qgroup_list, next_member);
+		list_del(&list->next_group);
+		list_del(&list->next_member);
+		kfree(list);
+	}
+	kfree(qgroup);
+
+	return 0;
+}
+
+/* must be called with qgroup_lock held */
+static int add_relation_rb(struct btrfs_fs_info *fs_info,
+			   u64 memberid, u64 parentid)
+{
+	struct btrfs_qgroup *member;
+	struct btrfs_qgroup *parent;
+	struct btrfs_qgroup_list *list;
+
+	member = find_qgroup_rb(fs_info, memberid);
+	parent = find_qgroup_rb(fs_info, parentid);
+	if (!member || !parent)
+		return -ENOENT;
+
+	list = kzalloc(sizeof(*list), GFP_ATOMIC);
+	if (!list)
+		return -ENOMEM;
+
+	list->group = parent;
+	list->member = member;
+	list_add_tail(&list->next_group, &member->groups);
+	list_add_tail(&list->next_member, &parent->members);
+
+	return 0;
+}
+
+/* must be called with qgroup_lock held */
+static int del_relation_rb(struct btrfs_fs_info *fs_info,
+			   u64 memberid, u64 parentid)
+{
+	struct btrfs_qgroup *member;
+	struct btrfs_qgroup *parent;
+	struct btrfs_qgroup_list *list;
+
+	member = find_qgroup_rb(fs_info, memberid);
+	parent = find_qgroup_rb(fs_info, parentid);
+	if (!member || !parent)
+		return -ENOENT;
+
+	list_for_each_entry(list, &member->groups, next_group) {
+		if (list->group == parent) {
+			list_del(&list->next_group);
+			list_del(&list->next_member);
+			kfree(list);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+/*
+ * The full config is read in one go, only called from open_ctree()
+ * It doesn't use any locking, as at this point we're still single-threaded
+ */
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_root *quota_root = fs_info->quota_root;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *l;
+	int slot;
+	int ret = 0;
+	u64 flags = 0;
+
+	if (!fs_info->quota_enabled)
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* default this to quota off, in case no status key is found */
+	fs_info->qgroup_flags = 0;
+
+	/*
+	 * pass 1: read status, all qgroup infos and limits
+	 */
+	key.objectid = 0;
+	key.type = 0;
+	key.offset = 0;
+	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
+	if (ret)
+		goto out;
+
+	while (1) {
+		struct btrfs_qgroup *qgroup;
+
+		slot = path->slots[0];
+		l = path->nodes[0];
+		btrfs_item_key_to_cpu(l, &found_key, slot);
+
+		if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
+			struct btrfs_qgroup_status_item *ptr;
+
+			ptr = btrfs_item_ptr(l, slot,
+					     struct btrfs_qgroup_status_item);
+
+			if (btrfs_qgroup_status_version(l, ptr) !=
+			    BTRFS_QGROUP_STATUS_VERSION) {
+				printk(KERN_ERR
+				 "btrfs: old qgroup version, quota disabled\n");
+				goto out;
+			}
+			if (btrfs_qgroup_status_generation(l, ptr) !=
+			    fs_info->generation) {
+				flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+				printk(KERN_ERR
+					"btrfs: qgroup generation mismatch, "
+					"marked as inconsistent\n");
+			}
+			fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
+									  ptr);
+			/* FIXME read scan element */
+			goto next1;
+		}
+
+		if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
+		    found_key.type != BTRFS_QGROUP_LIMIT_KEY)
+			goto next1;
+
+		qgroup = find_qgroup_rb(fs_info, found_key.offset);
+		if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
+		    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
+			printk(KERN_ERR "btrfs: inconsitent qgroup config\n");
+			flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+		}
+		if (!qgroup) {
+			qgroup = add_qgroup_rb(fs_info, found_key.offset);
+			if (IS_ERR(qgroup)) {
+				ret = PTR_ERR(qgroup);
+				goto out;
+			}
+		}
+		switch (found_key.type) {
+		case BTRFS_QGROUP_INFO_KEY: {
+			struct btrfs_qgroup_info_item *ptr;
+
+			ptr = btrfs_item_ptr(l, slot,
+					     struct btrfs_qgroup_info_item);
+			qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
+			qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
+			qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
+			qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
+			/* generation currently unused */
+			break;
+		}
+		case BTRFS_QGROUP_LIMIT_KEY: {
+			struct btrfs_qgroup_limit_item *ptr;
+
+			ptr = btrfs_item_ptr(l, slot,
+					     struct btrfs_qgroup_limit_item);
+			qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
+			qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
+			qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
+			qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
+			qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
+			break;
+		}
+		}
+next1:
+		ret = btrfs_next_item(quota_root, path);
+		if (ret < 0)
+			goto out;
+		if (ret)
+			break;
+	}
+	btrfs_release_path(path);
+
+	/*
+	 * pass 2: read all qgroup relations
+	 */
+	key.objectid = 0;
+	key.type = BTRFS_QGROUP_RELATION_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
+	if (ret)
+		goto out;
+	while (1) {
+		slot = path->slots[0];
+		l = path->nodes[0];
+		btrfs_item_key_to_cpu(l, &found_key, slot);
+
+		if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
+			goto next2;
+
+		if (found_key.objectid > found_key.offset) {
+			/* parent <- member, not needed to build config */
+			/* FIXME should we omit the key completely? */
+			goto next2;
+		}
+
+		ret = add_relation_rb(fs_info, found_key.objectid,
+				      found_key.offset);
+		if (ret)
+			goto out;
+next2:
+		ret = btrfs_next_item(quota_root, path);
+		if (ret < 0)
+			goto out;
+		if (ret)
+			break;
+	}
+out:
+	fs_info->qgroup_flags |= flags;
+	if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
+		fs_info->quota_enabled = 0;
+		fs_info->pending_quota_state = 0;
+	}
+	btrfs_free_path(path);
+
+	return ret < 0 ? ret : 0;
+}
+
+/*
+ * This is only called from close_ctree() or open_ctree(), both in single-
+ * treaded paths. Clean up the in-memory structures. No locking needed.
+ */
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+	struct rb_node *n;
+	struct btrfs_qgroup *qgroup;
+	struct btrfs_qgroup_list *list;
+
+	while ((n = rb_first(&fs_info->qgroup_tree))) {
+		qgroup = rb_entry(n, struct btrfs_qgroup, node);
+		rb_erase(n, &fs_info->qgroup_tree);
+
+		WARN_ON(!list_empty(&qgroup->dirty));
+
+		while (!list_empty(&qgroup->groups)) {
+			list = list_first_entry(&qgroup->groups,
+						struct btrfs_qgroup_list,
+						next_group);
+			list_del(&list->next_group);
+			list_del(&list->next_member);
+			kfree(list);
+		}
+
+		while (!list_empty(&qgroup->members)) {
+			list = list_first_entry(&qgroup->members,
+						struct btrfs_qgroup_list,
+						next_member);
+			list_del(&list->next_group);
+			list_del(&list->next_member);
+			kfree(list);
+		}
+		kfree(qgroup);
+	}
+}
+
+static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *quota_root,
+				    u64 src, u64 dst)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = src;
+	key.type = BTRFS_QGROUP_RELATION_KEY;
+	key.offset = dst;
+
+	ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
+
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int del_qgroup_relation_item(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *quota_root,
+				    u64 src, u64 dst)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = src;
+	key.type = BTRFS_QGROUP_RELATION_KEY;
+	key.offset = dst;
+
+	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = btrfs_del_item(trans, quota_root, path);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int add_qgroup_item(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *quota_root, u64 qgroupid)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_qgroup_info_item *qgroup_info;
+	struct btrfs_qgroup_limit_item *qgroup_limit;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = 0;
+	key.type = BTRFS_QGROUP_INFO_KEY;
+	key.offset = qgroupid;
+
+	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+				      sizeof(*qgroup_info));
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
+				 struct btrfs_qgroup_info_item);
+	btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
+	btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
+	btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
+	btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
+	btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
+
+	btrfs_mark_buffer_dirty(leaf);
+
+	btrfs_release_path(path);
+
+	key.type = BTRFS_QGROUP_LIMIT_KEY;
+	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+				      sizeof(*qgroup_limit));
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
+				  struct btrfs_qgroup_limit_item);
+	btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
+	btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
+	btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
+	btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
+	btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
+
+	btrfs_mark_buffer_dirty(leaf);
+
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int del_qgroup_item(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *quota_root, u64 qgroupid)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = 0;
+	key.type = BTRFS_QGROUP_INFO_KEY;
+	key.offset = qgroupid;
+	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = btrfs_del_item(trans, quota_root, path);
+	if (ret)
+		goto out;
+
+	btrfs_release_path(path);
+
+	key.type = BTRFS_QGROUP_LIMIT_KEY;
+	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = btrfs_del_item(trans, quota_root, path);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root, u64 qgroupid,
+				    u64 flags, u64 max_rfer, u64 max_excl,
+				    u64 rsv_rfer, u64 rsv_excl)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *l;
+	struct btrfs_qgroup_limit_item *qgroup_limit;
+	int ret;
+	int slot;
+
+	key.objectid = 0;
+	key.type = BTRFS_QGROUP_LIMIT_KEY;
+	key.offset = qgroupid;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+
+	if (ret)
+		goto out;
+
+	l = path->nodes[0];
+	slot = path->slots[0];
+	qgroup_limit = btrfs_item_ptr(l, path->slots[0],
+				      struct btrfs_qgroup_limit_item);
+	btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags);
+	btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer);
+	btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl);
+	btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer);
+	btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl);
+
+	btrfs_mark_buffer_dirty(l);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_qgroup *qgroup)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *l;
+	struct btrfs_qgroup_info_item *qgroup_info;
+	int ret;
+	int slot;
+
+	key.objectid = 0;
+	key.type = BTRFS_QGROUP_INFO_KEY;
+	key.offset = qgroup->qgroupid;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+
+	if (ret)
+		goto out;
+
+	l = path->nodes[0];
+	slot = path->slots[0];
+	qgroup_info = btrfs_item_ptr(l, path->slots[0],
+				 struct btrfs_qgroup_info_item);
+	btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
+	btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
+	btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
+	btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
+	btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
+
+	btrfs_mark_buffer_dirty(l);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
+				     struct btrfs_fs_info *fs_info,
+				    struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *l;
+	struct btrfs_qgroup_status_item *ptr;
+	int ret;
+	int slot;
+
+	key.objectid = 0;
+	key.type = BTRFS_QGROUP_STATUS_KEY;
+	key.offset = 0;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+
+	if (ret)
+		goto out;
+
+	l = path->nodes[0];
+	slot = path->slots[0];
+	ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
+	btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
+	btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
+	/* XXX scan */
+
+	btrfs_mark_buffer_dirty(l);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * called with qgroup_lock held
+ */
+static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret;
+
+	if (!root)
+		return -EINVAL;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	while (1) {
+		key.objectid = 0;
+		key.offset = 0;
+		key.type = 0;
+
+		path->leave_spinning = 1;
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret > 0) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		} else if (ret < 0) {
+			break;
+		}
+
+		ret = btrfs_del_item(trans, root, path);
+		if (ret)
+			goto out;
+		btrfs_release_path(path);
+	}
+	ret = 0;
+out:
+	root->fs_info->pending_quota_state = 0;
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *quota_root;
+	struct btrfs_path *path = NULL;
+	struct btrfs_qgroup_status_item *ptr;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	int ret = 0;
+
+	spin_lock(&fs_info->qgroup_lock);
+	if (fs_info->quota_root) {
+		fs_info->pending_quota_state = 1;
+		spin_unlock(&fs_info->qgroup_lock);
+		goto out;
+	}
+	spin_unlock(&fs_info->qgroup_lock);
+
+	/*
+	 * initially create the quota tree
+	 */
+	quota_root = btrfs_create_tree(trans, fs_info,
+				       BTRFS_QUOTA_TREE_OBJECTID);
+	if (IS_ERR(quota_root)) {
+		ret =  PTR_ERR(quota_root);
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = 0;
+	key.type = BTRFS_QGROUP_STATUS_KEY;
+	key.offset = 0;
+
+	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+				      sizeof(*ptr));
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	ptr = btrfs_item_ptr(leaf, path->slots[0],
+				 struct btrfs_qgroup_status_item);
+	btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
+	btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
+	fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
+				BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+	btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
+	btrfs_set_qgroup_status_scan(leaf, ptr, 0);
+
+	btrfs_mark_buffer_dirty(leaf);
+
+	spin_lock(&fs_info->qgroup_lock);
+	fs_info->quota_root = quota_root;
+	fs_info->pending_quota_state = 1;
+	spin_unlock(&fs_info->qgroup_lock);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *quota_root;
+	int ret = 0;
+
+	spin_lock(&fs_info->qgroup_lock);
+	fs_info->quota_enabled = 0;
+	fs_info->pending_quota_state = 0;
+	quota_root = fs_info->quota_root;
+	fs_info->quota_root = NULL;
+	btrfs_free_qgroup_config(fs_info);
+	spin_unlock(&fs_info->qgroup_lock);
+
+	if (!quota_root)
+		return -EINVAL;
+
+	ret = btrfs_clean_quota_tree(trans, quota_root);
+	if (ret)
+		goto out;
+
+	ret = btrfs_del_root(trans, tree_root, &quota_root->root_key);
+	if (ret)
+		goto out;
+
+	list_del(&quota_root->dirty_list);
+
+	btrfs_tree_lock(quota_root->node);
+	clean_tree_block(trans, tree_root, quota_root->node);
+	btrfs_tree_unlock(quota_root->node);
+	btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
+
+	free_extent_buffer(quota_root->node);
+	free_extent_buffer(quota_root->commit_root);
+	kfree(quota_root);
+out:
+	return ret;
+}
+
+int btrfs_quota_rescan(struct btrfs_fs_info *fs_info)
+{
+	/* FIXME */
+	return 0;
+}
+
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+	struct btrfs_root *quota_root;
+	int ret = 0;
+
+	quota_root = fs_info->quota_root;
+	if (!quota_root)
+		return -EINVAL;
+
+	ret = add_qgroup_relation_item(trans, quota_root, src, dst);
+	if (ret)
+		return ret;
+
+	ret = add_qgroup_relation_item(trans, quota_root, dst, src);
+	if (ret) {
+		del_qgroup_relation_item(trans, quota_root, src, dst);
+		return ret;
+	}
+
+	spin_lock(&fs_info->qgroup_lock);
+	ret = add_relation_rb(quota_root->fs_info, src, dst);
+	spin_unlock(&fs_info->qgroup_lock);
+
+	return ret;
+}
+
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+	struct btrfs_root *quota_root;
+	int ret = 0;
+	int err;
+
+	quota_root = fs_info->quota_root;
+	if (!quota_root)
+		return -EINVAL;
+
+	ret = del_qgroup_relation_item(trans, quota_root, src, dst);
+	err = del_qgroup_relation_item(trans, quota_root, dst, src);
+	if (err && !ret)
+		ret = err;
+
+	spin_lock(&fs_info->qgroup_lock);
+	del_relation_rb(fs_info, src, dst);
+
+	spin_unlock(&fs_info->qgroup_lock);
+
+	return ret;
+}
+
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info, u64 qgroupid, char *name)
+{
+	struct btrfs_root *quota_root;
+	struct btrfs_qgroup *qgroup;
+	int ret = 0;
+
+	quota_root = fs_info->quota_root;
+	if (!quota_root)
+		return -EINVAL;
+
+	ret = add_qgroup_item(trans, quota_root, qgroupid);
+
+	spin_lock(&fs_info->qgroup_lock);
+	qgroup = add_qgroup_rb(fs_info, qgroupid);
+	spin_unlock(&fs_info->qgroup_lock);
+
+	if (IS_ERR(qgroup))
+		ret = PTR_ERR(qgroup);
+
+	return ret;
+}
+
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+	struct btrfs_root *quota_root;
+	int ret = 0;
+
+	quota_root = fs_info->quota_root;
+	if (!quota_root)
+		return -EINVAL;
+
+	ret = del_qgroup_item(trans, quota_root, qgroupid);
+
+	spin_lock(&fs_info->qgroup_lock);
+	del_qgroup_rb(quota_root->fs_info, qgroupid);
+
+	spin_unlock(&fs_info->qgroup_lock);
+
+	return ret;
+}
+
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info, u64 qgroupid,
+		       struct btrfs_qgroup_limit *limit)
+{
+	struct btrfs_root *quota_root = fs_info->quota_root;
+	struct btrfs_qgroup *qgroup;
+	int ret = 0;
+
+	if (!quota_root)
+		return -EINVAL;
+
+	ret = update_qgroup_limit_item(trans, quota_root, qgroupid,
+				       limit->flags, limit->max_rfer,
+				       limit->max_excl, limit->rsv_rfer,
+				       limit->rsv_excl);
+	if (ret) {
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+		printk(KERN_INFO "unable to update quota limit for %llu\n",
+		       (unsigned long long)qgroupid);
+	}
+
+	spin_lock(&fs_info->qgroup_lock);
+
+	qgroup = find_qgroup_rb(fs_info, qgroupid);
+	if (!qgroup) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+	qgroup->lim_flags = limit->flags;
+	qgroup->max_rfer = limit->max_rfer;
+	qgroup->max_excl = limit->max_excl;
+	qgroup->rsv_rfer = limit->rsv_rfer;
+	qgroup->rsv_excl = limit->rsv_excl;
+
+unlock:
+	spin_unlock(&fs_info->qgroup_lock);
+
+	return ret;
+}
+
+static void qgroup_dirty(struct btrfs_fs_info *fs_info,
+			 struct btrfs_qgroup *qgroup)
+{
+	if (list_empty(&qgroup->dirty))
+		list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
+}
+
+/*
+ * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts
+ * the modification into a list that's later used by btrfs_end_transaction to
+ * pass the recorded modifications on to btrfs_qgroup_account_ref.
+ */
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_delayed_ref_node *node,
+			    struct btrfs_delayed_extent_op *extent_op)
+{
+	struct qgroup_update *u;
+
+	BUG_ON(!trans->delayed_ref_elem.seq);
+	u = kmalloc(sizeof(*u), GFP_NOFS);
+	if (!u)
+		return -ENOMEM;
+
+	u->node = node;
+	u->extent_op = extent_op;
+	list_add_tail(&u->list, &trans->qgroup_ref_list);
+
+	return 0;
+}
+
+/*
+ * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
+ * from the fs. First, all roots referencing the extent are searched, and
+ * then the space is accounted accordingly to the different roots. The
+ * accounting algorithm works in 3 steps documented inline.
+ */
+int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info,
+			     struct btrfs_delayed_ref_node *node,
+			     struct btrfs_delayed_extent_op *extent_op)
+{
+	struct btrfs_key ins;
+	struct btrfs_root *quota_root;
+	u64 ref_root;
+	struct btrfs_qgroup *qgroup;
+	struct ulist_node *unode;
+	struct ulist *roots = NULL;
+	struct ulist *tmp = NULL;
+	struct ulist_iterator uiter;
+	u64 seq;
+	int ret = 0;
+	int sgn;
+
+	if (!fs_info->quota_enabled)
+		return 0;
+
+	BUG_ON(!fs_info->quota_root);
+
+	ins.objectid = node->bytenr;
+	ins.offset = node->num_bytes;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
+	    node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
+		struct btrfs_delayed_tree_ref *ref;
+		ref = btrfs_delayed_node_to_tree_ref(node);
+		ref_root = ref->root;
+	} else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+		   node->type == BTRFS_SHARED_DATA_REF_KEY) {
+		struct btrfs_delayed_data_ref *ref;
+		ref = btrfs_delayed_node_to_data_ref(node);
+		ref_root = ref->root;
+	} else {
+		BUG();
+	}
+
+	if (!is_fstree(ref_root)) {
+		/*
+		 * non-fs-trees are not being accounted
+		 */
+		return 0;
+	}
+
+	switch (node->action) {
+	case BTRFS_ADD_DELAYED_REF:
+	case BTRFS_ADD_DELAYED_EXTENT:
+		sgn = 1;
+		break;
+	case BTRFS_DROP_DELAYED_REF:
+		sgn = -1;
+		break;
+	case BTRFS_UPDATE_DELAYED_HEAD:
+		return 0;
+	default:
+		BUG();
+	}
+
+	/*
+	 * the delayed ref sequence number we pass depends on the direction of
+	 * the operation. for add operations, we pass (node->seq - 1) to skip
+	 * the delayed ref's current sequence number, because we need the state
+	 * of the tree before the add operation. for delete operations, we pass
+	 * (node->seq) to include the delayed ref's current sequence number,
+	 * because we need the state of the tree after the delete operation.
+	 */
+	ret = btrfs_find_all_roots(trans, fs_info, node->bytenr,
+				   sgn > 0 ? node->seq - 1 : node->seq, &roots);
+	if (ret < 0)
+		goto out;
+
+	spin_lock(&fs_info->qgroup_lock);
+	quota_root = fs_info->quota_root;
+	if (!quota_root)
+		goto unlock;
+
+	qgroup = find_qgroup_rb(fs_info, ref_root);
+	if (!qgroup)
+		goto unlock;
+
+	/*
+	 * step 1: for each old ref, visit all nodes once and inc refcnt
+	 */
+	tmp = ulist_alloc(GFP_ATOMIC);
+	if (!tmp) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+	seq = fs_info->qgroup_seq;
+	fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
+
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(roots, &uiter))) {
+		struct ulist_node *tmp_unode;
+		struct ulist_iterator tmp_uiter;
+		struct btrfs_qgroup *qg;
+
+		qg = find_qgroup_rb(fs_info, unode->val);
+		if (!qg)
+			continue;
+
+		ulist_reinit(tmp);
+						/* XXX id not needed */
+		ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
+		ULIST_ITER_INIT(&tmp_uiter);
+		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+			struct btrfs_qgroup_list *glist;
+
+			qg = (struct btrfs_qgroup *)tmp_unode->aux;
+			if (qg->refcnt < seq)
+				qg->refcnt = seq + 1;
+			else
+				++qg->refcnt;
+
+			list_for_each_entry(glist, &qg->groups, next_group) {
+				ulist_add(tmp, glist->group->qgroupid,
+					  (unsigned long)glist->group,
+					  GFP_ATOMIC);
+			}
+		}
+	}
+
+	/*
+	 * step 2: walk from the new root
+	 */
+	ulist_reinit(tmp);
+	ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(tmp, &uiter))) {
+		struct btrfs_qgroup *qg;
+		struct btrfs_qgroup_list *glist;
+
+		qg = (struct btrfs_qgroup *)unode->aux;
+		if (qg->refcnt < seq) {
+			/* not visited by step 1 */
+			qg->rfer += sgn * node->num_bytes;
+			qg->rfer_cmpr += sgn * node->num_bytes;
+			if (roots->nnodes == 0) {
+				qg->excl += sgn * node->num_bytes;
+				qg->excl_cmpr += sgn * node->num_bytes;
+			}
+			qgroup_dirty(fs_info, qg);
+		}
+		WARN_ON(qg->tag >= seq);
+		qg->tag = seq;
+
+		list_for_each_entry(glist, &qg->groups, next_group) {
+			ulist_add(tmp, glist->group->qgroupid,
+				  (unsigned long)glist->group, GFP_ATOMIC);
+		}
+	}
+
+	/*
+	 * step 3: walk again from old refs
+	 */
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(roots, &uiter))) {
+		struct btrfs_qgroup *qg;
+		struct ulist_node *tmp_unode;
+		struct ulist_iterator tmp_uiter;
+
+		qg = find_qgroup_rb(fs_info, unode->val);
+		if (!qg)
+			continue;
+
+		ulist_reinit(tmp);
+		ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
+		ULIST_ITER_INIT(&tmp_uiter);
+		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+			struct btrfs_qgroup_list *glist;
+
+			qg = (struct btrfs_qgroup *)tmp_unode->aux;
+			if (qg->tag == seq)
+				continue;
+
+			if (qg->refcnt - seq == roots->nnodes) {
+				qg->excl -= sgn * node->num_bytes;
+				qg->excl_cmpr -= sgn * node->num_bytes;
+				qgroup_dirty(fs_info, qg);
+			}
+
+			list_for_each_entry(glist, &qg->groups, next_group) {
+				ulist_add(tmp, glist->group->qgroupid,
+					  (unsigned long)glist->group,
+					  GFP_ATOMIC);
+			}
+		}
+	}
+	ret = 0;
+unlock:
+	spin_unlock(&fs_info->qgroup_lock);
+out:
+	ulist_free(roots);
+	ulist_free(tmp);
+
+	return ret;
+}
+
+/*
+ * called from commit_transaction. Writes all changed qgroups to disk.
+ */
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+		      struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *quota_root = fs_info->quota_root;
+	int ret = 0;
+
+	if (!quota_root)
+		goto out;
+
+	fs_info->quota_enabled = fs_info->pending_quota_state;
+
+	spin_lock(&fs_info->qgroup_lock);
+	while (!list_empty(&fs_info->dirty_qgroups)) {
+		struct btrfs_qgroup *qgroup;
+		qgroup = list_first_entry(&fs_info->dirty_qgroups,
+					  struct btrfs_qgroup, dirty);
+		list_del_init(&qgroup->dirty);
+		spin_unlock(&fs_info->qgroup_lock);
+		ret = update_qgroup_info_item(trans, quota_root, qgroup);
+		if (ret)
+			fs_info->qgroup_flags |=
+					BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+		spin_lock(&fs_info->qgroup_lock);
+	}
+	if (fs_info->quota_enabled)
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
+	else
+		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+	spin_unlock(&fs_info->qgroup_lock);
+
+	ret = update_qgroup_status_item(trans, fs_info, quota_root);
+	if (ret)
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+
+out:
+
+	return ret;
+}
+
+/*
+ * copy the acounting information between qgroups. This is necessary when a
+ * snapshot or a subvolume is created
+ */
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+			 struct btrfs_qgroup_inherit *inherit)
+{
+	int ret = 0;
+	int i;
+	u64 *i_qgroups;
+	struct btrfs_root *quota_root = fs_info->quota_root;
+	struct btrfs_qgroup *srcgroup;
+	struct btrfs_qgroup *dstgroup;
+	u32 level_size = 0;
+
+	if (!fs_info->quota_enabled)
+		return 0;
+
+	if (!quota_root)
+		return -EINVAL;
+
+	/*
+	 * create a tracking group for the subvol itself
+	 */
+	ret = add_qgroup_item(trans, quota_root, objectid);
+	if (ret)
+		goto out;
+
+	if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
+		ret = update_qgroup_limit_item(trans, quota_root, objectid,
+					       inherit->lim.flags,
+					       inherit->lim.max_rfer,
+					       inherit->lim.max_excl,
+					       inherit->lim.rsv_rfer,
+					       inherit->lim.rsv_excl);
+		if (ret)
+			goto out;
+	}
+
+	if (srcid) {
+		struct btrfs_root *srcroot;
+		struct btrfs_key srckey;
+		int srcroot_level;
+
+		srckey.objectid = srcid;
+		srckey.type = BTRFS_ROOT_ITEM_KEY;
+		srckey.offset = (u64)-1;
+		srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey);
+		if (IS_ERR(srcroot)) {
+			ret = PTR_ERR(srcroot);
+			goto out;
+		}
+
+		rcu_read_lock();
+		srcroot_level = btrfs_header_level(srcroot->node);
+		level_size = btrfs_level_size(srcroot, srcroot_level);
+		rcu_read_unlock();
+	}
+
+	/*
+	 * add qgroup to all inherited groups
+	 */
+	if (inherit) {
+		i_qgroups = (u64 *)(inherit + 1);
+		for (i = 0; i < inherit->num_qgroups; ++i) {
+			ret = add_qgroup_relation_item(trans, quota_root,
+						       objectid, *i_qgroups);
+			if (ret)
+				goto out;
+			ret = add_qgroup_relation_item(trans, quota_root,
+						       *i_qgroups, objectid);
+			if (ret)
+				goto out;
+			++i_qgroups;
+		}
+	}
+
+
+	spin_lock(&fs_info->qgroup_lock);
+
+	dstgroup = add_qgroup_rb(fs_info, objectid);
+	if (!dstgroup)
+		goto unlock;
+
+	if (srcid) {
+		srcgroup = find_qgroup_rb(fs_info, srcid);
+		if (!srcgroup)
+			goto unlock;
+		dstgroup->rfer = srcgroup->rfer - level_size;
+		dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
+		srcgroup->excl = level_size;
+		srcgroup->excl_cmpr = level_size;
+		qgroup_dirty(fs_info, dstgroup);
+		qgroup_dirty(fs_info, srcgroup);
+	}
+
+	if (!inherit)
+		goto unlock;
+
+	i_qgroups = (u64 *)(inherit + 1);
+	for (i = 0; i < inherit->num_qgroups; ++i) {
+		ret = add_relation_rb(quota_root->fs_info, objectid,
+				      *i_qgroups);
+		if (ret)
+			goto unlock;
+		++i_qgroups;
+	}
+
+	for (i = 0; i <  inherit->num_ref_copies; ++i) {
+		struct btrfs_qgroup *src;
+		struct btrfs_qgroup *dst;
+
+		src = find_qgroup_rb(fs_info, i_qgroups[0]);
+		dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+		if (!src || !dst) {
+			ret = -EINVAL;
+			goto unlock;
+		}
+
+		dst->rfer = src->rfer - level_size;
+		dst->rfer_cmpr = src->rfer_cmpr - level_size;
+		i_qgroups += 2;
+	}
+	for (i = 0; i <  inherit->num_excl_copies; ++i) {
+		struct btrfs_qgroup *src;
+		struct btrfs_qgroup *dst;
+
+		src = find_qgroup_rb(fs_info, i_qgroups[0]);
+		dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+		if (!src || !dst) {
+			ret = -EINVAL;
+			goto unlock;
+		}
+
+		dst->excl = src->excl + level_size;
+		dst->excl_cmpr = src->excl_cmpr + level_size;
+		i_qgroups += 2;
+	}
+
+unlock:
+	spin_unlock(&fs_info->qgroup_lock);
+out:
+	return ret;
+}
+
+/*
+ * reserve some space for a qgroup and all its parents. The reservation takes
+ * place with start_transaction or dealloc_reserve, similar to ENOSPC
+ * accounting. If not enough space is available, EDQUOT is returned.
+ * We assume that the requested space is new for all qgroups.
+ */
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
+{
+	struct btrfs_root *quota_root;
+	struct btrfs_qgroup *qgroup;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u64 ref_root = root->root_key.objectid;
+	int ret = 0;
+	struct ulist *ulist = NULL;
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+
+	if (!is_fstree(ref_root))
+		return 0;
+
+	if (num_bytes == 0)
+		return 0;
+
+	spin_lock(&fs_info->qgroup_lock);
+	quota_root = fs_info->quota_root;
+	if (!quota_root)
+		goto out;
+
+	qgroup = find_qgroup_rb(fs_info, ref_root);
+	if (!qgroup)
+		goto out;
+
+	/*
+	 * in a first step, we check all affected qgroups if any limits would
+	 * be exceeded
+	 */
+	ulist = ulist_alloc(GFP_ATOMIC);
+	ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(ulist, &uiter))) {
+		struct btrfs_qgroup *qg;
+		struct btrfs_qgroup_list *glist;
+
+		qg = (struct btrfs_qgroup *)unode->aux;
+
+		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
+		    qg->reserved + qg->rfer + num_bytes >
+		    qg->max_rfer)
+			ret = -EDQUOT;
+
+		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
+		    qg->reserved + qg->excl + num_bytes >
+		    qg->max_excl)
+			ret = -EDQUOT;
+
+		list_for_each_entry(glist, &qg->groups, next_group) {
+			ulist_add(ulist, glist->group->qgroupid,
+				  (unsigned long)glist->group, GFP_ATOMIC);
+		}
+	}
+	if (ret)
+		goto out;
+
+	/*
+	 * no limits exceeded, now record the reservation into all qgroups
+	 */
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(ulist, &uiter))) {
+		struct btrfs_qgroup *qg;
+
+		qg = (struct btrfs_qgroup *)unode->aux;
+
+		qg->reserved += num_bytes;
+	}
+
+out:
+	spin_unlock(&fs_info->qgroup_lock);
+	ulist_free(ulist);
+
+	return ret;
+}
+
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+	struct btrfs_root *quota_root;
+	struct btrfs_qgroup *qgroup;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct ulist *ulist = NULL;
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	u64 ref_root = root->root_key.objectid;
+
+	if (!is_fstree(ref_root))
+		return;
+
+	if (num_bytes == 0)
+		return;
+
+	spin_lock(&fs_info->qgroup_lock);
+
+	quota_root = fs_info->quota_root;
+	if (!quota_root)
+		goto out;
+
+	qgroup = find_qgroup_rb(fs_info, ref_root);
+	if (!qgroup)
+		goto out;
+
+	ulist = ulist_alloc(GFP_ATOMIC);
+	ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(ulist, &uiter))) {
+		struct btrfs_qgroup *qg;
+		struct btrfs_qgroup_list *glist;
+
+		qg = (struct btrfs_qgroup *)unode->aux;
+
+		qg->reserved -= num_bytes;
+
+		list_for_each_entry(glist, &qg->groups, next_group) {
+			ulist_add(ulist, glist->group->qgroupid,
+				  (unsigned long)glist->group, GFP_ATOMIC);
+		}
+	}
+
+out:
+	spin_unlock(&fs_info->qgroup_lock);
+	ulist_free(ulist);
+}
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
+{
+	if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
+		return;
+	printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n",
+		trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
+		trans->delayed_ref_elem.seq);
+	BUG();
+}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 23cbda0685b8..0d6c8816845a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -351,6 +351,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	h->block_rsv = NULL;
 	h->orig_rsv = NULL;
 	h->aborted = 0;
+	h->delayed_ref_elem.seq = 0;
+	INIT_LIST_HEAD(&h->qgroup_ref_list);
 
 	smp_mb();
 	if (cur_trans->blocked && may_wait_transaction(root, type)) {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 010729446e13..16ba00842c38 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -20,6 +20,7 @@
 #define __BTRFS_TRANSACTION__
 #include "btrfs_inode.h"
 #include "delayed-ref.h"
+#include "ctree.h"
 
 struct btrfs_transaction {
 	u64 transid;
@@ -63,6 +64,8 @@ struct btrfs_trans_handle {
 	 * Subvolume quota depends on this
 	 */
 	struct btrfs_root *root;
+	struct seq_list delayed_ref_elem;
+	struct list_head qgroup_ref_list;
 };
 
 struct btrfs_pending_snapshot {

From edf39272db4810282360f7362d43ade1d524c913 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Thu, 28 Jun 2012 18:04:55 +0200
Subject: [PATCH 10/15] Btrfs: call the qgroup accounting functions

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/extent-tree.c |  3 +++
 fs/btrfs/transaction.c | 14 ++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1a63b830846d..c08337a83ace 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2479,6 +2479,8 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		       2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
 		       CHUNK_ALLOC_NO_FORCE);
 
+	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+
 	delayed_refs = &trans->transaction->delayed_refs;
 	INIT_LIST_HEAD(&cluster);
 again:
@@ -2588,6 +2590,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	}
 out:
 	spin_unlock(&delayed_refs->lock);
+	assert_qgroups_uptodate(trans);
 	return 0;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0d6c8816845a..d20d2e24f8d2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -512,6 +512,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 
+	/*
+	 * do the qgroup accounting as early as possible
+	 */
+	err = btrfs_delayed_refs_qgroup_accounting(trans, info);
+
 	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
 	/*
@@ -571,6 +576,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	    root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
 		err = -EIO;
 	}
+	assert_qgroups_uptodate(trans);
 
 	memset(trans, 0, sizeof(*trans));
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -1355,6 +1361,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		if (ret)
 			goto cleanup_transaction;
 
+		/*
+		 * running the delayed items may have added new refs. account
+		 * them now so that they hinder processing of more delayed refs
+		 * as little as possible.
+		 */
+		btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+
 		/*
 		 * rename don't use btrfs_join_transaction, so, once we
 		 * set the transaction to blocked above, we aren't going
@@ -1467,6 +1480,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			    root->fs_info->chunk_root->node);
 	switch_commit_root(root->fs_info->chunk_root);
 
+	assert_qgroups_uptodate(trans);
 	update_super_roots(root);
 
 	if (!root->fs_info->log_root_recovering) {

From bcef60f249034f69e89e544461cbfecb68975595 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Tue, 13 Sep 2011 15:23:30 +0200
Subject: [PATCH 11/15] Btrfs: quota tree support and startup

Init the quota tree along with the others on open_ctree
and close_ctree. Add the quota tree to the list of well
known trees in btrfs_read_fs_root_no_name.

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/disk-io.c | 47 ++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ccba9b684c96..2ba03b96fbe0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2967,6 +2967,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 	kfree(fs_info->chunk_root);
 	kfree(fs_info->dev_root);
 	kfree(fs_info->csum_root);
+	kfree(fs_info->quota_root);
 	kfree(fs_info->super_copy);
 	kfree(fs_info->super_for_commit);
 	kfree(fs_info);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index eca054974425..87d9391c0576 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1472,6 +1472,9 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 		return fs_info->dev_root;
 	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
 		return fs_info->csum_root;
+	if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
+		return fs_info->quota_root ? fs_info->quota_root :
+					     ERR_PTR(-ENOENT);
 again:
 	spin_lock(&fs_info->fs_roots_radix_lock);
 	root = radix_tree_lookup(&fs_info->fs_roots_radix,
@@ -1899,6 +1902,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 	free_extent_buffer(info->extent_root->commit_root);
 	free_extent_buffer(info->csum_root->node);
 	free_extent_buffer(info->csum_root->commit_root);
+	if (info->quota_root) {
+		free_extent_buffer(info->quota_root->node);
+		free_extent_buffer(info->quota_root->commit_root);
+	}
 
 	info->tree_root->node = NULL;
 	info->tree_root->commit_root = NULL;
@@ -1908,6 +1915,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 	info->extent_root->commit_root = NULL;
 	info->csum_root->node = NULL;
 	info->csum_root->commit_root = NULL;
+	if (info->quota_root) {
+		info->quota_root->node = NULL;
+		info->quota_root->commit_root = NULL;
+	}
 
 	if (chunk_root) {
 		free_extent_buffer(info->chunk_root->node);
@@ -1938,6 +1949,7 @@ int open_ctree(struct super_block *sb,
 	struct btrfs_root *csum_root;
 	struct btrfs_root *chunk_root;
 	struct btrfs_root *dev_root;
+	struct btrfs_root *quota_root;
 	struct btrfs_root *log_tree_root;
 	int ret;
 	int err = -EINVAL;
@@ -1949,9 +1961,10 @@ int open_ctree(struct super_block *sb,
 	csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
 	chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
 	dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
+	quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
 
 	if (!tree_root || !extent_root || !csum_root ||
-	    !chunk_root || !dev_root) {
+	    !chunk_root || !dev_root || !quota_root) {
 		err = -ENOMEM;
 		goto fail;
 	}
@@ -2441,6 +2454,17 @@ int open_ctree(struct super_block *sb,
 		goto recovery_tree_root;
 	csum_root->track_dirty = 1;
 
+	ret = find_and_setup_root(tree_root, fs_info,
+				  BTRFS_QUOTA_TREE_OBJECTID, quota_root);
+	if (ret) {
+		kfree(quota_root);
+		quota_root = fs_info->quota_root = NULL;
+	} else {
+		quota_root->track_dirty = 1;
+		fs_info->quota_enabled = 1;
+		fs_info->pending_quota_state = 1;
+	}
+
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
 
@@ -2500,6 +2524,9 @@ int open_ctree(struct super_block *sb,
 			       " integrity check module %s\n", sb->s_id);
 	}
 #endif
+	ret = btrfs_read_qgroup_config(fs_info);
+	if (ret)
+		goto fail_trans_kthread;
 
 	/* do not make disk changes in broken FS */
 	if (btrfs_super_log_root(disk_super) != 0 &&
@@ -2510,7 +2537,7 @@ int open_ctree(struct super_block *sb,
 			printk(KERN_WARNING "Btrfs log replay required "
 			       "on RO media\n");
 			err = -EIO;
-			goto fail_trans_kthread;
+			goto fail_qgroup;
 		}
 		blocksize =
 		     btrfs_level_size(tree_root,
@@ -2519,7 +2546,7 @@ int open_ctree(struct super_block *sb,
 		log_tree_root = btrfs_alloc_root(fs_info);
 		if (!log_tree_root) {
 			err = -ENOMEM;
-			goto fail_trans_kthread;
+			goto fail_qgroup;
 		}
 
 		__setup_root(nodesize, leafsize, sectorsize, stripesize,
@@ -2559,7 +2586,7 @@ int open_ctree(struct super_block *sb,
 			printk(KERN_WARNING
 			       "btrfs: failed to recover relocation\n");
 			err = -EINVAL;
-			goto fail_trans_kthread;
+			goto fail_qgroup;
 		}
 	}
 
@@ -2569,10 +2596,10 @@ int open_ctree(struct super_block *sb,
 
 	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
 	if (!fs_info->fs_root)
-		goto fail_trans_kthread;
+		goto fail_qgroup;
 	if (IS_ERR(fs_info->fs_root)) {
 		err = PTR_ERR(fs_info->fs_root);
-		goto fail_trans_kthread;
+		goto fail_qgroup;
 	}
 
 	if (sb->s_flags & MS_RDONLY)
@@ -2596,6 +2623,8 @@ int open_ctree(struct super_block *sb,
 
 	return 0;
 
+fail_qgroup:
+	btrfs_free_qgroup_config(fs_info);
 fail_trans_kthread:
 	kthread_stop(fs_info->transaction_kthread);
 fail_cleaner:
@@ -3194,6 +3223,8 @@ int close_ctree(struct btrfs_root *root)
 	fs_info->closing = 2;
 	smp_mb();
 
+	btrfs_free_qgroup_config(root->fs_info);
+
 	if (fs_info->delalloc_bytes) {
 		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
 		       (unsigned long long)fs_info->delalloc_bytes);
@@ -3213,6 +3244,10 @@ int close_ctree(struct btrfs_root *root)
 	free_extent_buffer(fs_info->dev_root->commit_root);
 	free_extent_buffer(fs_info->csum_root->node);
 	free_extent_buffer(fs_info->csum_root->commit_root);
+	if (fs_info->quota_root) {
+		free_extent_buffer(fs_info->quota_root->node);
+		free_extent_buffer(fs_info->quota_root->commit_root);
+	}
 
 	btrfs_free_block_groups(fs_info);
 

From 546adb0d817c34dc2be3a7cb5bba8771f837a562 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Thu, 14 Jun 2012 16:37:44 +0200
Subject: [PATCH 12/15] Btrfs: hooks for qgroup to record delayed refs

Hooks into qgroup code to record refs and into transaction commit.
This is the main entry point for qgroup. Basically every change in
extent backrefs got accounted to the appropriate qgroups.

Signed-off-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/delayed-ref.c | 16 ++++++++++------
 fs/btrfs/delayed-ref.h | 19 +++++++++++++++++++
 fs/btrfs/transaction.c |  7 +++++++
 3 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 21a757717637..da7419ed01bb 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -529,8 +529,8 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	ref->is_head = 0;
 	ref->in_tree = 1;
 
-	if (is_fstree(ref_root))
-		seq = btrfs_inc_tree_mod_seq(fs_info);
+	if (need_ref_seq(for_cow, ref_root))
+		seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
 	ref->seq = seq;
 
 	full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -588,8 +588,8 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	ref->is_head = 0;
 	ref->in_tree = 1;
 
-	if (is_fstree(ref_root))
-		seq = btrfs_inc_tree_mod_seq(fs_info);
+	if (need_ref_seq(for_cow, ref_root))
+		seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
 	ref->seq = seq;
 
 	full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -662,10 +662,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
 				   num_bytes, parent, ref_root, level, action,
 				   for_cow);
-	if (!is_fstree(ref_root) &&
+	if (!need_ref_seq(for_cow, ref_root) &&
 	    waitqueue_active(&fs_info->tree_mod_seq_wait))
 		wake_up(&fs_info->tree_mod_seq_wait);
 	spin_unlock(&delayed_refs->lock);
+	if (need_ref_seq(for_cow, ref_root))
+		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
 
 	return 0;
 }
@@ -711,10 +713,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
 				   num_bytes, parent, ref_root, owner, offset,
 				   action, for_cow);
-	if (!is_fstree(ref_root) &&
+	if (!need_ref_seq(for_cow, ref_root) &&
 	    waitqueue_active(&fs_info->tree_mod_seq_wait))
 		wake_up(&fs_info->tree_mod_seq_wait);
 	spin_unlock(&delayed_refs->lock);
+	if (need_ref_seq(for_cow, ref_root))
+		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
 
 	return 0;
 }
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 2b5cb27f9861..0d7c90c366b6 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -179,6 +179,25 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 			    struct btrfs_delayed_ref_root *delayed_refs,
 			    u64 seq);
 
+/*
+ * delayed refs with a ref_seq > 0 must be held back during backref walking.
+ * this only applies to items in one of the fs-trees. for_cow items never need
+ * to be held back, so they won't get a ref_seq number.
+ */
+static inline int need_ref_seq(int for_cow, u64 rootid)
+{
+	if (for_cow)
+		return 0;
+
+	if (rootid == BTRFS_FS_TREE_OBJECTID)
+		return 1;
+
+	if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+		return 1;
+
+	return 0;
+}
+
 /*
  * a node might live in a head or a regular ref, this lets you
  * test for the proper type to use.
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d20d2e24f8d2..21c768cb443f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -795,6 +795,13 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 	ret = btrfs_run_dev_stats(trans, root->fs_info);
 	BUG_ON(ret);
 
+	ret = btrfs_run_qgroups(trans, root->fs_info);
+	BUG_ON(ret);
+
+	/* run_qgroups might have added some more refs */
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	BUG_ON(ret);
+
 	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
 		next = fs_info->dirty_cowonly_roots.next;
 		list_del_init(next);

From c556723794b3487a79de1ecd6354975b1389f5ff Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Wed, 14 Sep 2011 15:44:05 +0200
Subject: [PATCH 13/15] Btrfs: hooks to reserve qgroup space

Like block reserves, reserve a small piece of space on each
transaction start and for delalloc. These are the hooks that
can actually return EDQUOT to the user.
The amount of space reserved is tracked in the transaction
handle.

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/extent-tree.c | 12 ++++++++++++
 fs/btrfs/transaction.c | 16 ++++++++++++++++
 fs/btrfs/transaction.h |  1 +
 3 files changed, 29 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c08337a83ace..2ce16f97730a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4565,6 +4565,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	csum_bytes = BTRFS_I(inode)->csum_bytes;
 	spin_unlock(&BTRFS_I(inode)->lock);
 
+	if (root->fs_info->quota_enabled) {
+		ret = btrfs_qgroup_reserve(root, num_bytes +
+					   nr_extents * root->leafsize);
+		if (ret)
+			return ret;
+	}
+
 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
 	if (ret) {
 		u64 to_free = 0;
@@ -4643,6 +4650,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 
 	trace_btrfs_space_reservation(root->fs_info, "delalloc",
 				      btrfs_ino(inode), to_free, 0);
+	if (root->fs_info->quota_enabled) {
+		btrfs_qgroup_free(root, num_bytes +
+					dropped * root->leafsize);
+	}
+
 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
 				to_free);
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 21c768cb443f..f1e29fbd5317 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -295,6 +295,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	struct btrfs_transaction *cur_trans;
 	u64 num_bytes = 0;
 	int ret;
+	u64 qgroup_reserved = 0;
 
 	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
 		return ERR_PTR(-EROFS);
@@ -313,6 +314,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	 * the appropriate flushing if need be.
 	 */
 	if (num_items > 0 && root != root->fs_info->chunk_root) {
+		if (root->fs_info->quota_enabled &&
+		    is_fstree(root->root_key.objectid)) {
+			qgroup_reserved = num_items * root->leafsize;
+			ret = btrfs_qgroup_reserve(root, qgroup_reserved);
+			if (ret)
+				return ERR_PTR(ret);
+		}
+
 		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
 		ret = btrfs_block_rsv_add(root,
 					  &root->fs_info->trans_block_rsv,
@@ -351,6 +360,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	h->block_rsv = NULL;
 	h->orig_rsv = NULL;
 	h->aborted = 0;
+	h->qgroup_reserved = qgroup_reserved;
 	h->delayed_ref_elem.seq = 0;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
 
@@ -524,6 +534,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	 * end_transaction. Subvolume quota depends on this.
 	 */
 	WARN_ON(trans->root != root);
+
+	if (trans->qgroup_reserved) {
+		btrfs_qgroup_free(root, trans->qgroup_reserved);
+		trans->qgroup_reserved = 0;
+	}
+
 	while (count < 2) {
 		unsigned long cur = trans->delayed_ref_updates;
 		trans->delayed_ref_updates = 0;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 16ba00842c38..2759e0572c5c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -50,6 +50,7 @@ struct btrfs_transaction {
 struct btrfs_trans_handle {
 	u64 transid;
 	u64 bytes_reserved;
+	u64 qgroup_reserved;
 	unsigned long use_count;
 	unsigned long blocks_reserved;
 	unsigned long blocks_used;

From 5d13a37bd5327220e13329943d1228acfbe5934a Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Wed, 14 Sep 2011 15:53:51 +0200
Subject: [PATCH 14/15] Btrfs: add qgroup ioctls

Ioctls to control the qgroup feature like adding and
removing qgroups and assigning qgroups.

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/ioctl.c | 185 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ioctl.h |  27 +++++++
 2 files changed, 212 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0e92e5763005..55a7283a9e18 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3390,6 +3390,183 @@ static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
 	return ret;
 }
 
+static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_quota_ctl_args *sa;
+	struct btrfs_trans_handle *trans = NULL;
+	int ret;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa))
+		return PTR_ERR(sa);
+
+	if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
+		trans = btrfs_start_transaction(root, 2);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			goto out;
+		}
+	}
+
+	switch (sa->cmd) {
+	case BTRFS_QUOTA_CTL_ENABLE:
+		ret = btrfs_quota_enable(trans, root->fs_info);
+		break;
+	case BTRFS_QUOTA_CTL_DISABLE:
+		ret = btrfs_quota_disable(trans, root->fs_info);
+		break;
+	case BTRFS_QUOTA_CTL_RESCAN:
+		ret = btrfs_quota_rescan(root->fs_info);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	if (copy_to_user(arg, sa, sizeof(*sa)))
+		ret = -EFAULT;
+
+	if (trans) {
+		err = btrfs_commit_transaction(trans, root);
+		if (err && !ret)
+			ret = err;
+	}
+
+out:
+	kfree(sa);
+	return ret;
+}
+
+static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_qgroup_assign_args *sa;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa))
+		return PTR_ERR(sa);
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	/* FIXME: check if the IDs really exist */
+	if (sa->assign) {
+		ret = btrfs_add_qgroup_relation(trans, root->fs_info,
+						sa->src, sa->dst);
+	} else {
+		ret = btrfs_del_qgroup_relation(trans, root->fs_info,
+						sa->src, sa->dst);
+	}
+
+	err = btrfs_end_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+
+out:
+	kfree(sa);
+	return ret;
+}
+
+static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_qgroup_create_args *sa;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa))
+		return PTR_ERR(sa);
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	/* FIXME: check if the IDs really exist */
+	if (sa->create) {
+		ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid,
+					  NULL);
+	} else {
+		ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
+	}
+
+	err = btrfs_end_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+
+out:
+	kfree(sa);
+	return ret;
+}
+
+static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_qgroup_limit_args *sa;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	int err;
+	u64 qgroupid;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa))
+		return PTR_ERR(sa);
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	qgroupid = sa->qgroupid;
+	if (!qgroupid) {
+		/* take the current subvol as qgroup */
+		qgroupid = root->root_key.objectid;
+	}
+
+	/* FIXME: check if the IDs really exist */
+	ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim);
+
+	err = btrfs_end_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+
+out:
+	kfree(sa);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -3476,6 +3653,14 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_get_dev_stats(root, argp, 0);
 	case BTRFS_IOC_GET_AND_RESET_DEV_STATS:
 		return btrfs_ioctl_get_dev_stats(root, argp, 1);
+	case BTRFS_IOC_QUOTA_CTL:
+		return btrfs_ioctl_quota_ctl(root, argp);
+	case BTRFS_IOC_QGROUP_ASSIGN:
+		return btrfs_ioctl_qgroup_assign(root, argp);
+	case BTRFS_IOC_QGROUP_CREATE:
+		return btrfs_ioctl_qgroup_create(root, argp);
+	case BTRFS_IOC_QGROUP_LIMIT:
+		return btrfs_ioctl_qgroup_limit(root, argp);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index a8a2230f4c5c..9dd50c4656b3 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -319,6 +319,25 @@ struct btrfs_ioctl_get_dev_stats {
 	__u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
 };
 
+#define BTRFS_QUOTA_CTL_ENABLE	1
+#define BTRFS_QUOTA_CTL_DISABLE	2
+#define BTRFS_QUOTA_CTL_RESCAN	3
+struct btrfs_ioctl_quota_ctl_args {
+	__u64 cmd;
+	__u64 status;
+};
+
+struct btrfs_ioctl_qgroup_assign_args {
+	__u64 assign;
+	__u64 src;
+	__u64 dst;
+};
+
+struct btrfs_ioctl_qgroup_create_args {
+	__u64 create;
+	__u64 qgroupid;
+};
+
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -388,4 +407,12 @@ struct btrfs_ioctl_get_dev_stats {
 #define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \
 					struct btrfs_ioctl_get_dev_stats)
 
+#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \
+			       struct btrfs_ioctl_quota_ctl_args)
+#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \
+			       struct btrfs_ioctl_qgroup_assign_args)
+#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \
+			       struct btrfs_ioctl_qgroup_create_args)
+#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \
+			       struct btrfs_ioctl_qgroup_limit_args)
 #endif

From 6f72c7e20dbaea55f04546de69586c84a3654503 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Wed, 14 Sep 2011 15:58:21 +0200
Subject: [PATCH 15/15] Btrfs: add qgroup inheritance

When creating a subvolume or snapshot, it is necessary
to initialize the qgroup account with a copy of some
other (tracking) qgroup. This patch adds parameters
to the ioctls to pass the information from which qgroup
to inherit.

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/ioctl.c       | 59 ++++++++++++++++++++++++++++++------------
 fs/btrfs/ioctl.h       | 11 +++++++-
 fs/btrfs/transaction.c |  8 ++++++
 fs/btrfs/transaction.h |  1 +
 4 files changed, 61 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 55a7283a9e18..1dffd0adf975 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -336,7 +336,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 static noinline int create_subvol(struct btrfs_root *root,
 				  struct dentry *dentry,
 				  char *name, int namelen,
-				  u64 *async_transid)
+				  u64 *async_transid,
+				  struct btrfs_qgroup_inherit **inherit)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key key;
@@ -368,6 +369,11 @@ static noinline int create_subvol(struct btrfs_root *root,
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
+	ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid,
+				   inherit ? *inherit : NULL);
+	if (ret)
+		goto fail;
+
 	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
 				      0, objectid, NULL, 0, 0, 0);
 	if (IS_ERR(leaf)) {
@@ -484,7 +490,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 
 static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 			   char *name, int namelen, u64 *async_transid,
-			   bool readonly)
+			   bool readonly, struct btrfs_qgroup_inherit **inherit)
 {
 	struct inode *inode;
 	struct btrfs_pending_snapshot *pending_snapshot;
@@ -502,6 +508,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	pending_snapshot->dentry = dentry;
 	pending_snapshot->root = root;
 	pending_snapshot->readonly = readonly;
+	if (inherit) {
+		pending_snapshot->inherit = *inherit;
+		*inherit = NULL;	/* take responsibility to free it */
+	}
 
 	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
 	if (IS_ERR(trans)) {
@@ -635,7 +645,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
 static noinline int btrfs_mksubvol(struct path *parent,
 				   char *name, int namelen,
 				   struct btrfs_root *snap_src,
-				   u64 *async_transid, bool readonly)
+				   u64 *async_transid, bool readonly,
+				   struct btrfs_qgroup_inherit **inherit)
 {
 	struct inode *dir  = parent->dentry->d_inode;
 	struct dentry *dentry;
@@ -666,11 +677,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
 		goto out_up_read;
 
 	if (snap_src) {
-		error = create_snapshot(snap_src, dentry,
-					name, namelen, async_transid, readonly);
+		error = create_snapshot(snap_src, dentry, name, namelen,
+					async_transid, readonly, inherit);
 	} else {
 		error = create_subvol(BTRFS_I(dir)->root, dentry,
-				      name, namelen, async_transid);
+				      name, namelen, async_transid, inherit);
 	}
 	if (!error)
 		fsnotify_mkdir(dir, dentry);
@@ -1379,11 +1390,9 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 }
 
 static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
-						    char *name,
-						    unsigned long fd,
-						    int subvol,
-						    u64 *transid,
-						    bool readonly)
+				char *name, unsigned long fd, int subvol,
+				u64 *transid, bool readonly,
+				struct btrfs_qgroup_inherit **inherit)
 {
 	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 	struct file *src_file;
@@ -1407,7 +1416,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 
 	if (subvol) {
 		ret = btrfs_mksubvol(&file->f_path, name, namelen,
-				     NULL, transid, readonly);
+				     NULL, transid, readonly, inherit);
 	} else {
 		struct inode *src_inode;
 		src_file = fget(fd);
@@ -1426,7 +1435,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 		}
 		ret = btrfs_mksubvol(&file->f_path, name, namelen,
 				     BTRFS_I(src_inode)->root,
-				     transid, readonly);
+				     transid, readonly, inherit);
 		fput(src_file);
 	}
 out:
@@ -1446,7 +1455,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 
 	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
 					      vol_args->fd, subvol,
-					      NULL, false);
+					      NULL, false, NULL);
 
 	kfree(vol_args);
 	return ret;
@@ -1460,6 +1469,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
 	u64 transid = 0;
 	u64 *ptr = NULL;
 	bool readonly = false;
+	struct btrfs_qgroup_inherit *inherit = NULL;
 
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args))
@@ -1467,7 +1477,8 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
 	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
 
 	if (vol_args->flags &
-	    ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
+	    ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
+	      BTRFS_SUBVOL_QGROUP_INHERIT)) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
@@ -1476,10 +1487,21 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
 		ptr = &transid;
 	if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
 		readonly = true;
+	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
+		if (vol_args->size > PAGE_CACHE_SIZE) {
+			ret = -EINVAL;
+			goto out;
+		}
+		inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
+		if (IS_ERR(inherit)) {
+			ret = PTR_ERR(inherit);
+			goto out;
+		}
+	}
 
 	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
-					      vol_args->fd, subvol,
-					      ptr, readonly);
+					      vol_args->fd, subvol, ptr,
+					      readonly, &inherit);
 
 	if (ret == 0 && ptr &&
 	    copy_to_user(arg +
@@ -1488,6 +1510,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
 		ret = -EFAULT;
 out:
 	kfree(vol_args);
+	kfree(inherit);
 	return ret;
 }
 
@@ -3588,6 +3611,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_snap_create_v2(file, argp, 0);
 	case BTRFS_IOC_SUBVOL_CREATE:
 		return btrfs_ioctl_snap_create(file, argp, 1);
+	case BTRFS_IOC_SUBVOL_CREATE_V2:
+		return btrfs_ioctl_snap_create_v2(file, argp, 1);
 	case BTRFS_IOC_SNAP_DESTROY:
 		return btrfs_ioctl_snap_destroy(file, argp);
 	case BTRFS_IOC_SUBVOL_GETFLAGS:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 9dd50c4656b3..cdda57f1c240 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -32,6 +32,7 @@ struct btrfs_ioctl_vol_args {
 
 #define BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0)
 #define BTRFS_SUBVOL_RDONLY		(1ULL << 1)
+#define BTRFS_SUBVOL_QGROUP_INHERIT	(1ULL << 2)
 #define BTRFS_FSID_SIZE 16
 #define BTRFS_UUID_SIZE 16
 
@@ -64,7 +65,13 @@ struct btrfs_ioctl_vol_args_v2 {
 	__s64 fd;
 	__u64 transid;
 	__u64 flags;
-	__u64 unused[4];
+	union {
+		struct {
+			__u64 size;
+			struct btrfs_qgroup_inherit __user *qgroup_inherit;
+		};
+		__u64 unused[4];
+	};
 	char name[BTRFS_SUBVOL_NAME_MAX + 1];
 };
 
@@ -382,6 +389,8 @@ struct btrfs_ioctl_qgroup_create_args {
 #define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
 #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
 				   struct btrfs_ioctl_vol_args_v2)
+#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
+				   struct btrfs_ioctl_vol_args_v2)
 #define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
 #define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
 #define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f1e29fbd5317..127283913a42 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -990,6 +990,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 		}
 	}
 
+	ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
+				   objectid, pending->inherit);
+	kfree(pending->inherit);
+	if (ret) {
+		pending->error = ret;
+		goto fail;
+	}
+
 	key.objectid = objectid;
 	key.offset = (u64)-1;
 	key.type = BTRFS_ROOT_ITEM_KEY;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 2759e0572c5c..cca315dcdfcd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -73,6 +73,7 @@ struct btrfs_pending_snapshot {
 	struct dentry *dentry;
 	struct btrfs_root *root;
 	struct btrfs_root *snap;
+	struct btrfs_qgroup_inherit *inherit;
 	/* block reservation for the operation */
 	struct btrfs_block_rsv block_rsv;
 	/* extra metadata reseration for relocation */