diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8d9f3c911fca..ee041a01a67e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -188,6 +188,14 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct dentry *dentry; /* cgroup fs entry, RCU protected */ + /* + * Monotonically increasing unique serial number which defines a + * uniform order among all cgroups. It's guaranteed that all + * ->children lists are in the ascending order of ->serial_nr. + * It's used to allow interrupting and resuming iterations. + */ + u64 serial_nr; + /* * This is a copy of dentry->d_name, and it's needed because * we can't use dentry->d_name in cgroup_path(). @@ -675,6 +683,8 @@ static inline struct cgroup* task_cgroup(struct task_struct *task, return task_subsys_state(task, subsys_id)->cgroup; } +struct cgroup *cgroup_next_sibling(struct cgroup *pos); + /** * cgroup_for_each_child - iterate through children of a cgroup * @pos: the cgroup * to use as the loop cursor diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 501974823b33..b87c7a5a5497 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2975,6 +2975,55 @@ static void cgroup_enable_task_cg_lists(void) write_unlock(&css_set_lock); } +/** + * cgroup_next_sibling - find the next sibling of a given cgroup + * @pos: the current cgroup + * + * This function returns the next sibling of @pos and should be called + * under RCU read lock. The only requirement is that @pos is accessible. + * The next sibling is guaranteed to be returned regardless of @pos's + * state. + */ +struct cgroup *cgroup_next_sibling(struct cgroup *pos) +{ + struct cgroup *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* + * @pos could already have been removed. Once a cgroup is removed, + * its ->sibling.next is no longer updated when its next sibling + * changes. As CGRP_REMOVED is set on removal which is fully + * serialized, if we see it unasserted, it's guaranteed that the + * next sibling hasn't finished its grace period even if it's + * already removed, and thus safe to dereference from this RCU + * critical section. If ->sibling.next is inaccessible, + * cgroup_is_removed() is guaranteed to be visible as %true here. + */ + if (likely(!cgroup_is_removed(pos))) { + next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + if (&next->sibling != &pos->parent->children) + return next; + return NULL; + } + + /* + * Can't dereference the next pointer. Each cgroup is given a + * monotonically increasing unique serial number and always + * appended to the sibling list, so the next one can be found by + * walking the parent's children until we see a cgroup with higher + * serial number than @pos's. + * + * While this path can be slow, it's taken only when either the + * current cgroup is removed or iteration and removal race. + */ + list_for_each_entry_rcu(next, &pos->parent->children, sibling) + if (next->serial_nr > pos->serial_nr) + return next; + return NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_sibling); + /** * cgroup_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) @@ -4137,6 +4186,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { + static atomic64_t serial_nr_cursor = ATOMIC64_INIT(0); struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; @@ -4217,6 +4267,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; lockdep_assert_held(&dentry->d_inode->i_mutex); + /* + * Assign a monotonically increasing serial number. With the list + * appending below, it guarantees that sibling cgroups are always + * sorted in the ascending serial number order on the parent's + * ->children. + */ + cgrp->serial_nr = atomic64_inc_return(&serial_nr_cursor); + /* allocation complete, commit to creation */ list_add_tail(&cgrp->allcg_node, &root->allcg_list); list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); @@ -4304,6 +4362,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * removed. This makes future css_tryget() and child creation * attempts fail thus maintaining the removal conditions verified * above. + * + * Note that CGRP_REMVOED clearing is depended upon by + * cgroup_next_sibling() to resume iteration after dropping RCU + * read lock. See cgroup_next_sibling() for details. */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];