cgroups: mechanism to process each task in a cgroup

Provide cgroup_scan_tasks(), which iterates through every task in a cgroup, calling a test function and a process function for each. The process function is called without holding css_set_lock. The idea is David Rientjes', who predicted that such a function will make it much easier in the future to extend things that require access to each task in a cgroup without holding the lock. [akpm@linux-foundation.org: cleanup] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Cliff Wickman <cpw@sgi.com> Cc: Paul Menage <menage@google.com> Cc: Paul Jackson <pj@sgi.com> Acked-by: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-07 00:14:42 -08:00 · 2008-02-07 00:14:42 -08:00 · 31a7df01fd
parent dfc05c259e
commit 31a7df01fd
2 changed files with 200 additions and 12 deletions
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@ -14,6 +14,7 @@
 #include <linux/nodemask.h>
 #include <linux/rcupdate.h>
 #include <linux/cgroupstats.h>
 #include <linux/prio_heap.h>
 #ifdef CONFIG_CGROUPS
@ -207,6 +208,14 @@ struct cftype {
 	int (*release) (struct inode *inode, struct file *file);
 };
 /*
  * Argument block for cgroup_scan_tasks(). It may be embedded in a larger
  * caller-defined structure so the callbacks can reach extra state via the
  * scan pointer they receive.
  */
 struct cgroup_scanner {
 	/* The cgroup whose tasks are to be scanned. */
 	struct cgroup *cg;
 	/*
 	 * Optional filter: a task is selected when this returns non-zero.
 	 * NULL selects every task. Called while css_set_lock is held.
 	 */
 	int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
 	/*
 	 * Called for each selected task, after css_set_lock has been
 	 * dropped.
 	 */
 	void (*process_task)(struct task_struct *p,
 			struct cgroup_scanner *scan);
 	/*
 	 * Optional pre-allocated heap used to batch selected tasks; its
 	 * "gt" member is overwritten by the scan. If NULL, a temporary
 	 * heap is allocated for the duration of the scan.
 	 */
 	struct ptr_heap *heap;
 };
 /* Add a new file to the given cgroup directory. Should only be
 * called by subsystems from within a populate() method */
 int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys,
@ -299,11 +308,16 @@ struct cgroup_iter {
 *    returns NULL or until you want to end the iteration
 *
 * 3) call cgroup_iter_end() to destroy the iterator.
 *
 * Or, call cgroup_scan_tasks() to iterate through every task in a cgroup.
 *    - cgroup_scan_tasks() holds the css_set_lock when calling the test_task()
 *      callback, but not while calling the process_task() callback.
 */
 void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it);
 struct task_struct *cgroup_iter_next(struct cgroup *cont,
 					struct cgroup_iter *it);
 void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
 int cgroup_scan_tasks(struct cgroup_scanner *scan);
 #else /* !CONFIG_CGROUPS */
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@ -1695,14 +1695,17 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
 	it->task = cg->tasks.next;
 }
 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 {
 /*
-	 * The first time anyone tries to iterate across a cgroup,
+ * To reduce the fork() overhead for systems that are not actually
-	 * we need to enable the list linking each css_set to its
+ * using their cgroups capability, we don't maintain the lists running
-	 * tasks, and fix up all existing tasks.
+ * through each css_set to its tasks until we see the list actually
 * used - in other words after the first call to cgroup_iter_start().
 *
 * The tasklist_lock is not held here, as do_each_thread() and
 * while_each_thread() are protected by RCU.
 */
-	if (!use_task_css_set_links) {
+void cgroup_enable_task_cg_lists(void)
 {
 	struct task_struct *p, *g;
 	write_lock(&css_set_lock);
 	use_task_css_set_links = 1;
@ -1714,6 +1717,17 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 	} while_each_thread(g, p);
 	write_unlock(&css_set_lock);
 }
 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 {
 	/*
 	 * The first time anyone tries to iterate across a cgroup,
 	 * we need to enable the list linking each css_set to its
 	 * tasks, and fix up all existing tasks.
 	 */
 	if (!use_task_css_set_links)
 		cgroup_enable_task_cg_lists();
 	read_lock(&css_set_lock);
 	it->cg_link = &cgrp->css_sets;
 	cgroup_advance_iter(cgrp, it);
@ -1746,6 +1760,166 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
 	read_unlock(&css_set_lock);
 }
 /*
  * Report whether task t1 started after the given time. When t1's start
  * time equals @time exactly, fall back to comparing the task pointers
  * t1 and t2 so that the ordering is still total.
  */
 static inline int started_after_time(struct task_struct *t1,
 				     struct timespec *time,
 				     struct task_struct *t2)
 {
 	int cmp = timespec_compare(&t1->start_time, time);
 	/* Distinct start times decide the answer on their own. */
 	if (cmp != 0)
 		return cmp > 0;
 	/*
 	 * Identical start times: arbitrarily treat the task with the
 	 * lower pointer value as the one that started first. t2 may
 	 * have exited already and so may no longer be a valid pointer,
 	 * but it is only compared, never dereferenced, which is enough
 	 * to tell apart two tasks started (effectively) simultaneously.
 	 */
 	return t1 > t2;
 }
 /*
  * Heap comparison callback (installed as the heap's "gt" function and
  * invoked from heap_insert()). It orders the heap by descending task
  * start time.
  */
 static inline int started_after(void *p1, void *p2)
 {
 	struct task_struct *candidate = p1;
 	struct task_struct *reference = p2;
 	return started_after_time(candidate, &reference->start_time,
 				  reference);
 }
 /**
  * cgroup_scan_tasks - iterate through all the tasks in a cgroup
  * @scan: struct cgroup_scanner containing arguments for the scan
  *
  * Arguments include pointers to callback functions test_task() and
  * process_task().
  * Iterate through all the tasks in a cgroup, calling test_task() for each,
  * and if it returns true, call process_task() for it also.
  * The test_task pointer may be NULL, meaning always true (select all tasks).
  * Effectively duplicates cgroup_iter_{start,next,end}()
  * but does not lock css_set_lock for the call to process_task().
  * The struct cgroup_scanner may be embedded in any structure of the caller's
  * creation.
  * It is guaranteed that process_task() will act on every task that
  * is a member of the cgroup for the duration of this call. This
  * function may or may not call process_task() for tasks that exit
  * or move to a different cgroup during the call, or are forked or
  * move into the cgroup during the call.
  *
  * Note that test_task() may be called with locks held, and may in some
  * situations be called multiple times for the same task, so it should
  * be cheap.
  * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
  * pre-allocated and will be used for heap operations (and its "gt" member will
  * be overwritten), else a temporary heap will be used (allocation of which
  * may cause this function to fail).
  *
  * Returns 0 on success, or the error returned by heap_init() if the
  * temporary heap could not be allocated.
  */
 int cgroup_scan_tasks(struct cgroup_scanner *scan)
 {
 	int retval, i;
 	struct cgroup_iter it;
 	struct task_struct *p, *dropped;
 	/* Never dereference latest_task, since it's not refcounted */
 	struct task_struct *latest_task = NULL;
 	struct ptr_heap tmp_heap;
 	struct ptr_heap *heap;
 	struct timespec latest_time = { 0, 0 };
 	if (scan->heap) {
 		/* The caller supplied our heap and pre-allocated its memory */
 		heap = scan->heap;
 		heap->gt = &started_after;
 	} else {
 		/* We need to allocate our own heap memory */
 		heap = &tmp_heap;
 		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
 		if (retval)
 			/* cannot allocate the heap */
 			return retval;
 	}
 again:
 	/*
 	 * Scan tasks in the cgroup, using the scanner's "test_task" callback
 	 * to determine which are of interest, and using the scanner's
 	 * "process_task" callback to process any of them that need an update.
 	 * Since we don't want to hold any locks during the task updates,
 	 * gather tasks to be processed in a heap structure.
 	 * The heap is sorted by descending task start time.
 	 * If the statically-sized heap fills up, we overflow tasks that
 	 * started later, and in future iterations only consider tasks that
 	 * started after the latest task in the previous pass. This
 	 * guarantees forward progress and that we don't miss any tasks.
 	 */
 	heap->size = 0;
 	cgroup_iter_start(scan->cg, &it);
 	while ((p = cgroup_iter_next(scan->cg, &it))) {
 		/*
 		 * Only affect tasks that qualify per the caller's callback,
 		 * if he provided one
 		 */
 		if (scan->test_task && !scan->test_task(p, scan))
 			continue;
 		/*
 		 * Only process tasks that started after the last task
 		 * we processed
 		 */
 		if (!started_after_time(p, &latest_time, latest_task))
 			continue;
 		dropped = heap_insert(heap, p);
 		if (dropped == NULL) {
 			/*
 			 * The new task was inserted; the heap wasn't
 			 * previously full
 			 */
 			get_task_struct(p);
 		} else if (dropped != p) {
 			/*
 			 * The new task was inserted, and pushed out a
 			 * different task
 			 */
 			get_task_struct(p);
 			put_task_struct(dropped);
 		}
 		/*
 		 * Else the new task was newer than anything already in
 		 * the heap and wasn't inserted
 		 */
 	}
 	cgroup_iter_end(scan->cg, &it);
 	if (heap->size) {
 		for (i = 0; i < heap->size; i++) {
 			/*
 			 * Use a distinct name here so the function-scope
 			 * iteration cursor "p" is not shadowed.
 			 */
 			struct task_struct *q = heap->ptrs[i];
 			if (i == 0) {
 				/*
 				 * The heap root is the latest-started task
 				 * of this pass; remember it as the cut-off
 				 * for the next pass.
 				 */
 				latest_time = q->start_time;
 				latest_task = q;
 			}
 			/* Process the task per the caller's callback */
 			scan->process_task(q, scan);
 			/* Drop the reference taken when q entered the heap */
 			put_task_struct(q);
 		}
 		/*
 		 * If we had to process any tasks at all, scan again
 		 * in case some of them were in the middle of forking
 		 * children that didn't get processed.
 		 * Not the most efficient way to do it, but it avoids
 		 * having to take callback_mutex in the fork path
 		 */
 		goto again;
 	}
 	if (heap == &tmp_heap)
 		heap_free(&tmp_heap);
 	return 0;
 }
 /*
 * Stuff for reading the 'tasks' file.
 *