Merge branch 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

 - cgroup.kill is added, which implements atomic killing of the whole
   subtree.

   Down the line, this should be able to replace the multiple userland
   implementations of "keep killing till empty".

 - PSI can now be turned off at boot time to avoid overhead for
   configurations which don't care about PSI.

* 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: make per-cgroup pressure stall tracking configurable
  cgroup: Fix kernel-doc
  cgroup: inline cgroup_task_freeze()
  tests/cgroup: test cgroup.kill
  tests/cgroup: move cg_wait_for(), cg_prepare_for_wait()
  tests/cgroup: use cgroup.kill in cg_killall()
  docs/cgroup: add entry for cgroup.kill
  cgroup: introduce cgroup.kill
commit 3dbdb38e28
Author: Linus Torvalds
Date:   2021-07-01 17:22:14 -07:00

13 changed files with 569 additions and 108 deletions

Documentation/admin-guide/cgroup-v2.rst

@@ -953,6 +953,21 @@ All cgroup core files are prefixed with "cgroup."
 	it's possible to delete a frozen (and empty) cgroup, as well as
 	create new sub-cgroups.
 
+  cgroup.kill
+	A write-only single value file which exists in non-root cgroups.
+	The only allowed value is "1".
+
+	Writing "1" to the file causes the cgroup and all descendant cgroups to
+	be killed. This means that all processes located in the affected cgroup
+	tree will be killed via SIGKILL.
+
+	Killing a cgroup tree will deal with concurrent forks appropriately and
+	is protected against migrations.
+
+	In a threaded cgroup, writing this file fails with EOPNOTSUPP as
+	killing cgroups is a process directed operation, i.e. it affects
+	the whole thread-group.
+
 Controllers
 ===========
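
For illustration, a minimal userspace sketch of the interface documented above (hypothetical, not part of this series; the cgroup path and error handling are assumptions — the only facts taken from the patch are that the file accepts exactly "1" and fails with EOPNOTSUPP in threaded cgroups):

/* Hypothetical example: kill a whole cgroup subtree with a single
 * write instead of a "keep killing till empty" SIGKILL loop.
 * Assumes cgroup2 is mounted at /sys/fs/cgroup.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int cgroup_kill_tree(const char *cgrp_path)
{
	char path[4096];
	int fd;

	snprintf(path, sizeof(path), "%s/cgroup.kill", cgrp_path);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;	/* file absent: kernel older than 5.14 */

	/* "1" is the only allowed value; EOPNOTSUPP on threaded cgroups. */
	if (write(fd, "1", 1) != 1) {
		int err = errno;

		close(fd);
		errno = err;
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	if (cgroup_kill_tree("/sys/fs/cgroup/mytree"))
		perror("cgroup.kill");
	return 0;
}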

Documentation/admin-guide/kernel-parameters.txt

@@ -497,16 +497,21 @@
 	ccw_timeout_log	[S390]
 			See Documentation/s390/common_io.rst for details.
 
-	cgroup_disable=	[KNL] Disable a particular controller
-			Format: {name of the controller(s) to disable}
+	cgroup_disable=	[KNL] Disable a particular controller or optional feature
+			Format: {name of the controller(s) or feature(s) to disable}
 			The effects of cgroup_disable=foo are:
 			- foo isn't auto-mounted if you mount all cgroups in
 			  a single hierarchy
 			- foo isn't visible as an individually mountable
 			  subsystem
+			- if foo is an optional feature then the feature is
+			  disabled and corresponding cgroup files are not
+			  created
 			{Currently only "memory" controller deal with this and
 			cut the overhead, others just disable the usage. So
 			only cgroup_disable=memory is actually worthy}
+			Specifying "pressure" disables per-cgroup pressure
+			stall information accounting feature
 
 	cgroup_no_v1=	[KNL] Disable cgroup controllers and named hierarchies in v1
 			Format: { { controller | "all" | "named" }
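
A hypothetical way to observe the effect of `cgroup_disable=pressure` from userspace (mount point is an assumption, not part of this patch): with the feature disabled the per-cgroup pressure files are simply not created, while the system-wide PSI interface under /proc/pressure/ keeps working.

/* Hypothetical probe, assuming cgroup2 is mounted at /sys/fs/cgroup.
 * With cgroup_disable=pressure, cpu/memory/io .pressure files are
 * not created in cgroups; /proc/pressure/ remains available.
 */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	if (access("/sys/fs/cgroup/cpu.pressure", F_OK) == 0)
		printf("per-cgroup PSI accounting is enabled\n");
	else
		printf("per-cgroup PSI is off (cgroup_disable=pressure?)\n");
	return 0;
}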

include/linux/cgroup-defs.h

@@ -71,6 +71,9 @@ enum {
 	/* Cgroup is frozen. */
 	CGRP_FROZEN,
+
+	/* Control group has to be killed. */
+	CGRP_KILL,
 };
 
 /* cgroup_root->flags */
@@ -110,6 +113,7 @@ enum {
 	CFTYPE_NO_PREFIX = (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
 	CFTYPE_WORLD_WRITABLE = (1 << 4),	/* (DON'T USE FOR NEW FILES) S_IWUGO */
 	CFTYPE_DEBUG = (1 << 5),	/* create when cgroup_debug */
+	CFTYPE_PRESSURE = (1 << 6),	/* only if pressure feature is enabled */
 
 	/* internal flags, do not use outside cgroup core proper */
 	__CFTYPE_ONLY_ON_DFL = (1 << 16),	/* only on default hierarchy */

include/linux/cgroup.h

@@ -676,6 +676,8 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
 	return &cgrp->psi;
 }
 
+bool cgroup_psi_enabled(void);
+
 static inline void cgroup_init_kthreadd(void)
 {
 	/*
@@ -735,6 +737,11 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
 	return NULL;
 }
 
+static inline bool cgroup_psi_enabled(void)
+{
+	return false;
+}
+
 static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
 					       struct cgroup *ancestor)
 {
@@ -906,20 +913,6 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze);
 void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src,
 				 struct cgroup *dst);
 
-static inline bool cgroup_task_freeze(struct task_struct *task)
-{
-	bool ret;
-
-	if (task->flags & PF_KTHREAD)
-		return false;
-
-	rcu_read_lock();
-	ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags);
-	rcu_read_unlock();
-
-	return ret;
-}
-
 static inline bool cgroup_task_frozen(struct task_struct *task)
 {
 	return task->frozen;
@@ -929,10 +922,6 @@ static inline bool cgroup_task_frozen(struct task_struct *task)
 static inline void cgroup_enter_frozen(void) { }
 static inline void cgroup_leave_frozen(bool always_leave) { }
-static inline bool cgroup_task_freeze(struct task_struct *task)
-{
-	return false;
-}
 static inline bool cgroup_task_frozen(struct task_struct *task)
 {
 	return false;

kernel/cgroup/cgroup.c

@@ -209,6 +209,22 @@ struct cgroup_namespace init_cgroup_ns = {
 static struct file_system_type cgroup2_fs_type;
 static struct cftype cgroup_base_files[];
 
+/* cgroup optional features */
+enum cgroup_opt_features {
+#ifdef CONFIG_PSI
+	OPT_FEATURE_PRESSURE,
+#endif
+	OPT_FEATURE_COUNT
+};
+
+static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
+#ifdef CONFIG_PSI
+	"pressure",
+#endif
+};
+
+static u16 cgroup_feature_disable_mask __read_mostly;
+
 static int cgroup_apply_control(struct cgroup *cgrp);
 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 static void css_task_iter_skip(struct css_task_iter *it,
@@ -2390,7 +2406,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 }
 
 /**
- * cgroup_taskset_migrate - migrate a taskset
+ * cgroup_migrate_execute - migrate a taskset
  * @mgctx: migration context
  *
  * Migrate tasks in @mgctx as setup by migration preparation functions.
@@ -3632,6 +3648,18 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
 {
 	psi_trigger_replace(&of->priv, NULL);
 }
+
+bool cgroup_psi_enabled(void)
+{
+	return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
+}
+
+#else /* CONFIG_PSI */
+bool cgroup_psi_enabled(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_PSI */
 
 static int cgroup_freeze_show(struct seq_file *seq, void *v)
@@ -3668,6 +3696,80 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+static void __cgroup_kill(struct cgroup *cgrp)
+{
+	struct css_task_iter it;
+	struct task_struct *task;
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	spin_lock_irq(&css_set_lock);
+	set_bit(CGRP_KILL, &cgrp->flags);
+	spin_unlock_irq(&css_set_lock);
+
+	css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
+	while ((task = css_task_iter_next(&it))) {
+		/* Ignore kernel threads here. */
+		if (task->flags & PF_KTHREAD)
+			continue;
+
+		/* Skip tasks that are already dying. */
+		if (__fatal_signal_pending(task))
+			continue;
+
+		send_sig(SIGKILL, task, 0);
+	}
+	css_task_iter_end(&it);
+
+	spin_lock_irq(&css_set_lock);
+	clear_bit(CGRP_KILL, &cgrp->flags);
+	spin_unlock_irq(&css_set_lock);
+}
+
+static void cgroup_kill(struct cgroup *cgrp)
+{
+	struct cgroup_subsys_state *css;
+	struct cgroup *dsct;
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
+		__cgroup_kill(dsct);
+}
+
+static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
+				 size_t nbytes, loff_t off)
+{
+	ssize_t ret = 0;
+	int kill;
+	struct cgroup *cgrp;
+
+	ret = kstrtoint(strstrip(buf), 0, &kill);
+	if (ret)
+		return ret;
+
+	if (kill != 1)
+		return -ERANGE;
+
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENOENT;
+
+	/*
+	 * Killing is a process directed operation, i.e. the whole thread-group
+	 * is taken down so act like we do for cgroup.procs and only make this
+	 * writable in non-threaded cgroups.
+	 */
+	if (cgroup_is_threaded(cgrp))
+		ret = -EOPNOTSUPP;
+	else
+		cgroup_kill(cgrp);
+
+	cgroup_kn_unlock(of->kn);
+
+	return ret ?: nbytes;
+}
+
 static int cgroup_file_open(struct kernfs_open_file *of)
 {
 	struct cftype *cft = of_cft(of);
@@ -3882,6 +3984,8 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 restart:
 	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
 		/* does cft->flags tell us to skip this file on @cgrp? */
+		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+			continue;
 		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
 			continue;
 		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -3959,6 +4063,9 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 
 		WARN_ON(cft->ss || cft->kf_ops);
 
+		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+			continue;
+
 		if (cft->seq_start)
 			kf_ops = &cgroup_kf_ops;
 		else
@@ -4860,6 +4967,11 @@ static struct cftype cgroup_base_files[] = {
 		.seq_show = cgroup_freeze_show,
 		.write = cgroup_freeze_write,
 	},
+	{
+		.name = "cgroup.kill",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = cgroup_kill_write,
+	},
 	{
 		.name = "cpu.stat",
 		.seq_show = cpu_stat_show,
@@ -4867,6 +4979,7 @@ static struct cftype cgroup_base_files[] = {
 #ifdef CONFIG_PSI
 	{
 		.name = "io.pressure",
+		.flags = CFTYPE_PRESSURE,
 		.seq_show = cgroup_io_pressure_show,
 		.write = cgroup_io_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -4874,6 +4987,7 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "memory.pressure",
+		.flags = CFTYPE_PRESSURE,
 		.seq_show = cgroup_memory_pressure_show,
 		.write = cgroup_memory_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -4881,6 +4995,7 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "cpu.pressure",
+		.flags = CFTYPE_PRESSURE,
 		.seq_show = cgroup_cpu_pressure_show,
 		.write = cgroup_cpu_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -6080,6 +6195,8 @@ void cgroup_post_fork(struct task_struct *child,
 		      struct kernel_clone_args *kargs)
 	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
 {
+	unsigned long cgrp_flags = 0;
+	bool kill = false;
 	struct cgroup_subsys *ss;
 	struct css_set *cset;
 	int i;
@@ -6091,6 +6208,11 @@ void cgroup_post_fork(struct task_struct *child,
 
 	/* init tasks are special, only link regular threads */
 	if (likely(child->pid)) {
+		if (kargs->cgrp)
+			cgrp_flags = kargs->cgrp->flags;
+		else
+			cgrp_flags = cset->dfl_cgrp->flags;
+
 		WARN_ON_ONCE(!list_empty(&child->cg_list));
 		cset->nr_tasks++;
 		css_set_move_task(child, NULL, cset, false);
@@ -6099,23 +6221,32 @@ void cgroup_post_fork(struct task_struct *child,
 		cset = NULL;
 	}
 
-	/*
-	 * If the cgroup has to be frozen, the new task has too. Let's set
-	 * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
-	 * frozen state.
-	 */
-	if (unlikely(cgroup_task_freeze(child))) {
-		spin_lock(&child->sighand->siglock);
-		WARN_ON_ONCE(child->frozen);
-		child->jobctl |= JOBCTL_TRAP_FREEZE;
-		spin_unlock(&child->sighand->siglock);
-
-		/*
-		 * Calling cgroup_update_frozen() isn't required here,
-		 * because it will be called anyway a bit later from
-		 * do_freezer_trap(). So we avoid cgroup's transient switch
-		 * from the frozen state and back.
-		 */
+	if (!(child->flags & PF_KTHREAD)) {
+		if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
+			/*
+			 * If the cgroup has to be frozen, the new task has
+			 * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
+			 * get the task into the frozen state.
+			 */
+			spin_lock(&child->sighand->siglock);
+			WARN_ON_ONCE(child->frozen);
+			child->jobctl |= JOBCTL_TRAP_FREEZE;
+			spin_unlock(&child->sighand->siglock);
+
+			/*
+			 * Calling cgroup_update_frozen() isn't required here,
+			 * because it will be called anyway a bit later from
+			 * do_freezer_trap(). So we avoid cgroup's transient
+			 * switch from the frozen state and back.
+			 */
+		}
+
+		/*
+		 * If the cgroup is to be killed notice it now and take the
+		 * child down right after we finished preparing it for
+		 * userspace.
+		 */
+		kill = test_bit(CGRP_KILL, &cgrp_flags);
 	}
 
 	spin_unlock_irq(&css_set_lock);
@@ -6138,6 +6269,10 @@ void cgroup_post_fork(struct task_struct *child,
 			put_css_set(rcset);
 	}
 
+	/* Cgroup has to be killed so take down child immediately. */
+	if (unlikely(kill))
+		do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);
+
 	cgroup_css_set_put_fork(kargs);
 }
@@ -6163,7 +6298,8 @@ void cgroup_exit(struct task_struct *tsk)
 	cset->nr_tasks--;
 
 	WARN_ON_ONCE(cgroup_task_frozen(tsk));
-	if (unlikely(cgroup_task_freeze(tsk)))
+	if (unlikely(!(tsk->flags & PF_KTHREAD) &&
+		     test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
 		cgroup_update_frozen(task_dfl_cgroup(tsk));
 
 	spin_unlock_irq(&css_set_lock);
@@ -6214,6 +6350,15 @@ static int __init cgroup_disable(char *str)
 			pr_info("Disabling %s control group subsystem\n",
 				ss->name);
 		}
+
+		for (i = 0; i < OPT_FEATURE_COUNT; i++) {
+			if (strcmp(token, cgroup_opt_feature_names[i]))
+				continue;
+			cgroup_feature_disable_mask |= 1 << i;
+			pr_info("Disabling %s control group feature\n",
+				cgroup_opt_feature_names[i]);
+			break;
+		}
 	}
 	return 1;
 }
@@ -6512,6 +6657,9 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
 		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
 			continue;
 
+		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+			continue;
+
 		if (prefix)
 			ret += snprintf(buf + ret, size - ret, "%s.", prefix);

kernel/cgroup/rstat.c

@@ -220,7 +220,7 @@ void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
 }
 
 /**
- * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold
+ * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
  * @cgrp: target cgroup
  *
  * Flush stats in @cgrp's subtree and prevent further flushes. Must be

kernel/sched/psi.c

@@ -148,6 +148,7 @@
 static int psi_bug __read_mostly;
 
 DEFINE_STATIC_KEY_FALSE(psi_disabled);
+DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
 
 #ifdef CONFIG_PSI_DEFAULT_DISABLED
 static bool psi_enable;
@@ -215,6 +216,9 @@ void __init psi_init(void)
 		return;
 	}
 
+	if (!cgroup_psi_enabled())
+		static_branch_disable(&psi_cgroups_enabled);
+
 	psi_period = jiffies_to_nsecs(PSI_FREQ);
 	group_init(&psi_system);
 }
@@ -748,23 +752,23 @@ static void psi_group_change(struct psi_group *group, int cpu,
 
 static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
 {
+	if (*iter == &psi_system)
+		return NULL;
+
 #ifdef CONFIG_CGROUPS
-	struct cgroup *cgroup = NULL;
+	if (static_branch_likely(&psi_cgroups_enabled)) {
+		struct cgroup *cgroup = NULL;
 
-	if (!*iter)
-		cgroup = task->cgroups->dfl_cgrp;
-	else if (*iter == &psi_system)
-		return NULL;
-	else
-		cgroup = cgroup_parent(*iter);
+		if (!*iter)
+			cgroup = task->cgroups->dfl_cgrp;
+		else
+			cgroup = cgroup_parent(*iter);
 
-	if (cgroup && cgroup_parent(cgroup)) {
-		*iter = cgroup;
-		return cgroup_psi(cgroup);
+		if (cgroup && cgroup_parent(cgroup)) {
+			*iter = cgroup;
+			return cgroup_psi(cgroup);
+		}
 	}
-#else
-	if (*iter)
-		return NULL;
 #endif
 	*iter = &psi_system;
 	return &psi_system;

tools/testing/selftests/cgroup/.gitignore

@@ -2,4 +2,5 @@
 test_memcontrol
 test_core
 test_freezer
 test_kmem
+test_kill

tools/testing/selftests/cgroup/Makefile

@@ -9,6 +9,7 @@ TEST_GEN_PROGS = test_memcontrol
 TEST_GEN_PROGS += test_kmem
 TEST_GEN_PROGS += test_core
 TEST_GEN_PROGS += test_freezer
+TEST_GEN_PROGS += test_kill
 
 include ../lib.mk
@@ -16,3 +17,4 @@ $(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h
 $(OUTPUT)/test_kmem: cgroup_util.c ../clone3/clone3_selftests.h
 $(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h
 $(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_kill: cgroup_util.c ../clone3/clone3_selftests.h ../pidfd/pidfd.h

tools/testing/selftests/cgroup/cgroup_util.c

@@ -5,10 +5,12 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <linux/limits.h>
+#include <poll.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/inotify.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/wait.h>
@@ -252,6 +254,10 @@ int cg_killall(const char *cgroup)
 	char buf[PAGE_SIZE];
 	char *ptr = buf;
 
+	/* If cgroup.kill exists use it. */
+	if (!cg_write(cgroup, "cgroup.kill", "1"))
+		return 0;
+
 	if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
 		return -1;
@@ -576,3 +582,48 @@ int clone_into_cgroup_run_wait(const char *cgroup)
 	(void)clone_reap(pid, WEXITED);
 	return 0;
 }
+
+int cg_prepare_for_wait(const char *cgroup)
+{
+	int fd, ret = -1;
+
+	fd = inotify_init1(0);
+	if (fd == -1)
+		return fd;
+
+	ret = inotify_add_watch(fd, cg_control(cgroup, "cgroup.events"),
+				IN_MODIFY);
+	if (ret == -1) {
+		close(fd);
+		fd = -1;
+	}
+
+	return fd;
+}
+
+int cg_wait_for(int fd)
+{
+	int ret = -1;
+	struct pollfd fds = {
+		.fd = fd,
+		.events = POLLIN,
+	};
+
+	while (true) {
+		ret = poll(&fds, 1, 10000);
+
+		if (ret == -1) {
+			if (errno == EINTR)
+				continue;
+
+			break;
+		}
+
+		if (ret > 0 && fds.revents & POLLIN) {
+			ret = 0;
+			break;
+		}
+	}
+
+	return ret;
+}

tools/testing/selftests/cgroup/cgroup_util.h

@@ -54,3 +54,5 @@ extern pid_t clone_into_cgroup(int cgroup_fd);
 extern int clone_reap(pid_t pid, int options);
 extern int clone_into_cgroup_run_wait(const char *cgroup);
 extern int dirfd_open_opath(const char *dir);
+extern int cg_prepare_for_wait(const char *cgroup);
+extern int cg_wait_for(int fd);

tools/testing/selftests/cgroup/test_freezer.c

@@ -7,9 +7,7 @@
 #include <unistd.h>
 #include <stdio.h>
 #include <errno.h>
-#include <poll.h>
 #include <stdlib.h>
-#include <sys/inotify.h>
 #include <string.h>
 #include <sys/wait.h>
@@ -54,61 +52,6 @@ static int cg_freeze_nowait(const char *cgroup, bool freeze)
 	return cg_write(cgroup, "cgroup.freeze", freeze ? "1" : "0");
 }
 
-/*
- * Prepare for waiting on cgroup.events file.
- */
-static int cg_prepare_for_wait(const char *cgroup)
-{
-	int fd, ret = -1;
-
-	fd = inotify_init1(0);
-	if (fd == -1) {
-		debug("Error: inotify_init1() failed\n");
-		return fd;
-	}
-
-	ret = inotify_add_watch(fd, cg_control(cgroup, "cgroup.events"),
-				IN_MODIFY);
-	if (ret == -1) {
-		debug("Error: inotify_add_watch() failed\n");
-		close(fd);
-		fd = -1;
-	}
-
-	return fd;
-}
-
-/*
- * Wait for an event. If there are no events for 10 seconds,
- * treat this an error.
- */
-static int cg_wait_for(int fd)
-{
-	int ret = -1;
-	struct pollfd fds = {
-		.fd = fd,
-		.events = POLLIN,
-	};
-
-	while (true) {
-		ret = poll(&fds, 1, 10000);
-
-		if (ret == -1) {
-			if (errno == EINTR)
-				continue;
-
-			debug("Error: poll() failed\n");
-			break;
-		}
-
-		if (ret > 0 && fds.revents & POLLIN) {
-			ret = 0;
-			break;
-		}
-	}
-
-	return ret;
-}
-
 /*
  * Attach a task to the given cgroup and wait for a cgroup frozen event.
  * All transient events (e.g. populated) are ignored.

View File

@@ -0,0 +1,297 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <errno.h>
#include <linux/limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#include "../kselftest.h"
#include "../pidfd/pidfd.h"
#include "cgroup_util.h"

/*
 * Kill the given cgroup and wait for the inotify signal.
 * If there are no events in 10 seconds, treat this as an error.
 * Then check that the cgroup is in the desired state.
 */
static int cg_kill_wait(const char *cgroup)
{
	int fd, ret = -1;

	fd = cg_prepare_for_wait(cgroup);
	if (fd < 0)
		return fd;

	ret = cg_write(cgroup, "cgroup.kill", "1");
	if (ret)
		goto out;

	ret = cg_wait_for(fd);
	if (ret)
		goto out;

out:
	close(fd);
	return ret;
}

/*
 * A simple process running in a sleep loop until being
 * re-parented.
 */
static int child_fn(const char *cgroup, void *arg)
{
	int ppid = getppid();

	while (getppid() == ppid)
		usleep(1000);

	return getppid() == ppid;
}

static int test_cgkill_simple(const char *root)
{
	pid_t pids[100];
	int ret = KSFT_FAIL;
	char *cgroup = NULL;
	int i;

	cgroup = cg_name(root, "cg_test_simple");
	if (!cgroup)
		goto cleanup;

	if (cg_create(cgroup))
		goto cleanup;

	for (i = 0; i < 100; i++)
		pids[i] = cg_run_nowait(cgroup, child_fn, NULL);

	if (cg_wait_for_proc_count(cgroup, 100))
		goto cleanup;

	if (cg_read_strcmp(cgroup, "cgroup.events", "populated 1\n"))
		goto cleanup;

	if (cg_kill_wait(cgroup))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	for (i = 0; i < 100; i++)
		wait_for_pid(pids[i]);

	if (ret == KSFT_PASS &&
	    cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n"))
		ret = KSFT_FAIL;

	if (cgroup)
		cg_destroy(cgroup);
	free(cgroup);
	return ret;
}

/*
 * The test creates the following hierarchy:
 *       A
 *    / / \ \
 *   B  E  I K
 *  /\  |
 * C  D F
 *      |
 *      G
 *      |
 *      H
 *
 * with a process in C, H and 3 processes in K.
 * Then it tries to kill the whole tree.
 */
static int test_cgkill_tree(const char *root)
{
	pid_t pids[5];
	char *cgroup[10] = {0};
	int ret = KSFT_FAIL;
	int i;

	cgroup[0] = cg_name(root, "cg_test_tree_A");
	if (!cgroup[0])
		goto cleanup;

	cgroup[1] = cg_name(cgroup[0], "B");
	if (!cgroup[1])
		goto cleanup;

	cgroup[2] = cg_name(cgroup[1], "C");
	if (!cgroup[2])
		goto cleanup;

	cgroup[3] = cg_name(cgroup[1], "D");
	if (!cgroup[3])
		goto cleanup;

	cgroup[4] = cg_name(cgroup[0], "E");
	if (!cgroup[4])
		goto cleanup;

	cgroup[5] = cg_name(cgroup[4], "F");
	if (!cgroup[5])
		goto cleanup;

	cgroup[6] = cg_name(cgroup[5], "G");
	if (!cgroup[6])
		goto cleanup;

	cgroup[7] = cg_name(cgroup[6], "H");
	if (!cgroup[7])
		goto cleanup;

	cgroup[8] = cg_name(cgroup[0], "I");
	if (!cgroup[8])
		goto cleanup;

	cgroup[9] = cg_name(cgroup[0], "K");
	if (!cgroup[9])
		goto cleanup;

	for (i = 0; i < 10; i++)
		if (cg_create(cgroup[i]))
			goto cleanup;

	pids[0] = cg_run_nowait(cgroup[2], child_fn, NULL);
	pids[1] = cg_run_nowait(cgroup[7], child_fn, NULL);
	pids[2] = cg_run_nowait(cgroup[9], child_fn, NULL);
	pids[3] = cg_run_nowait(cgroup[9], child_fn, NULL);
	pids[4] = cg_run_nowait(cgroup[9], child_fn, NULL);

	/*
	 * Wait until all child processes will enter
	 * corresponding cgroups.
	 */
	if (cg_wait_for_proc_count(cgroup[2], 1) ||
	    cg_wait_for_proc_count(cgroup[7], 1) ||
	    cg_wait_for_proc_count(cgroup[9], 3))
		goto cleanup;

	/*
	 * Kill A and check that we get an empty notification.
	 */
	if (cg_kill_wait(cgroup[0]))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	for (i = 0; i < 5; i++)
		wait_for_pid(pids[i]);

	if (ret == KSFT_PASS &&
	    cg_read_strcmp(cgroup[0], "cgroup.events", "populated 0\n"))
		ret = KSFT_FAIL;

	for (i = 9; i >= 0 && cgroup[i]; i--) {
		cg_destroy(cgroup[i]);
		free(cgroup[i]);
	}

	return ret;
}

static int forkbomb_fn(const char *cgroup, void *arg)
{
	int ppid;

	fork();
	fork();

	ppid = getppid();
	while (getppid() == ppid)
		usleep(1000);

	return getppid() == ppid;
}

/*
 * The test runs a fork bomb in a cgroup and tries to kill it.
 */
static int test_cgkill_forkbomb(const char *root)
{
	int ret = KSFT_FAIL;
	char *cgroup = NULL;
	pid_t pid = -ESRCH;

	cgroup = cg_name(root, "cg_forkbomb_test");
	if (!cgroup)
		goto cleanup;

	if (cg_create(cgroup))
		goto cleanup;

	pid = cg_run_nowait(cgroup, forkbomb_fn, NULL);
	if (pid < 0)
		goto cleanup;

	usleep(100000);

	if (cg_kill_wait(cgroup))
		goto cleanup;

	if (cg_wait_for_proc_count(cgroup, 0))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (pid > 0)
		wait_for_pid(pid);

	if (ret == KSFT_PASS &&
	    cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n"))
		ret = KSFT_FAIL;

	if (cgroup)
		cg_destroy(cgroup);
	free(cgroup);
	return ret;
}

#define T(x) { x, #x }
struct cgkill_test {
	int (*fn)(const char *root);
	const char *name;
} tests[] = {
	T(test_cgkill_simple),
	T(test_cgkill_tree),
	T(test_cgkill_forkbomb),
};
#undef T

int main(int argc, char *argv[])
{
	char root[PATH_MAX];
	int i, ret = EXIT_SUCCESS;

	if (cg_find_unified_root(root, sizeof(root)))
		ksft_exit_skip("cgroup v2 isn't mounted\n");
	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		switch (tests[i].fn(root)) {
		case KSFT_PASS:
			ksft_test_result_pass("%s\n", tests[i].name);
			break;
		case KSFT_SKIP:
			ksft_test_result_skip("%s\n", tests[i].name);
			break;
		default:
			ret = EXIT_FAILURE;
			ksft_test_result_fail("%s\n", tests[i].name);
			break;
		}
	}

	return ret;
}